From 31cbd2acc5ab6a35cd88ff7db773d6a0fdc36bb5 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 15 Oct 2018 17:26:23 +0200
Subject: [PATCH 01/85] step 1: switch to automake/libtool; remove fftpack, Python, Fortran, docs

---
 Makefile                                      |   80 -
 Makefile.am                                   |   48 +
 README.md                                     |   43 -
 c_utils/planck.make                           |   18 -
 config/config.auto.in                         |   12 -
 config/rules.common                           |   33 -
 configure.ac                                  |  173 +-
 docsrc/c_utils.dox                            |  290 ----
 docsrc/footer.html                            |    5 -
 docsrc/index_code.html                        |   15 -
 docsrc/libfftpack.dox                         |  290 ----
 docsrc/libsharp.dox                           |  291 ----
 docsrc/planck.make                            |   20 -
 fortran/sharp.f90                             |  286 ----
 fortran/test_sharp.f90                        |   84 -
 libfftpack/README                             |   34 -
 libfftpack/bluestein.c                        |  173 --
 libfftpack/bluestein.h                        |   48 -
 libfftpack/fftpack.c                          |  833 ----------
 libfftpack/fftpack.h                          |   64 -
 libfftpack/fftpack_inc.c                      |  306 ----
 libfftpack/libfftpack.dox                     |    5 -
 libfftpack/ls_fft.c                           |  291 ----
 libfftpack/ls_fft.h                           |  161 --
 libfftpack/planck.make                        |   21 -
 libsharp/planck.make                          |   29 -
 libsharp/sharp.c                              |   18 +-
 libsharp/sharp.h                              |    3 -
 libsharp/sharp_geomhelpers.c                  |   20 +-
 libsharp/sharp_legendre.c                     | 1319 ---------------
 libsharp/sharp_legendre.c.in                  |  176 --
 libsharp/sharp_legendre.h                     |   62 -
 libsharp/sharp_legendre_table.c               | 1467 -----------------
 libsharp/sharp_legendre_table.h               |   97 --
 m4/m4_ax_create_pkgconfig_info.m4             |  351 ++++
 python/fake_pyrex/Pyrex/Distutils/__init__.py |    1 -
 .../fake_pyrex/Pyrex/Distutils/build_ext.py   |    1 -
 python/fake_pyrex/Pyrex/__init__.py           |    1 -
 python/fake_pyrex/README                      |    2 -
 python/libsharp/__init__.py                   |    1 -
 python/libsharp/libsharp.pxd                  |   92 --
 python/libsharp/libsharp.pyx                  |  324 ----
 python/libsharp/libsharp_mpi.pyx              |   17 -
 python/libsharp/tests/__init__.py             |    1 -
 python/libsharp/tests/test_legendre.py        |   58 -
 python/libsharp/tests/test_legendre_table.py  |   36 -
 python/libsharp/tests/test_sht.py             |   32 -
 .../tests/test_smoothing_noise_pol_mpi.py     |  137 --
 python/setup.py                               |   83 -
 runjinja.py                                   |   19 -
 50 files changed, 488 insertions(+), 7483 deletions(-)
 delete mode 100644 Makefile
 create mode 100644 Makefile.am
 delete mode 100644 README.md
 delete mode 100644 c_utils/planck.make
 delete mode 100644 config/config.auto.in
 delete mode 100644 config/rules.common
 delete mode 100644 docsrc/c_utils.dox
 delete mode 100644 docsrc/footer.html
 delete mode 100644 docsrc/index_code.html
 delete mode 100644 docsrc/libfftpack.dox
 delete mode 100644 docsrc/libsharp.dox
 delete mode 100644 docsrc/planck.make
 delete mode 100644 fortran/sharp.f90
 delete mode 100644 fortran/test_sharp.f90
 delete mode 100644 libfftpack/README
 delete mode 100644 libfftpack/bluestein.c
 delete mode 100644 libfftpack/bluestein.h
 delete mode 100644 libfftpack/fftpack.c
 delete mode 100644 libfftpack/fftpack.h
 delete mode 100644 libfftpack/fftpack_inc.c
 delete mode 100644 libfftpack/libfftpack.dox
 delete mode 100644 libfftpack/ls_fft.c
 delete mode 100644 libfftpack/ls_fft.h
 delete mode 100644 libfftpack/planck.make
 delete mode 100644 libsharp/planck.make
 delete mode 100644 libsharp/sharp_legendre.c
 delete mode 100644 libsharp/sharp_legendre.c.in
 delete mode 100644 libsharp/sharp_legendre.h
 delete mode 100644 libsharp/sharp_legendre_table.c
 delete mode 100644 libsharp/sharp_legendre_table.h
 create mode 100644 m4/m4_ax_create_pkgconfig_info.m4
 delete mode 100644 python/fake_pyrex/Pyrex/Distutils/__init__.py
 delete mode 100644 python/fake_pyrex/Pyrex/Distutils/build_ext.py
 delete mode 100644 python/fake_pyrex/Pyrex/__init__.py
 delete mode 100644 python/fake_pyrex/README
 delete mode 100644 python/libsharp/__init__.py
 delete mode 100644 python/libsharp/libsharp.pxd
 delete mode 100644 python/libsharp/libsharp.pyx
 delete mode 100644 python/libsharp/libsharp_mpi.pyx
 delete mode 100644 python/libsharp/tests/__init__.py
 delete mode 100644 python/libsharp/tests/test_legendre.py
 delete mode 100644 python/libsharp/tests/test_legendre_table.py
 delete mode 100644 python/libsharp/tests/test_sht.py
 delete mode 100644 python/libsharp/tests/test_smoothing_noise_pol_mpi.py
 delete mode 100644 python/setup.py
 delete mode 100755 runjinja.py

diff --git a/Makefile b/Makefile
deleted file mode 100644
index 5a3184c..0000000
--- a/Makefile
+++ /dev/null
@@ -1,80 +0,0 @@
-SHARP_TARGET?=auto
-ifndef SHARP_TARGET
-  SHARP_TARGET:=$(error SHARP_TARGET undefined. Please see README.compilation for help)UNDEFINED
-endif
-
-default: compile_all
-SRCROOT:=$(shell pwd)
-include $(SRCROOT)/config/config.$(SHARP_TARGET)
-include $(SRCROOT)/config/rules.common
-
-all_hdr:=
-all_lib:=
-all_cbin:=
-
-FULL_INCLUDE:=
-
-include c_utils/planck.make
-include libfftpack/planck.make
-include libsharp/planck.make
-include docsrc/planck.make
-
-CYTHON_MODULES=python/libsharp/libsharp.so $(if $(MPI_CFLAGS), python/libsharp/libsharp_mpi.so)
-
-$(all_lib): %: | $(LIBDIR)_mkdir
-	@echo "#  creating library $*"
-	$(ARCREATE) $@ $^
-
-$(all_cbin): %: | $(BINDIR)_mkdir
-	@echo "#  linking C binary $*"
-	$(CL) -o $@ $^ $(CLFLAGS)
-
-compile_all: $(all_cbin) hdrcopy
-
-hdrclean:
-	@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
-
-hdrcopy: | $(INCDIR)_mkdir
-	@if [ "$(all_hdr)" ]; then cp -p $(all_hdr) $(INCDIR); fi
-
-$(notdir $(all_cbin)) : % : $(BINDIR)/%
-
-test: compile_all
-	$(BINDIR)/sharp_testsuite acctest && \
-	$(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
-	$(BINDIR)/sharp_testsuite test fejer1 2047 -1 -1 4096 2 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 2
-
-perftest: compile_all
-	$(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 63 -1 -1 128 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 127 -1 -1 256 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 255 -1 -1 512 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 511 -1 -1 1024 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 1023 -1 -1 2048 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
-
-%.c: %.c.in
-# Only do this if the md5sum changed, in order to avoid Python and Jinja
-# dependency when not modifying the c.in file
-	grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@
-
-genclean:
-	rm libsharp/sharp_legendre.c || exit 0
-
-$(CYTHON_MODULES): %.so: %.pyx
-ifndef PIC_CFLAGS
-	$(error Python extension must be built using the --enable-pic configure option.)
-endif
-	cython $<
-	$(CC) $(DEBUG_CFLAGS) $(OPENMP_CFLAGS) $(PIC_CFLAGS) `python-config --cflags` -I$(INCDIR) -o $(<:.pyx=.o) -c $(<:.pyx=.c)
-	$(CL) -shared $(<:.pyx=.o) $(OPENMP_CFLAGS) $(CYTHON_OBJ) -L$(LIBDIR) -lsharp -lfftpack -lc_utils -L`python-config --prefix`/lib `python-config --ldflags` -o $@
-
-python: $(all_lib) hdrcopy $(CYTHON_MODULES)
-
-# the following test files are automatic; the sht wrapper test
-# must be run manually and requires MPI at the moment..
-pytest: python
-	cd python && nosetests --nocapture libsharp/tests/test_legendre_table.py libsharp/tests/test_legendre.py
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..0d40b92
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,48 @@
+ACLOCAL_AMFLAGS = -I m4
+
+lib_LTLIBRARIES = libsharp.la
+
+src_sharp = \
+  libsharp/sharp.c \
+  libsharp/sharp_almhelpers.c \
+  libsharp/sharp_announce.c \
+  libsharp/sharp_core.c \
+  libsharp/sharp_geomhelpers.c \
+  libsharp/sharp_legendre_roots.c \
+  libsharp/sharp_ylmgen_c.c \
+  libsharp/sharp_announce.h \
+  libsharp/sharp_complex_hacks.h \
+  libsharp/sharp_core.h \
+  libsharp/sharp_internal.h \
+  libsharp/sharp_legendre_roots.h \
+  libsharp/sharp_lowlevel.h \
+  libsharp/sharp_vecsupport.h \
+  libsharp/sharp_vecutil.h \
+  libsharp/sharp_ylmgen_c.h
+
+include_HEADERS = \
+  libsharp/sharp.h \
+  libsharp/sharp_lowlevel.h \
+  libsharp/sharp_geomhelpers.h \
+  libsharp/sharp_almhelpers.h \
+  libsharp/sharp_cxx.h
+
+EXTRA_DIST = \
+  libsharp/sharp_core_inc.c \
+  libsharp/sharp_core_inc2.c \
+  libsharp/sharp_core_inchelper.c
+
+libsharp_la_SOURCES = $(src_sharp)
+
+#check_PROGRAMS = ffttest
+#ffttest_SOURCES = ffttest.c
+#ffttest_LDADD = libpocketfft.la -lm
+
+#TESTS = ffttest
+
+AM_CPPFLAGS = -I$(top_srcdir)
+
+pkgconfigdir = $(libdir)/pkgconfig
+nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
+
+DISTCLEANFILES=@PACKAGE_NAME@.pc @PACKAGE_NAME@.pc.in  @PACKAGE_NAME@-uninstalled.pc  @PACKAGE_NAME@-uninstalled.sh
diff --git a/README.md b/README.md
deleted file mode 100644
index 24652b2..0000000
--- a/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Libsharp
-
-*IMPORTANT NOTE*: It appears that the default branch upon cloning from
-github.com/dagss/libsharp was an outdated 'dagss' branch instead of
-the 'master' branch. To get the latest copy,
-please do `git checkout master; git pull`. New clones are no longer affected.
-
-## Paper
-
-https://arxiv.org/abs/1303.4945
-
-## Compilation
-
-GNU make is required for compilation.
-
-Libsharp compilation has been successfully tested with GNU and Intel compilers.
-When using gcc, version 4.x is required [1].
-Since libsharp was written in standard C99, other compilers should work fine,
-but SSE2/AVX support will most likely be deactivated.
-
-If you obtained libsharp directly from the git repository, you will also
-need a copy of the GNU autotools. In this case, run "autoconf" in libsharp's
-main directory before any other steps.
-For libsharp releases distributed as a .tar.gz file, this step is not necessary.
-
-Afterwards, simply run "./configure"; if this fails, please refer to the output
-of "./configure --help" for additional hints and, if necessary, provide
-additional flags to the configure script.
-Once the script finishes successfully, run "make"
-(or "gmake"). This should install the compilation products in the
-subdirectory "auto/".
-
-Documentation can be created by the command "(g)make doc".
-However this requires the doxygen application to be installed
-on your system.
-The documentation will be created in the subdirectory doc/.
-
-
-[1] Some versions of the gcc 4.4.x release series contain a bug which causes
-the compiler to crash during libsharp compilation. This appears to be fixed
-in the gcc 4.4.7 release. It is possible to work around this problem by adding
-the compiler flag "-fno-tree-fre" after the other optimization flags - the
-configure script should do this automatically.
diff --git a/c_utils/planck.make b/c_utils/planck.make
deleted file mode 100644
index 4f0ccb1..0000000
--- a/c_utils/planck.make
+++ /dev/null
@@ -1,18 +0,0 @@
-PKG:=c_utils
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libc_utils.a
-
-OBJ:=c_utils.o walltime_c.o memusage.o
-OBJ:=$(OBJ:%=$(OD)/%)
-
-$(OBJ): $(HDR_$(PKG)) | $(OD)_mkdir
-$(LIB_$(PKG)): $(OBJ)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
diff --git a/config/config.auto.in b/config/config.auto.in
deleted file mode 100644
index 841cec0..0000000
--- a/config/config.auto.in
+++ /dev/null
@@ -1,12 +0,0 @@
-@SILENT_RULE@
-
-CC=@CC@
-CL=@CC@
-CCFLAGS_NO_C=@CCFLAGS_NO_C@
-CCFLAGS=$(CCFLAGS_NO_C) -c
-CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm
-DEBUG_CFLAGS=@DEBUG_CFLAGS@
-MPI_CFLAGS=@MPI_CFLAGS@
-OPENMP_CFLAGS=@OPENMP_CFLAGS@
-PIC_CFLAGS=@PIC_CFLAGS@
-ARCREATE=@ARCREATE@
diff --git a/config/rules.common b/config/rules.common
deleted file mode 100644
index bac2a2c..0000000
--- a/config/rules.common
+++ /dev/null
@@ -1,33 +0,0 @@
-BLDROOT   = $(SRCROOT)/build.$(SHARP_TARGET)
-PREFIX    = $(SRCROOT)/$(SHARP_TARGET)
-BINDIR    = $(PREFIX)/bin
-INCDIR    = $(PREFIX)/include
-LIBDIR    = $(PREFIX)/lib
-DOCDIR    = $(SRCROOT)/doc
-PYTHONDIR = $(SRCROOT)/python/libsharp
-
-# do not use any suffix rules
-.SUFFIXES:
-# do not use any default rules
-.DEFAULT:
-
-echo_config:
-	@echo using configuration \'$(SHARP_TARGET)\'
-
-$(BLDROOT)/%.o : $(SRCROOT)/%.c | echo_config
-	@echo "#  compiling $*.c"
-	cd $(@D) && $(CC) $(FULL_INCLUDE) -I$(BLDROOT) $(CCFLAGS) $<
-
-$(BLDROOT)/%.o : $(SRCROOT)/%.cc | echo_config
-	@echo "#  compiling $*.cc"
-	cd $(@D) && $(CXX) $(FULL_INCLUDE) -I$(BLDROOT) $(CXXCFLAGS) $<
-
-%_mkdir:
-	@if [ ! -d $* ]; then mkdir -p $* ; fi
-
-clean:
-	rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status
-	rm -rf $(PYTHONDIR)/*.c $(PYTHONDIR)/*.o $(PYTHONDIR)/*.so
-
-distclean: clean
-	rm -f config/config.auto
diff --git a/configure.ac b/configure.ac
index 79b435e..9d8e203 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,113 +1,80 @@
-AC_INIT(config/config.auto.in)
+AC_INIT([libsharp], [1.0.0])
+AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
+AM_MAINTAINER_MODE([enable])
 
-AC_CHECK_PROG([uname_found],[uname],[1],[0])
-if test $uname_found -eq 0 ; then
-    echo "No uname found; setting system type to unknown."
-    system="unknown"
-else
-    system=`uname -s`-`uname -r`
-fi
-AC_LANG([C])
+dnl
+dnl Needed for linking on Windows.
+dnl Protect with m4_ifdef because AM_PROG_AR is required in
+dnl automake >= 1.12 when using -Wall, but the macro is
+dnl absent in old versions of automake.
+dnl
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 
-AC_TRY_COMPILE([], [@%:@ifndef __INTEL_COMPILER
-choke me
-@%:@endif], [ICC=[yes]], [ICC=[no]])
+LT_INIT
+AC_CONFIG_MACRO_DIR([m4])
 
-if test $ICC = yes; then GCC=no; fi
-CCTYPE=unknown
-if test $GCC = yes; then CCTYPE=gcc; fi
-if test $ICC = yes; then CCTYPE=icc; fi
-AC_OPENMP
+dnl
+dnl By default, install the headers into a subdirectory of
+dnl ${prefix}/include to avoid possible header filename collisions.
+dnl
+includedir="${includedir}/${PACKAGE_NAME}"
 
-SILENT_RULE=".SILENT:"
-AC_ARG_ENABLE(noisy-make,
-  [  --enable-noisy-make     enable detailed make output],
-  [if test "$enableval" = yes; then
-     SILENT_RULE=""
-   fi])
+dnl
+dnl Enable silent build rules if this version of Automake supports them
+dnl
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
-ENABLE_MPI=no
-AC_ARG_ENABLE(mpi,
-  [  --enable-mpi            enable generation of MPI-parallel code],
-  [if test "$enableval" = yes; then
-     ENABLE_MPI=yes
-   fi])
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAG
 
-ENABLE_DEBUG=no
-AC_ARG_ENABLE(debug,
-  [  --enable-debug          enable generation of debugging symbols],
-  [if test "$enableval" = yes; then
-     ENABLE_DEBUG=yes
-   fi])
+dnl
+dnl Introduce --enable-native-optimizations command line argument to turn on
+dnl -march=native compiler flag, disabled by default.
+dnl
+AC_ARG_ENABLE(
+    [native-optimizations],
+    [AS_HELP_STRING([--enable-native-optimizations], [Enable non-portable optimizations for your own CPU by compiling with -march=native @<:@default=no@:>@])]
+)
 
-ENABLE_PIC=no
-AC_ARG_ENABLE(pic,
-  [  --enable-pic            enable generation of position independent code],
-  [if test "$enableval" = yes; then
-     ENABLE_PIC=yes
-   fi])
+AC_PROG_CC_C99
+AS_IF(
+    [test "x$enable_native_optimizations" = "xyes"],
+    [AX_CHECK_COMPILE_FLAG([-march=native],[CC="$CC -march=native"])],
+    dnl Fallback branch, taken when native optimizations were not requested.
+    dnl FIXME: On GCC 4.4, we hit an internal compiler error unless either
+    dnl -march=native or -fno-tree-fre is specified.
+    dnl The two conditions are joined with && because test -a is not portable.
+    [
+        AS_IF(
+            [test "x$GCC" = "xyes" && test "x`$CC -dumpversion | cut -d. -f1,2`" = "x4.4"],
+            [AX_CHECK_COMPILE_FLAG([-fno-tree-fre], [CFLAGS="$CFLAGS -fno-tree-fre"])]
+        )
+    ]
+)
+AX_CHECK_COMPILE_FLAG([-fno-math-errno],[CFLAGS="$CFLAGS -fno-math-errno"])
+AX_CHECK_COMPILE_FLAG([-fno-trapping-math],[CFLAGS="$CFLAGS -fno-trapping-math"])
+AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"])
+AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"])
+AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"])
 
-case $CCTYPE in
-  gcc)
-    CCFLAGS="-O3 -fno-tree-vectorize -ffast-math -fomit-frame-pointer -std=c99 -pedantic -Wextra -Wall -Wno-unknown-pragmas -Wshadow -Wmissing-prototypes -Wfatal-errors"
-    GCCVERSION="`$CC -dumpversion 2>&1`"
-    echo "Using gcc version $GCCVERSION"
-    AC_SUBST(GCCVERSION)
-    changequote(,)
-    gcc43=`echo $GCCVERSION | grep -c '^4\.[3456789]'`
-    gcc44=`echo $GCCVERSION | grep -c '^4\.4'`
-    changequote([,])
-    if test $gcc43 -gt 0; then
-      CCFLAGS="$CCFLAGS -march=native"
-    fi
-    if test $gcc44 -gt 0; then
-      CCFLAGS="$CCFLAGS -fno-tree-fre"
-    fi
-    ;;
-  icc)
-    CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572"
-    ;;
-  *)
-    CCFLAGS="-O2"
-    # Don't do anything now
-    ;;
-esac
+AC_PROG_LIBTOOL
 
-case $system in
-  Darwin-*)
-    ARCREATE="libtool -static -o"
-    ;;
-  *)
-    ARCREATE="ar cr"
-    ;;
-esac
+dnl
+dnl Create pkgconfig .pc file.
+dnl
+AX_CREATE_PKGCONFIG_INFO(,,,,[])
 
-if test $ENABLE_DEBUG = yes; then
-  DEBUG_CFLAGS="-g"
-fi
-
-if test $ENABLE_PIC = yes; then
-  PIC_CFLAGS="-fPIC"
-fi
-
-if test $ENABLE_MPI = yes; then
-  MPI_CFLAGS="-DUSE_MPI"
-fi
-
-CCFLAGS="$CCFLAGS $DEBUG_CFLAGS $OPENMP_CFLAGS $PIC_CFLAGS $MPI_CFLAGS"
-
-CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS"
-
-LDCCFLAGS="$LDFLAGS $CCFLAGS"
-
-AC_SUBST(SILENT_RULE)
-AC_SUBST(CC)
-AC_SUBST(CCFLAGS_NO_C)
-AC_SUBST(LDCCFLAGS)
-AC_SUBST(DEBUG_CFLAGS)
-AC_SUBST(MPI_CFLAGS)
-AC_SUBST(OPENMP_CFLAGS)
-AC_SUBST(PIC_CFLAGS)
-AC_SUBST(ARCREATE)
-
-AC_OUTPUT(config/config.auto)
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
diff --git a/docsrc/c_utils.dox b/docsrc/c_utils.dox
deleted file mode 100644
index daf432f..0000000
--- a/docsrc/c_utils.dox
+++ /dev/null
@@ -1,290 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS C support library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-INPUT                  = ../c_utils
-INPUT_ENCODING         = UTF-8
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-TAGFILES               =
-GENERATE_TAGFILE       = c_utils.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
diff --git a/docsrc/footer.html b/docsrc/footer.html
deleted file mode 100644
index 6f5dbf0..0000000
--- a/docsrc/footer.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<hr><address style="align: right;"><small>
-Generated on $datetime for $projectname
-</a> </small></address>
-</body>
-</html>
diff --git a/docsrc/index_code.html b/docsrc/index_code.html
deleted file mode 100644
index d8a001d..0000000
--- a/docsrc/index_code.html
+++ /dev/null
@@ -1,15 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
-<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
-<title>Libsharp source code documentation</title>
-</head><body>
-<H1>Libsharp source code documentation</H1>
-
-<H2>C interfaces</H2>
-
-<ul>
-<li><a href="c_utils/index.html">C support library</a>
-<li><a href="libfftpack/index.html">FFT interface</a>
-<li><a href="libsharp/index.html">Library for spherical harmonic transforms</a>
-</ul>
-</body>
-</html>
diff --git a/docsrc/libfftpack.dox b/docsrc/libfftpack.dox
deleted file mode 100644
index 7ff2c23..0000000
--- a/docsrc/libfftpack.dox
+++ /dev/null
@@ -1,290 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS FFT library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-INPUT                  = ../libfftpack
-INPUT_ENCODING         = UTF-8
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-TAGFILES               = c_utils.tag=../c_utils
-GENERATE_TAGFILE       = libfftpack.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
diff --git a/docsrc/libsharp.dox b/docsrc/libsharp.dox
deleted file mode 100644
index b476ab4..0000000
--- a/docsrc/libsharp.dox
+++ /dev/null
@@ -1,291 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS SHT library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-INPUT                  = ../libsharp
-INPUT_ENCODING         = UTF-8
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-TAGFILES               = libfftpack.tag=../libfftpack \
-                         c_utils.tag=../c_utils
-GENERATE_TAGFILE       = libsharp.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
diff --git a/docsrc/planck.make b/docsrc/planck.make
deleted file mode 100644
index 0d0a462..0000000
--- a/docsrc/planck.make
+++ /dev/null
@@ -1,20 +0,0 @@
-PKG:=docsrc
-
-docsrc_idx: $(DOCDIR)_mkdir
-	cp $(SRCROOT)/docsrc/index_code.html $(DOCDIR)/index.html
-
-docsrc_code_doc: $(DOCDIR)_mkdir docsrc_idx
-	cd $(SRCROOT)/docsrc; \
-	for i in c_utils libfftpack libsharp; do \
-	  doxygen $${i}.dox; \
-	  rm -rf $(DOCDIR)/$${i}; mv htmldoc $(DOCDIR)/$${i}; \
-	done; \
-	rm *.tag;
-
-docsrc_clean:
-	cd $(SRCROOT)/docsrc; \
-	rm -f *.tag
-	cd $(SRCROOT)/docsrc; \
-	rm -rf htmldoc
-
-doc: docsrc_code_doc
diff --git a/fortran/sharp.f90 b/fortran/sharp.f90
deleted file mode 100644
index 36a1d11..0000000
--- a/fortran/sharp.f90
+++ /dev/null
@@ -1,286 +0,0 @@
-module sharp
-  use iso_c_binding
-  implicit none
-  ! alm_info flags
-  integer, parameter :: SHARP_PACKED = 1
-
-  ! sharp job types
-  enum, bind(c)
-      enumerator :: SHARP_YtW = 0
-      enumerator :: SHARP_Y = 1
-      enumerator :: SHARP_Yt = 2
-      enumerator :: SHARP_WY = 3
-      enumerator :: SHARP_ALM2MAP_DERIV1 = 4
-   end enum
-
-  ! sharp job flags
-  integer, parameter :: SHARP_DP             = ISHFT(1, 4)
-  integer, parameter :: SHARP_ADD            = ISHFT(1, 5)
-  integer, parameter :: SHARP_REAL_HARMONICS = ISHFT(1, 6)
-  integer, parameter :: SHARP_NO_FFT         = ISHFT(1, 7)
-
-  type sharp_geom_info
-     type(c_ptr) :: handle
-     integer(c_intptr_t) :: n_local
-  end type sharp_geom_info
-
-  type sharp_alm_info
-     type(c_ptr) :: handle
-     integer(c_intptr_t) :: n_local
-  end type sharp_alm_info
-
-  interface
-
-     ! alm_info
-     subroutine sharp_make_general_alm_info( &
-         lmax, nm, stride, mval, mvstart, flags, alm_info) bind(c)
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: lmax, nm, stride, flags
-       integer(c_int), intent(in)           :: mval(nm)
-       integer(c_intptr_t), intent(in)     :: mvstart(nm)
-       type(c_ptr), intent(out)             :: alm_info
-     end subroutine sharp_make_general_alm_info
-
-     subroutine c_sharp_make_mmajor_real_packed_alm_info( &
-         lmax, stride, nm, ms, alm_info) bind(c, name='sharp_make_mmajor_real_packed_alm_info')
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: lmax, nm, stride
-       integer(c_int), intent(in), optional :: ms(nm)
-       type(c_ptr), intent(out)             :: alm_info
-     end subroutine c_sharp_make_mmajor_real_packed_alm_info
-
-     function c_sharp_alm_count(alm_info) bind(c, name='sharp_alm_count')
-       use iso_c_binding
-       integer(c_intptr_t)           :: c_sharp_alm_count
-       type(c_ptr), value, intent(in) :: alm_info
-     end function c_sharp_alm_count
-
-     subroutine c_sharp_destroy_alm_info(alm_info) bind(c, name='sharp_destroy_alm_info')
-       use iso_c_binding
-       type(c_ptr), value                   :: alm_info
-     end subroutine c_sharp_destroy_alm_info
-
-     ! geom_info
-     subroutine sharp_make_subset_healpix_geom_info ( &
-          nside, stride, nrings, rings, weight, geom_info) bind(c)
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: nside, stride, nrings
-       integer(c_int), intent(in), optional :: rings(nrings)
-       real(c_double), intent(in), optional :: weight(2 * nside)
-       type(c_ptr), intent(out)             :: geom_info
-     end subroutine sharp_make_subset_healpix_geom_info
-
-     subroutine c_sharp_destroy_geom_info(geom_info) bind(c, name='sharp_destroy_geom_info')
-       use iso_c_binding
-       type(c_ptr), value                   :: geom_info
-     end subroutine c_sharp_destroy_geom_info
-
-     function c_sharp_map_size(info) bind(c, name='sharp_map_size')
-       use iso_c_binding
-       integer(c_intptr_t) :: c_sharp_map_size
-       type(c_ptr), value   :: info
-     end function c_sharp_map_size
-
-
-     ! execute
-     subroutine c_sharp_execute(type, spin, alm, map, geom_info, alm_info, ntrans, &
-                                flags, time, opcnt) bind(c, name='sharp_execute')
-       use iso_c_binding
-       integer(c_int), value                        :: type, spin, ntrans, flags
-       type(c_ptr), value                           :: alm_info, geom_info
-       real(c_double), intent(out), optional        :: time
-       integer(c_long_long), intent(out), optional  :: opcnt
-       type(c_ptr), intent(in)                      :: alm(*), map(*)
-     end subroutine c_sharp_execute
-
-     subroutine c_sharp_execute_mpi(comm, type, spin, alm, map, geom_info, alm_info, ntrans, &
-                                    flags, time, opcnt) bind(c, name='sharp_execute_mpi_fortran')
-       use iso_c_binding
-       integer(c_int), value                        :: comm, type, spin, ntrans, flags
-       type(c_ptr), value                           :: alm_info, geom_info
-       real(c_double), intent(out), optional        :: time
-       integer(c_long_long), intent(out), optional  :: opcnt
-       type(c_ptr), intent(in)                      :: alm(*), map(*)
-     end subroutine c_sharp_execute_mpi
-
-     ! Legendre transforms
-     subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
-          bind(c, name='sharp_legendre_transform')
-       use iso_c_binding
-       integer(c_intptr_t), value :: lmax, nx
-       real(c_double) :: bl(lmax + 1), x(nx), out(nx)
-       real(c_double), optional :: recfac(lmax + 1)
-     end subroutine c_sharp_legendre_transform
-
-     subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
-          bind(c, name='sharp_legendre_transform_s')
-       use iso_c_binding
-       integer(c_intptr_t), value :: lmax, nx
-       real(c_float) :: bl(lmax + 1), x(nx), out(nx)
-       real(c_float), optional :: recfac(lmax + 1)
-     end subroutine c_sharp_legendre_transform_s
-  end interface
-
-  interface sharp_execute
-     module procedure sharp_execute_d
-  end interface
-
-  interface sharp_legendre_transform
-     module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
-  end interface sharp_legendre_transform
-
-contains
-  ! alm info
-
-  ! if ms is not passed, we default to using m=0..lmax.
-  subroutine sharp_make_mmajor_real_packed_alm_info(lmax, ms, alm_info)
-    use iso_c_binding
-    integer(c_int), value, intent(in)    :: lmax
-    integer(c_int), intent(in), optional :: ms(:)
-    type(sharp_alm_info), intent(out)    :: alm_info
-    !--
-    integer(c_int), allocatable          :: ms_copy(:)
-    integer(c_int)                       :: nm
-
-    if (present(ms)) then
-       nm = size(ms)
-       allocate(ms_copy(nm))
-       ms_copy = ms
-       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, nm, ms_copy, alm_info=alm_info%handle)
-       deallocate(ms_copy)
-    else
-       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, lmax + 1, alm_info=alm_info%handle)
-    end if
-    alm_info%n_local = c_sharp_alm_count(alm_info%handle)
-  end subroutine sharp_make_mmajor_real_packed_alm_info
-
-  subroutine sharp_destroy_alm_info(alm_info)
-    use iso_c_binding
-    type(sharp_alm_info), intent(inout) :: alm_info
-    call c_sharp_destroy_alm_info(alm_info%handle)
-    alm_info%handle = c_null_ptr
-  end subroutine sharp_destroy_alm_info
-
-
-  ! geom info
-  subroutine sharp_make_healpix_geom_info(nside, rings, weight, geom_info)
-    integer(c_int), value                :: nside
-    integer(c_int), optional             :: rings(:)
-    real(c_double), intent(in), optional :: weight(2 * nside)
-    type(sharp_geom_info), intent(out)   :: geom_info
-    !--
-    integer(c_int) :: nrings
-    integer(c_int), allocatable :: rings_copy(:)
-
-    if (present(rings)) then
-       nrings = size(rings)
-       allocate(rings_copy(nrings))
-       rings_copy = rings
-       call sharp_make_subset_healpix_geom_info(nside, 1, nrings, rings_copy, &
-                                                weight, geom_info%handle)
-       deallocate(rings_copy)
-    else
-       call sharp_make_subset_healpix_geom_info(nside, 1, nrings=4 * nside - 1, &
-                                                weight=weight, geom_info=geom_info%handle)
-    end if
-    geom_info%n_local = c_sharp_map_size(geom_info%handle)
-  end subroutine sharp_make_healpix_geom_info
-
-  subroutine sharp_destroy_geom_info(geom_info)
-    use iso_c_binding
-    type(sharp_geom_info), intent(inout) :: geom_info
-    call c_sharp_destroy_geom_info(geom_info%handle)
-    geom_info%handle = c_null_ptr
-  end subroutine sharp_destroy_geom_info
-
-
-  ! Currently the only mode supported is stacked (not interleaved) maps.
-  !
-  ! Note that passing the exact dimension of alm/map is necesarry, it
-  ! prevents the caller from doing too crazy slicing prior to pass array
-  ! in...
-  !
-  ! Usage:
-  !
-  ! The alm array must have shape exactly alm(alm_info%n_local, nmaps)
-  ! The maps array must have shape exactly map(map_info%n_local, nmaps).
-  subroutine sharp_execute_d(type, spin, nmaps, alm, alm_info, map, geom_info, &
-                             add, time, opcnt, comm)
-    use iso_c_binding
-    use mpi
-    implicit none
-    integer(c_int), value                        :: type, spin, nmaps
-    integer(c_int), optional                     :: comm
-    logical, value, optional                     :: add  ! should add instead of replace out
-
-    type(sharp_alm_info)                         :: alm_info
-    type(sharp_geom_info)                        :: geom_info
-    real(c_double), intent(out), optional        :: time
-    integer(c_long_long), intent(out), optional  :: opcnt
-    real(c_double), target, intent(inout)        :: alm(0:alm_info%n_local - 1, 1:nmaps)
-    real(c_double), target, intent(inout)        :: map(0:geom_info%n_local - 1, 1:nmaps)
-    !--
-    integer(c_int)         :: mod_flags, ntrans, k
-    type(c_ptr), target    :: alm_ptr(nmaps)
-    type(c_ptr), target    :: map_ptr(nmaps)
-
-    mod_flags = SHARP_DP
-    if (present(add) .and. add) then
-       mod_flags = or(mod_flags, SHARP_ADD)
-    end if
-
-    if (spin == 0) then
-       ntrans = nmaps
-    else
-       ntrans = nmaps / 2
-    end if
-
-    ! Set up pointer table to access maps
-    alm_ptr(:) = c_null_ptr
-    map_ptr(:) = c_null_ptr
-    do k = 1, nmaps
-       if (alm_info%n_local > 0) alm_ptr(k) = c_loc(alm(0, k))
-       if (geom_info%n_local > 0) map_ptr(k) = c_loc(map(0, k))
-    end do
-
-    if (present(comm)) then
-      call c_sharp_execute_mpi(comm, type, spin, alm_ptr, map_ptr, &
-          geom_info=geom_info%handle, &
-          alm_info=alm_info%handle, &
-          ntrans=ntrans, &
-          flags=mod_flags, &
-          time=time, &
-          opcnt=opcnt)
-    else
-      call c_sharp_execute(type, spin, alm_ptr, map_ptr, &
-          geom_info=geom_info%handle, &
-          alm_info=alm_info%handle, &
-          ntrans=ntrans, &
-          flags=mod_flags, &
-          time=time, &
-          opcnt=opcnt)
-   end if
-  end subroutine sharp_execute_d
-
-  subroutine sharp_legendre_transform_d(bl, x, out)
-    use iso_c_binding
-    real(c_double) :: bl(:)
-    real(c_double) :: x(:), out(size(x))
-    !--
-    integer(c_intptr_t) :: lmax, nx
-    call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_intptr_t), &
-                                    x=x, out=out, nx=int(size(x), c_intptr_t))
-  end subroutine sharp_legendre_transform_d
-
-  subroutine sharp_legendre_transform_s(bl, x, out)
-    use iso_c_binding
-    real(c_float) :: bl(:)
-    real(c_float) :: x(:), out(size(x))
-    !--
-    integer(c_intptr_t) :: lmax, nx
-    call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_intptr_t), &
-                                      x=x, out=out, nx=int(size(x), c_intptr_t))
-  end subroutine sharp_legendre_transform_s
-
-
-end module
diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90
deleted file mode 100644
index 0b7cce2..0000000
--- a/fortran/test_sharp.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-program test_sharp
-  use mpi
-  use sharp
-  use iso_c_binding, only : c_ptr, c_double
-  implicit none
-
-  integer, parameter :: lmax = 2, nside = 2
-  type(sharp_alm_info) :: alm_info
-  type(sharp_geom_info) :: geom_info
-
-  real(c_double), dimension(0:(lmax + 1)**2 - 1, 1:1) :: alm
-  real(c_double), dimension(0:12*nside**2 - 1, 1:1) :: map
-
-  integer(c_int), dimension(1:lmax + 1) :: ms
-  integer(c_int), dimension(1:4 * nside - 1) :: rings
-  integer(c_int) :: nm, m, nrings, iring
-  integer :: nodecount, rank, ierr
-
-  call MPI_Init(ierr)
-  call MPI_Comm_size(MPI_COMM_WORLD, nodecount, ierr)
-  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
-
-  nm = 0
-  do m = rank, lmax, nodecount
-     nm = nm + 1
-     ms(nm) = m
-  end do
-
-  nrings = 0
-  do iring = rank + 1, 4 * nside - 1, nodecount
-     nrings = nrings + 1
-     rings(nrings) = iring
-  end do
-
-  alm = 0
-  map = 0
-  if (rank == 0) then
-    alm(0, 1) = 1
-  end if
-
-  print *, ms(1:nm)
-  call sharp_make_mmajor_real_packed_alm_info(lmax, ms=ms(1:nm), alm_info=alm_info)
-  print *, 'alm_info%n_local', alm_info%n_local
-  call sharp_make_healpix_geom_info(nside, rings=rings(1:nrings), geom_info=geom_info)
-  print *, 'geom_info%n_local', geom_info%n_local
-  print *, 'execute'
-  call sharp_execute(SHARP_Y, 0, 1, alm, alm_info, map, geom_info, comm=MPI_COMM_WORLD)
-
-  print *, alm
-  print *, map
-
-  call sharp_destroy_alm_info(alm_info)
-  call sharp_destroy_geom_info(geom_info)
-  print *, 'DONE'
-  call MPI_Finalize(ierr)
-
-  print *, 'LEGENDRE TRANSFORMS'
-
-  call test_legendre_transforms()
-
-contains
-  subroutine test_legendre_transforms()
-    integer, parameter :: lmax = 20, nx=10
-    real(c_double) :: bl(0:lmax)
-    real(c_double) :: x(nx), out(nx)
-    real(c_float) :: out_s(nx)
-    !--
-    integer :: l, i
-
-    do l = 0, lmax
-       bl(l) = 1.0 / real(l + 1, c_double)
-    end do
-    do i = 1, nx
-       x(i) = 1 / real(i, c_double)
-    end do
-    out = 0
-    call sharp_legendre_transform(bl, x, out)
-    print *, out
-    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
-    print *, out_s
-  end subroutine test_legendre_transforms
-
-
-end program test_sharp
diff --git a/libfftpack/README b/libfftpack/README
deleted file mode 100644
index 2c7e7cb..0000000
--- a/libfftpack/README
+++ /dev/null
@@ -1,34 +0,0 @@
-ls_fft description:
-
-This package is intended to calculate one-dimensional real or complex FFTs
-with high accuracy and good efficiency even for lengths containing large
-prime factors.
-The code is written in C, but a Fortran wrapper exists as well.
-
-Before any FFT is executed, a plan must be generated for it. Plan creation
-is designed to be fast, so that there is no significant overhead if the
-plan is only used once or a few times.
-
-The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
-double precision incarnation by Hugh C. Pumphrey
-(http://www.netlib.org/fftpack/dp.tgz).
-
-I replaced the iterative sine and cosine calculations in radfg() and radbg()
-by an exact calculation, which slightly improves the transform accuracy for
-real FFTs with lengths containing large prime factors.
-
-Since FFTPACK becomes quite slow for FFT lengths with large prime factors
-(in the worst case of prime lengths it reaches O(n*n) complexity), I
-implemented Bluestein's algorithm, which computes a FFT of length n by
-several FFTs of length n2>=2*n-1 and a convolution. Since n2 can be chosen
-to be highly composite, this algorithm is more efficient if n has large
-prime factors. The longer FFTs themselves are then computed using the FFTPACK
-routines.
-Bluestein's algorithm was implemented according to the description at
-http://en.wikipedia.org/wiki/Bluestein's_FFT_algorithm.
-
-Thread-safety:
-All routines can be called concurrently; all information needed by ls_fft
-is stored in the plan variable. However, using the same plan variable on
-multiple threads simultaneously is not supported and will lead to data
-corruption.
diff --git a/libfftpack/bluestein.c b/libfftpack/bluestein.c
deleted file mode 100644
index 2e2005c..0000000
--- a/libfftpack/bluestein.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005, 2006, 2007, 2008 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include "fftpack.h"
-#include "bluestein.h"
-
-/* returns the sum of all prime factors of n */
-size_t prime_factor_sum (size_t n)
-  {
-  size_t result=0,x,limit,tmp;
-  while (((tmp=(n>>1))<<1)==n)
-    { result+=2; n=tmp; }
-
-  limit=(size_t)sqrt(n+0.01);
-  for (x=3; x<=limit; x+=2)
-  while ((tmp=(n/x))*x==n)
-    {
-    result+=x;
-    n=tmp;
-    limit=(size_t)sqrt(n+0.01);
-    }
-  if (n>1) result+=n;
-
-  return result;
-  }
-
-/* returns the smallest composite of 2, 3 and 5 which is >= n */
-static size_t good_size(size_t n)
-  {
-  size_t f2, f23, f235, bestfac=2*n;
-  if (n<=6) return n;
-
-  for (f2=1; f2<bestfac; f2*=2)
-    for (f23=f2; f23<bestfac; f23*=3)
-      for (f235=f23; f235<bestfac; f235*=5)
-        if (f235>=n) bestfac=f235;
-  return bestfac;
-  }
-
-void bluestein_i (size_t n, double **tstorage, size_t *worksize)
-  {
-  static const double pi=3.14159265358979323846;
-  size_t n2=good_size(n*2-1);
-  size_t m, coeff;
-  double angle, xn2;
-  double *bk, *bkf, *work;
-  double pibyn=pi/n;
-  *worksize=2+2*n+8*n2+16;
-  *tstorage = RALLOC(double,2+2*n+8*n2+16);
-  ((size_t *)(*tstorage))[0]=n2;
-  bk  = *tstorage+2;
-  bkf = *tstorage+2+2*n;
-  work= *tstorage+2+2*(n+n2);
-
-/* initialize b_k */
-  bk[0] = 1;
-  bk[1] = 0;
-
-  coeff=0;
-  for (m=1; m<n; ++m)
-    {
-    coeff+=2*m-1;
-    if (coeff>=2*n) coeff-=2*n;
-    angle = pibyn*coeff;
-    bk[2*m] = cos(angle);
-    bk[2*m+1] = sin(angle);
-    }
-
-/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
-  xn2 = 1./n2;
-  bkf[0] = bk[0]*xn2;
-  bkf[1] = bk[1]*xn2;
-  for (m=2; m<2*n; m+=2)
-    {
-    bkf[m]   = bkf[2*n2-m]   = bk[m]   *xn2;
-    bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2;
-    }
-  for (m=2*n;m<=(2*n2-2*n+1);++m)
-    bkf[m]=0.;
-  cffti (n2,work);
-  cfftf (n2,bkf,work);
-  }
-
-void bluestein (size_t n, double *data, double *tstorage, int isign)
-  {
-  size_t n2=*((size_t *)tstorage);
-  size_t m;
-  double *bk, *bkf, *akf, *work;
-  bk  = tstorage+2;
-  bkf = tstorage+2+2*n;
-  work= tstorage+2+2*(n+n2);
-  akf = tstorage+2+2*n+6*n2+16;
-
-/* initialize a_k and FFT it */
-  if (isign>0)
-    for (m=0; m<2*n; m+=2)
-      {
-      akf[m]   = data[m]*bk[m]   - data[m+1]*bk[m+1];
-      akf[m+1] = data[m]*bk[m+1] + data[m+1]*bk[m];
-      }
-  else
-    for (m=0; m<2*n; m+=2)
-      {
-      akf[m]   = data[m]*bk[m]   + data[m+1]*bk[m+1];
-      akf[m+1] =-data[m]*bk[m+1] + data[m+1]*bk[m];
-      }
-  for (m=2*n; m<2*n2; ++m)
-    akf[m]=0;
-
-  cfftf (n2,akf,work);
-
-/* do the convolution */
-  if (isign>0)
-    for (m=0; m<2*n2; m+=2)
-      {
-      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-  else
-    for (m=0; m<2*n2; m+=2)
-      {
-      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-
-
-/* inverse FFT */
-  cfftb (n2,akf,work);
-
-/* multiply by b_k* */
-  if (isign>0)
-    for (m=0; m<2*n; m+=2)
-      {
-      data[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
-      data[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  else
-    for (m=0; m<2*n; m+=2)
-      {
-      data[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
-      data[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  }
diff --git a/libfftpack/bluestein.h b/libfftpack/bluestein.h
deleted file mode 100644
index 91e5b28..0000000
--- a/libfftpack/bluestein.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_BLUESTEIN_H
-#define PLANCK_BLUESTEIN_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-size_t prime_factor_sum (size_t n);
-
-void bluestein_i (size_t n, double **tstorage, size_t *worksize);
-void bluestein (size_t n, double *data, double *tstorage, int isign);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libfftpack/fftpack.c b/libfftpack/fftpack.c
deleted file mode 100644
index 6d09d06..0000000
--- a/libfftpack/fftpack.c
+++ /dev/null
@@ -1,833 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.c : A set of FFT routines in C.
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  C port by Martin Reinecke (2010)
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "fftpack.h"
-
-#define WA(x,i) wa[(i)+(x)*ido]
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define PM(a,b,c,d) { a=c+d; b=c-d; }
-#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
-#define SCALEC(a,b) { a.r*=b; a.i*=b; }
-#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
-/* (a+ib) = conj(c+id) * (e+if) */
-#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
-
-typedef struct {
-  double r,i;
-} cmplx;
-
-#define CONCAT(a,b) a ## b
-
-#define X(arg) CONCAT(passb,arg)
-#define BACKWARD
-#include "fftpack_inc.c"
-#undef BACKWARD
-#undef X
-
-#define X(arg) CONCAT(passf,arg)
-#include "fftpack_inc.c"
-#undef X
-
-#undef CC
-#undef CH
-#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
-
-static void radf2 (size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=2;
-  size_t i, k, ic;
-  double ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      CH(    0,1,k) = -CC(ido-1,k,1);
-      CH(ido-1,0,k) =  CC(ido-1,k,0);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
-      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
-      }
-  }
-
-static void radf3(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-  size_t i, k, ic;
-  double ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
-
-  for (k=0; k<l1; k++)
-    {
-    cr2=CC(0,k,1)+CC(0,k,2);
-    CH(0,0,k) = CC(0,k,0)+cr2;
-    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
-    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
-    }
-  if (ido==1) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      cr2=dr2+dr3;
-      ci2=di2+di3;
-      CH(i-1,0,k) = CC(i-1,k,0)+cr2;
-      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
-      tr2 = CC(i-1,k,0)+taur*cr2;
-      ti2 = CC(i  ,k,0)+taur*ci2;
-      tr3 = taui*(di2-di3);
-      ti3 = taui*(dr3-dr2);
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2)
-      }
-  }
-
-static void radf4(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=4;
-  static const double hsqt2=0.70710678118654752440;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
-    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
-    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
-    }
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
-      tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
-      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
-      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      PM(tr1,tr4,cr4,cr2)
-      PM(ti1,ti4,ci2,ci4)
-      PM(tr2,tr3,CC(i-1,k,0),cr3)
-      PM(ti2,ti3,CC(i  ,k,0),ci3)
-      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
-      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
-      }
-  }
-
-static void radf5(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-  size_t i, k, ic;
-  double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
-         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
-    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
-    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
-    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
-    CH(0,2,k)=ti11*ci5+ti12*ci4;
-    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
-    CH(0,4,k)=ti12*ci5-ti11*ci4;
-    }
-  if (ido==1) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
-      PM(cr2,ci5,dr5,dr2)
-      PM(ci2,cr5,di2,di5)
-      PM(cr3,ci4,dr4,dr3)
-      PM(ci3,cr4,di3,di4)
-      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
-      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
-      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
-      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
-      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
-      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
-      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
-      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
-      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
-      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
-      }
-  }
-
-#undef CH
-#undef CC
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define C2(a,b) cc[(a)+idl1*(b)]
-#define CH2(a,b) ch[(a)+idl1*(b)]
-static void radfg(size_t ido, size_t ip, size_t l1, size_t idl1,
-  double *cc, double *ch, const double *wa)
-  {
-  const size_t cdim=ip;
-  static const double twopi=6.28318530717958647692;
-  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
-  double ai1, ai2, ar1, ar2, arg;
-  double *csarr;
-  size_t aidx;
-
-  ipph=(ip+1)/ 2;
-  if(ido!=1)
-    {
-    memcpy(ch,cc,idl1*sizeof(double));
-
-    for(j=1; j<ip; j++)
-      for(k=0; k<l1; k++)
-        {
-        CH(0,k,j)=C1(0,k,j);
-        idij=(j-1)*ido+1;
-        for(i=2; i<ido; i+=2,idij+=2)
-          MULPM(CH(i-1,k,j),CH(i,k,j),wa[idij-1],wa[idij],C1(i-1,k,j),C1(i,k,j))
-        }
-
-    for(j=1,jc=ip-1; j<ipph; j++,jc--)
-      for(k=0; k<l1; k++)
-        for(i=2; i<ido; i+=2)
-          {
-          PM(C1(i-1,k,j),C1(i  ,k,jc),CH(i-1,k,jc),CH(i-1,k,j ))
-          PM(C1(i  ,k,j),C1(i-1,k,jc),CH(i  ,k,j ),CH(i  ,k,jc))
-          }
-    }
-  else
-    memcpy(cc,ch,idl1*sizeof(double));
-
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      PM(C1(0,k,j),C1(0,k,jc),CH(0,k,jc),CH(0,k,j))
-
-  csarr=RALLOC(double,2*ip);
-  arg=twopi / ip;
-  csarr[0]=1.;
-  csarr[1]=0.;
-  csarr[2]=csarr[2*ip-2]=cos(arg);
-  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
-  for (i=2; i<=ip/2; ++i)
-    {
-    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
-    csarr[2*i+1]=sin(i*arg);
-    csarr[2*ip-2*i+1]=-csarr[2*i+1];
-    }
-  for(l=1,lc=ip-1; l<ipph; l++,lc--)
-    {
-    ar1=csarr[2*l];
-    ai1=csarr[2*l+1];
-    for(ik=0; ik<idl1; ik++)
-      {
-      CH2(ik,l)=C2(ik,0)+ar1*C2(ik,1);
-      CH2(ik,lc)=ai1*C2(ik,ip-1);
-      }
-    aidx=2*l;
-    for(j=2,jc=ip-2; j<ipph; j++,jc--)
-      {
-      aidx+=2*l;
-      if (aidx>=2*ip) aidx-=2*ip;
-      ar2=csarr[aidx];
-      ai2=csarr[aidx+1];
-      for(ik=0; ik<idl1; ik++)
-        {
-        CH2(ik,l )+=ar2*C2(ik,j );
-        CH2(ik,lc)+=ai2*C2(ik,jc);
-        }
-      }
-    }
-  DEALLOC(csarr);
-
-  for(j=1; j<ipph; j++)
-    for(ik=0; ik<idl1; ik++)
-      CH2(ik,0)+=C2(ik,j);
-
-  for(k=0; k<l1; k++)
-    memcpy(&CC(0,0,k),&CH(0,k,0),ido*sizeof(double));
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      {
-      CC(ido-1,j2-1,k) = CH(0,k,j );
-      CC(0    ,j2  ,k) = CH(0,k,jc);
-      }
-    }
-  if(ido==1) return;
-
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      for(i=2; i<ido; i+=2)
-        {
-        ic=ido-i;
-        PM (CC(i-1,j2,k),CC(ic-1,j2-1,k),CH(i-1,k,j ),CH(i-1,k,jc))
-        PM (CC(i  ,j2,k),CC(ic  ,j2-1,k),CH(i  ,k,jc),CH(i  ,k,j ))
-        }
-    }
-  }
-
-#undef CC
-#undef CH
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-
-static void radb2(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=2;
-  size_t i, k, ic;
-  double ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      CH(ido-1,k,0) =  2*CC(ido-1,0,k);
-      CH(ido-1,k,1) = -2*CC(0    ,1,k);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
-      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
-      }
-  }
-
-static void radb3(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-  size_t i, k, ic;
-  double ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    {
-    tr2=2*CC(ido-1,1,k);
-    cr2=CC(0,0,k)+taur*tr2;
-    CH(0,k,0)=CC(0,0,k)+tr2;
-    ci3=2*taui*CC(0,2,k);
-    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
-    }
-  if (ido==1) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      tr2=CC(i-1,2,k)+CC(ic-1,1,k);
-      ti2=CC(i  ,2,k)-CC(ic  ,1,k);
-      cr2=CC(i-1,0,k)+taur*tr2;
-      ci2=CC(i  ,0,k)+taur*ti2;
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2;
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
-      cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));
-      ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
-      PM(dr3,dr2,cr2,ci3)
-      PM(di2,di3,ci2,cr3)
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      }
-  }
-
-static void radb4(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=4;
-  static const double sqrt2=1.41421356237309504880;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
-    tr3=2*CC(ido-1,1,k);
-    tr4=2*CC(0,2,k);
-    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
-    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
-    }
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
-      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
-      CH(ido-1,k,0)=tr2+tr2;
-      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
-      CH(ido-1,k,2)=ti2+ti2;
-      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
-      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
-      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
-      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
-      PM (CH(i-1,k,0),cr3,tr2,tr3)
-      PM (CH(i  ,k,0),ci3,ti2,ti3)
-      PM (cr4,cr2,tr1,tr4)
-      PM (ci2,ci4,ti1,ti4)
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
-      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
-      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
-      }
-  }
-
-static void radb5(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4,
-         ti2, ti3, ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-
-  for (k=0; k<l1; k++)
-    {
-    ti5=2*CC(0,2,k);
-    ti4=2*CC(0,4,k);
-    tr2=2*CC(ido-1,1,k);
-    tr3=2*CC(ido-1,3,k);
-    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
-    cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
-    cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
-    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
-    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
-    }
-  if (ido==1) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
-      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
-      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
-      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
-      cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
-      ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
-      cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
-      ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
-      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
-      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-      PM(dr4,dr3,cr3,ci4)
-      PM(di3,di4,ci3,cr4)
-      PM(dr5,dr2,cr2,ci5)
-      PM(di2,di5,ci2,cr5)
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
-      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
-      }
-  }
-
-static void radbg(size_t ido, size_t ip, size_t l1, size_t idl1,
-  double *cc, double *ch, const double *wa)
-  {
-  const size_t cdim=ip;
-  static const double twopi=6.28318530717958647692;
-  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
-  double ai1, ai2, ar1, ar2, arg;
-  double *csarr;
-  size_t aidx;
-
-  ipph=(ip+1)/ 2;
-  for(k=0; k<l1; k++)
-    memcpy(&CH(0,k,0),&CC(0,0,k),ido*sizeof(double));
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      {
-      CH(0,k,j )=2*CC(ido-1,j2-1,k);
-      CH(0,k,jc)=2*CC(0    ,j2  ,k);
-      }
-    }
-
-  if(ido!=1)
-    for(j=1,jc=ip-1; j<ipph; j++,jc--)
-      for(k=0; k<l1; k++)
-        for(i=2; i<ido; i+=2)
-          {
-          ic=ido-i;
-          PM (CH(i-1,k,j ),CH(i-1,k,jc),CC(i-1,2*j,k),CC(ic-1,2*j-1,k))
-          PM (CH(i  ,k,jc),CH(i  ,k,j ),CC(i  ,2*j,k),CC(ic  ,2*j-1,k))
-          }
-
-  csarr=RALLOC(double,2*ip);
-  arg=twopi/ip;
-  csarr[0]=1.;
-  csarr[1]=0.;
-  csarr[2]=csarr[2*ip-2]=cos(arg);
-  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
-  for (i=2; i<=ip/2; ++i)
-    {
-    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
-    csarr[2*i+1]=sin(i*arg);
-    csarr[2*ip-2*i+1]=-csarr[2*i+1];
-    }
-  for(l=1; l<ipph; l++)
-    {
-    lc=ip-l;
-    ar1=csarr[2*l];
-    ai1=csarr[2*l+1];
-    for(ik=0; ik<idl1; ik++)
-      {
-      C2(ik,l)=CH2(ik,0)+ar1*CH2(ik,1);
-      C2(ik,lc)=ai1*CH2(ik,ip-1);
-      }
-    aidx=2*l;
-    for(j=2; j<ipph; j++)
-      {
-      jc=ip-j;
-      aidx+=2*l;
-      if (aidx>=2*ip) aidx-=2*ip;
-      ar2=csarr[aidx];
-      ai2=csarr[aidx+1];
-      for(ik=0; ik<idl1; ik++)
-        {
-        C2(ik,l )+=ar2*CH2(ik,j );
-        C2(ik,lc)+=ai2*CH2(ik,jc);
-        }
-      }
-    }
-  DEALLOC(csarr);
-
-  for(j=1; j<ipph; j++)
-    for(ik=0; ik<idl1; ik++)
-      CH2(ik,0)+=CH2(ik,j);
-
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      PM (CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc))
-
-  if(ido==1)
-    return;
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      for(i=2; i<ido; i+=2)
-        {
-        PM (CH(i-1,k,jc),CH(i-1,k,j ),C1(i-1,k,j),C1(i  ,k,jc))
-        PM (CH(i  ,k,j ),CH(i  ,k,jc),C1(i  ,k,j),C1(i-1,k,jc))
-        }
-  memcpy(cc,ch,idl1*sizeof(double));
-
-  for(j=1; j<ip; j++)
-    for(k=0; k<l1; k++)
-      {
-      C1(0,k,j)=CH(0,k,j);
-      idij=(j-1)*ido+1;
-      for(i=2; i<ido; i+=2,idij+=2)
-        MULPM (C1(i,k,j),C1(i-1,k,j),wa[idij-1],wa[idij],CH(i,k,j),CH(i-1,k,j))
-      }
-  }
-
-#undef CC
-#undef CH
-#undef PM
-#undef MULPM
-
-
-/*----------------------------------------------------------------------
-   cfftf1, cfftb1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
-  ----------------------------------------------------------------------*/
-
-static void cfft1(size_t n, cmplx c[], cmplx ch[], const cmplx wa[],
-  const size_t ifac[], int isign)
-  {
-  size_t k1, l1=1, nf=ifac[1], iw=0;
-  cmplx *p1=c, *p2=ch;
-
-  for(k1=0; k1<nf; k1++)
-    {
-    size_t ip=ifac[k1+2];
-    size_t l2=ip*l1;
-    size_t ido = n/l2;
-    if(ip==4)
-      (isign>0) ? passb4(ido, l1, p1, p2, wa+iw)
-                : passf4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      (isign>0) ? passb2(ido, l1, p1, p2, wa+iw)
-                : passf2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      (isign>0) ? passb3(ido, l1, p1, p2, wa+iw)
-                : passf3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      (isign>0) ? passb5(ido, l1, p1, p2, wa+iw)
-                : passf5(ido, l1, p1, p2, wa+iw);
-    else if(ip==6)
-      (isign>0) ? passb6(ido, l1, p1, p2, wa+iw)
-                : passf6(ido, l1, p1, p2, wa+iw);
-    else
-      (isign>0) ? passbg(ido, ip, l1, p1, p2, wa+iw)
-                : passfg(ido, ip, l1, p1, p2, wa+iw);
-    SWAP(p1,p2,cmplx *);
-    l1=l2;
-    iw+=(ip-1)*ido;
-    }
-  if (p1!=c)
-    memcpy (c,p1,n*sizeof(cmplx));
-  }
-
-void cfftf(size_t n, double c[], double wsave[])
-  {
-  if (n!=1)
-    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
-          (size_t*)(wsave+4*n),-1);
-  }
-
-void cfftb(size_t n, double c[], double wsave[])
-  {
-  if (n!=1)
-    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
-          (size_t*)(wsave+4*n),+1);
-  }
-
-static void factorize (size_t n, const size_t *pf, size_t npf, size_t *ifac)
-  {
-  size_t nl=n, nf=0, ntry=0, j=0, i;
-
-startloop:
-  j++;
-  ntry = (j<=npf) ? pf[j-1] : ntry+2;
-  do
-    {
-    size_t nq=nl / ntry;
-    size_t nr=nl-ntry*nq;
-    if (nr!=0)
-      goto startloop;
-    nf++;
-    ifac[nf+1]=ntry;
-    nl=nq;
-    if ((ntry==2) && (nf!=1))
-      {
-      for (i=nf+1; i>2; --i)
-        ifac[i]=ifac[i-1];
-      ifac[2]=2;
-      }
-    }
-  while(nl!=1);
-  ifac[0]=n;
-  ifac[1]=nf;
-  }
-
-static void cffti1(size_t n, double wa[], size_t ifac[])
-  {
-  static const size_t ntryh[5]={4,6,3,2,5};
-  static const double twopi=6.28318530717958647692;
-  size_t j, k, fi;
-
-  double argh=twopi/n;
-  size_t i=0, l1=1;
-  factorize (n,ntryh,5,ifac);
-  for(k=1; k<=ifac[1]; k++)
-    {
-    size_t ip=ifac[k+1];
-    size_t ido=n/(l1*ip);
-    for(j=1; j<ip; j++)
-      {
-      size_t is = i;
-      double argld=j*l1*argh;
-      wa[i  ]=1;
-      wa[i+1]=0;
-      for(fi=1; fi<=ido; fi++)
-        {
-        double arg=fi*argld;
-        i+=2;
-        wa[i  ]=cos(arg);
-        wa[i+1]=sin(arg);
-        }
-      if(ip>6)
-        {
-        wa[is  ]=wa[i  ];
-        wa[is+1]=wa[i+1];
-        }
-      }
-    l1*=ip;
-    }
-  }
-
-void cffti(size_t n, double wsave[])
-  { if (n!=1) cffti1(n, wsave+2*n,(size_t*)(wsave+4*n)); }
-
-
-/*----------------------------------------------------------------------
-   rfftf1, rfftb1, rfftf, rfftb, rffti1, rffti. Real FFTs.
-  ----------------------------------------------------------------------*/
-
-static void rfftf1(size_t n, double c[], double ch[], const double wa[],
-  const size_t ifac[])
-  {
-  size_t k1, l1=n, nf=ifac[1], iw=n-1;
-  double *p1=ch, *p2=c;
-
-  for(k1=1; k1<=nf;++k1)
-    {
-    size_t ip=ifac[nf-k1+2];
-    size_t ido=n / l1;
-    l1 /= ip;
-    iw-=(ip-1)*ido;
-    SWAP (p1,p2,double *);
-    if(ip==4)
-      radf4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      radf2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      radf3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      radf5(ido, l1, p1, p2, wa+iw);
-    else
-      {
-      if (ido==1)
-        SWAP (p1,p2,double *);
-      radfg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
-      SWAP (p1,p2,double *);
-      }
-    }
-  if (p1==c)
-    memcpy (c,ch,n*sizeof(double));
-  }
-
-static void rfftb1(size_t n, double c[], double ch[], const double wa[],
-  const size_t ifac[])
-  {
-  size_t k1, l1=1, nf=ifac[1], iw=0;
-  double *p1=c, *p2=ch;
-
-  for(k1=1; k1<=nf; k1++)
-    {
-    size_t ip = ifac[k1+1],
-           ido= n/(ip*l1);
-    if(ip==4)
-      radb4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      radb2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      radb3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      radb5(ido, l1, p1, p2, wa+iw);
-    else
-      {
-      radbg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
-      if (ido!=1)
-        SWAP (p1,p2,double *);
-      }
-    SWAP (p1,p2,double *);
-    l1*=ip;
-    iw+=(ip-1)*ido;
-    }
-  if (p1!=c)
-    memcpy (c,ch,n*sizeof(double));
-  }
-
-void rfftf(size_t n, double r[], double wsave[])
-  { if(n!=1) rfftf1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); }
-
-void rfftb(size_t n, double r[], double wsave[])
-  { if(n!=1) rfftb1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); }
-
-static void rffti1(size_t n, double wa[], size_t ifac[])
-  {
-  static const size_t ntryh[4]={4,2,3,5};
-  static const double twopi=6.28318530717958647692;
-  size_t i, j, k, fi;
-
-  double argh=twopi/n;
-  size_t is=0, l1=1;
-  factorize (n,ntryh,4,ifac);
-  for (k=1; k<ifac[1]; k++)
-    {
-    size_t ip=ifac[k+1],
-           ido=n/(l1*ip);
-    for (j=1; j<ip; ++j)
-      {
-      double argld=j*l1*argh;
-      for(i=is,fi=1; i<=ido+is-3; i+=2,++fi)
-        {
-        double arg=fi*argld;
-        wa[i  ]=cos(arg);
-        wa[i+1]=sin(arg);
-        }
-      is+=ido;
-      }
-    l1*=ip;
-    }
-  }
-
-void rffti(size_t n, double wsave[])
-  { if (n!=1) rffti1(n, wsave+n,(size_t*)(wsave+2*n)); }
diff --git a/libfftpack/fftpack.h b/libfftpack/fftpack.h
deleted file mode 100644
index 6a2e96e..0000000
--- a/libfftpack/fftpack.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.h : function declarations for fftpack.c
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  Pekka Janhunen 23.2.1995
-
-  (reformatted by joerg arndt)
-
-  reformatted and slightly enhanced by Martin Reinecke (2004)
- */
-
-#ifndef PLANCK_FFTPACK_H
-#define PLANCK_FFTPACK_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! forward complex transform */
-void cfftf(size_t N, double complex_data[], double wrk[]);
-/*! backward complex transform */
-void cfftb(size_t N, double complex_data[], double wrk[]);
-/*! initializer for complex transforms */
-void cffti(size_t N, double wrk[]);
-
-/*! forward real transform */
-void rfftf(size_t N, double data[], double wrk[]);
-/*! backward real transform */
-void rfftb(size_t N, double data[], double wrk[]);
-/*! initializer for real transforms */
-void rffti(size_t N, double wrk[]);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libfftpack/fftpack_inc.c b/libfftpack/fftpack_inc.c
deleted file mode 100644
index 55d0ac5..0000000
--- a/libfftpack/fftpack_inc.c
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.c : A set of FFT routines in C.
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  C port by Martin Reinecke (2010)
- */
-
-#ifdef BACKWARD
-#define PSIGN +
-#define PMSIGNC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-/* a = b*c */
-#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
-#else
-#define PSIGN -
-#define PMSIGNC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; }
-/* a = conj(b)*c */
-#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
-#endif
-
-static void X(2) (size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=2;
-  size_t k,i;
-  cmplx t;
-  if (ido==1)
-    for (k=0;k<l1;++k)
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-  else
-    for (k=0;k<l1;++k)
-      for (i=0;i<ido;++i)
-        {
-        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
-        MULPMSIGNC (CH(i,k,1),WA(0,i),t)
-        }
-  }
-
-static void X(3)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui= PSIGN 0.86602540378443864676;
-  size_t i, k;
-  cmplx c2, c3, d2, d3, t2;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC (t2,c3,CC(0,1,k),CC(0,2,k))
-      ADDC (CH(0,k,0),t2,CC(0,0,k))
-      SCALEC(t2,taur)
-      ADDC(c2,CC(0,0,k),t2)
-      SCALEC(c3,taui)
-      CONJFLIPC(c3)
-      PMC(CH(0,k,1),CH(0,k,2),c2,c3)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC (t2,c3,CC(i,1,k),CC(i,2,k))
-        ADDC (CH(i,k,0),t2,CC(i,0,k))
-        SCALEC(t2,taur)
-        ADDC(c2,CC(i,0,k),t2)
-        SCALEC(c3,taui)
-        CONJFLIPC(c3)
-        PMC(d2,d3,c2,c3)
-        MULPMSIGNC(CH(i,k,1),WA(0,i),d2)
-        MULPMSIGNC(CH(i,k,2),WA(1,i),d3)
-        }
-  }
-
-static void X(4)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=4;
-  size_t i, k;
-  cmplx c2, c3, c4, t1, t2, t3, t4;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      CONJFLIPC(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMSIGNC (CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC(t2,t1,CC(i,0,k),CC(i,2,k))
-        PMC(t3,t4,CC(i,1,k),CC(i,3,k))
-        CONJFLIPC(t4)
-        PMC(CH(i,k,0),c3,t2,t3)
-        PMSIGNC (c2,c4,t1,t4)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),c2)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),c3)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),c4)
-        }
-  }
-
-static void X(5)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241,
-                      ti11= PSIGN 0.95105651629515357212,
-                      tr12=-0.8090169943749474241,
-                      ti12= PSIGN 0.58778525229247312917;
-  size_t i, k;
-  cmplx c2, c3, c4, c5, d2, d3, d4, d5, t2, t3, t4, t5;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC (t2,t5,CC(0,1,k),CC(0,4,k))
-      PMC (t3,t4,CC(0,2,k),CC(0,3,k))
-      CH(0,k,0).r=CC(0,0,k).r+t2.r+t3.r;
-      CH(0,k,0).i=CC(0,0,k).i+t2.i+t3.i;
-      c2.r=CC(0,0,k).r+tr11*t2.r+tr12*t3.r;
-      c2.i=CC(0,0,k).i+tr11*t2.i+tr12*t3.i;
-      c3.r=CC(0,0,k).r+tr12*t2.r+tr11*t3.r;
-      c3.i=CC(0,0,k).i+tr12*t2.i+tr11*t3.i;
-      c5.r=ti11*t5.r+ti12*t4.r;
-      c5.i=ti11*t5.i+ti12*t4.i;
-      c4.r=ti12*t5.r-ti11*t4.r;
-      c4.i=ti12*t5.i-ti11*t4.i;
-      CONJFLIPC(c5)
-      PMC(CH(0,k,1),CH(0,k,4),c2,c5)
-      CONJFLIPC(c4)
-      PMC(CH(0,k,2),CH(0,k,3),c3,c4)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC (t2,t5,CC(i,1,k),CC(i,4,k))
-        PMC (t3,t4,CC(i,2,k),CC(i,3,k))
-        CH(i,k,0).r=CC(i,0,k).r+t2.r+t3.r;
-        CH(i,k,0).i=CC(i,0,k).i+t2.i+t3.i;
-        c2.r=CC(i,0,k).r+tr11*t2.r+tr12*t3.r;
-        c2.i=CC(i,0,k).i+tr11*t2.i+tr12*t3.i;
-        c3.r=CC(i,0,k).r+tr12*t2.r+tr11*t3.r;
-        c3.i=CC(i,0,k).i+tr12*t2.i+tr11*t3.i;
-        c5.r=ti11*t5.r+ti12*t4.r;
-        c5.i=ti11*t5.i+ti12*t4.i;
-        c4.r=ti12*t5.r-ti11*t4.r;
-        c4.i=ti12*t5.i-ti11*t4.i;
-        CONJFLIPC(c5)
-        PMC(d2,d5,c2,c5)
-        CONJFLIPC(c4)
-        PMC(d3,d4,c3,c4)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),d2)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),d3)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),d4)
-        MULPMSIGNC (CH(i,k,4),WA(3,i),d5)
-        }
-  }
-
-static void X(6)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=6;
-  static const double taui= PSIGN 0.86602540378443864676;
-  cmplx ta1,ta2,ta3,a0,a1,a2,tb1,tb2,tb3,b0,b1,b2,d1,d2,d3,d4,d5;
-  size_t i, k;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC(ta1,ta3,CC(0,2,k),CC(0,4,k))
-      ta2.r = CC(0,0,k).r - .5*ta1.r;
-      ta2.i = CC(0,0,k).i - .5*ta1.i;
-      SCALEC(ta3,taui)
-      ADDC(a0,CC(0,0,k),ta1)
-      CONJFLIPC(ta3)
-      PMC(a1,a2,ta2,ta3)
-      PMC(tb1,tb3,CC(0,5,k),CC(0,1,k))
-      tb2.r = CC(0,3,k).r - .5*tb1.r;
-      tb2.i = CC(0,3,k).i - .5*tb1.i;
-      SCALEC(tb3,taui)
-      ADDC(b0,CC(0,3,k),tb1)
-      CONJFLIPC(tb3)
-      PMC(b1,b2,tb2,tb3)
-      PMC(CH(0,k,0),CH(0,k,3),a0,b0)
-      PMC(CH(0,k,4),CH(0,k,1),a1,b1)
-      PMC(CH(0,k,2),CH(0,k,5),a2,b2)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC(ta1,ta3,CC(i,2,k),CC(i,4,k))
-        ta2.r = CC(i,0,k).r - .5*ta1.r;
-        ta2.i = CC(i,0,k).i - .5*ta1.i;
-        SCALEC(ta3,taui)
-        ADDC(a0,CC(i,0,k),ta1)
-        CONJFLIPC(ta3)
-        PMC(a1,a2,ta2,ta3)
-        PMC(tb1,tb3,CC(i,5,k),CC(i,1,k))
-        tb2.r = CC(i,3,k).r - .5*tb1.r;
-        tb2.i = CC(i,3,k).i - .5*tb1.i;
-        SCALEC(tb3,taui)
-        ADDC(b0,CC(i,3,k),tb1)
-        CONJFLIPC(tb3)
-        PMC(b1,b2,tb2,tb3)
-        PMC(CH(i,k,0),d3,a0,b0)
-        PMC(d4,d1,a1,b1)
-        PMC(d2,d5,a2,b2)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),d1)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),d2)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),d3)
-        MULPMSIGNC (CH(i,k,4),WA(3,i),d4)
-        MULPMSIGNC (CH(i,k,5),WA(4,i),d5)
-        }
-  }
-
-static void X(g)(size_t ido, size_t ip, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=ip;
-  cmplx *tarr=RALLOC(cmplx,2*ip);
-  cmplx *ccl=tarr, *wal=tarr+ip;
-  size_t i,j,k,l,jc,lc;
-  size_t ipph = (ip+1)/2;
-
-  for (i=1; i<ip; ++i)
-    wal[i]=wa[ido*(i-1)];
-  for (k=0; k<l1; ++k)
-    for (i=0; i<ido; ++i)
-      {
-      cmplx s=CC(i,0,k);
-      ccl[0] = CC(i,0,k);
-      for(j=1,jc=ip-1; j<ipph; ++j,--jc)
-        {
-        PMC (ccl[j],ccl[jc],CC(i,j,k),CC(i,jc,k))
-        ADDC (s,s,ccl[j])
-        }
-      CH(i,k,0) = s;
-      for (j=1, jc=ip-1; j<=ipph; ++j,--jc)
-        {
-        cmplx abr=ccl[0], abi={0.,0.};
-        size_t iang=0;
-        for (l=1,lc=ip-1; l<ipph; ++l,--lc)
-          {
-          iang+=j;
-          if (iang>ip) iang-=ip;
-          abr.r += ccl[l ].r*wal[iang].r;
-          abr.i += ccl[l ].i*wal[iang].r;
-          abi.r += ccl[lc].r*wal[iang].i;
-          abi.i += ccl[lc].i*wal[iang].i;
-          }
-#ifndef BACKWARD
-          { abi.i=-abi.i; abi.r=-abi.r; }
-#endif
-        CONJFLIPC(abi)
-        PMC(CH(i,k,j),CH(i,k,jc),abr,abi)
-        }
-      }
-
-  DEALLOC(tarr);
-
-  if (ido==1) return;
-
-  for (j=1; j<ip; ++j)
-    for (k=0; k<l1; ++k)
-      {
-      size_t idij=(j-1)*ido+1;
-      for(i=1; i<ido; ++i, ++idij)
-        {
-        cmplx t=CH(i,k,j);
-        MULPMSIGNC (CH(i,k,j),wa[idij],t)
-        }
-      }
-  }
-
-#undef PSIGN
-#undef PMSIGNC
-#undef MULPMSIGNC
diff --git a/libfftpack/libfftpack.dox b/libfftpack/libfftpack.dox
deleted file mode 100644
index 9ed2362..0000000
--- a/libfftpack/libfftpack.dox
+++ /dev/null
@@ -1,5 +0,0 @@
-/*! \mainpage Libfftpack documentation
-  <ul>
-  <li>\ref fftgroup "Programming interface"
-  </ul>
- */
diff --git a/libfftpack/ls_fft.c b/libfftpack/ls_fft.c
deleted file mode 100644
index b1c0c96..0000000
--- a/libfftpack/ls_fft.c
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include "bluestein.h"
-#include "fftpack.h"
-#include "ls_fft.h"
-
-complex_plan make_complex_plan (size_t length)
-  {
-  complex_plan plan = RALLOC(complex_plan_i,1);
-  size_t pfsum = prime_factor_sum(length);
-  double comp1 = (double)(length*pfsum);
-  double comp2 = 2*3*length*log(3.*length);
-  comp2*=3.; /* fudge factor that appears to give good overall performance */
-  plan->length=length;
-  plan->bluestein = (comp2<comp1);
-  if (plan->bluestein)
-    bluestein_i (length,&(plan->work),&(plan->worksize));
-  else
-    {
-    plan->worksize=4*length+15;
-    plan->work=RALLOC(double,4*length+15);
-    cffti(length, plan->work);
-    }
-  return plan;
-  }
-
-complex_plan copy_complex_plan (complex_plan plan)
-  {
-  if (!plan) return NULL;
-  {
-  complex_plan newplan = RALLOC(complex_plan_i,1);
-  *newplan = *plan;
-  newplan->work=RALLOC(double,newplan->worksize);
-  memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize);
-  return newplan;
-  }
-  }
-
-void kill_complex_plan (complex_plan plan)
-  {
-  DEALLOC(plan->work);
-  DEALLOC(plan);
-  }
-
-void complex_plan_forward (complex_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    bluestein (plan->length, data, plan->work, -1);
-  else
-    cfftf (plan->length, data, plan->work);
-  }
-
-void complex_plan_backward (complex_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    bluestein (plan->length, data, plan->work, 1);
-  else
-    cfftb (plan->length, data, plan->work);
-  }
-
-
-real_plan make_real_plan (size_t length)
-  {
-  real_plan plan = RALLOC(real_plan_i,1);
-  size_t pfsum = prime_factor_sum(length);
-  double comp1 = .5*length*pfsum;
-  double comp2 = 2*3*length*log(3.*length);
-  comp2*=3; /* fudge factor that appears to give good overall performance */
-  plan->length=length;
-  plan->bluestein = (comp2<comp1);
-  if (plan->bluestein)
-    bluestein_i (length,&(plan->work),&(plan->worksize));
-  else
-    {
-    plan->worksize=2*length+15;
-    plan->work=RALLOC(double,2*length+15);
-    rffti(length, plan->work);
-    }
-  return plan;
-  }
-
-real_plan copy_real_plan (real_plan plan)
-  {
-  if (!plan) return NULL;
-  {
-  real_plan newplan = RALLOC(real_plan_i,1);
-  *newplan = *plan;
-  newplan->work=RALLOC(double,newplan->worksize);
-  memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize);
-  return newplan;
-  }
-  }
-
-void kill_real_plan (real_plan plan)
-  {
-  DEALLOC(plan->work);
-  DEALLOC(plan);
-  }
-
-void real_plan_forward_fftpack (real_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    {
-    size_t m;
-    size_t n=plan->length;
-    double *tmp = RALLOC(double,2*n);
-    for (m=0; m<n; ++m)
-      {
-      tmp[2*m] = data[m];
-      tmp[2*m+1] = 0.;
-      }
-    bluestein(n,tmp,plan->work,-1);
-    data[0] = tmp[0];
-    memcpy (data+1, tmp+2, (n-1)*sizeof(double));
-    DEALLOC(tmp);
-    }
-  else
-    rfftf (plan->length, data, plan->work);
-  }
-
-static void fftpack2halfcomplex (double *data, size_t n)
-  {
-  size_t m;
-  double *tmp = RALLOC(double,n);
-  tmp[0]=data[0];
-  for (m=1; m<(n+1)/2; ++m)
-    {
-    tmp[m]=data[2*m-1];
-    tmp[n-m]=data[2*m];
-    }
-  if (!(n&1))
-    tmp[n/2]=data[n-1];
-  memcpy (data,tmp,n*sizeof(double));
-  DEALLOC(tmp);
-  }
-
-static void halfcomplex2fftpack (double *data, size_t n)
-  {
-  size_t m;
-  double *tmp = RALLOC(double,n);
-  tmp[0]=data[0];
-  for (m=1; m<(n+1)/2; ++m)
-    {
-    tmp[2*m-1]=data[m];
-    tmp[2*m]=data[n-m];
-    }
-  if (!(n&1))
-    tmp[n-1]=data[n/2];
-  memcpy (data,tmp,n*sizeof(double));
-  DEALLOC(tmp);
-  }
-
-void real_plan_forward_fftw (real_plan plan, double *data)
-  {
-  real_plan_forward_fftpack (plan, data);
-  fftpack2halfcomplex (data,plan->length);
-  }
-
-void real_plan_backward_fftpack (real_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    {
-    size_t m;
-    size_t n=plan->length;
-    double *tmp = RALLOC(double,2*n);
-    tmp[0]=data[0];
-    tmp[1]=0.;
-    memcpy (tmp+2,data+1, (n-1)*sizeof(double));
-    if ((n&1)==0) tmp[n+1]=0.;
-    for (m=2; m<n; m+=2)
-      {
-      tmp[2*n-m]=tmp[m];
-      tmp[2*n-m+1]=-tmp[m+1];
-      }
-    bluestein (n, tmp, plan->work, 1);
-    for (m=0; m<n; ++m)
-      data[m] = tmp[2*m];
-    DEALLOC(tmp);
-    }
-  else
-    rfftb (plan->length, data, plan->work);
-  }
-
-void real_plan_backward_fftw (real_plan plan, double *data)
-  {
-  halfcomplex2fftpack (data,plan->length);
-  real_plan_backward_fftpack (plan, data);
-  }
-
-void real_plan_forward_c (real_plan plan, double *data)
-  {
-  size_t m;
-  size_t n=plan->length;
-
-  if (plan->bluestein)
-    {
-    for (m=1; m<2*n; m+=2)
-      data[m]=0;
-    bluestein (plan->length, data, plan->work, -1);
-    data[1]=0;
-    for (m=2; m<n; m+=2)
-      {
-      double avg;
-      avg = 0.5*(data[2*n-m]+data[m]);
-      data[2*n-m] = data[m] = avg;
-      avg = 0.5*(data[2*n-m+1]-data[m+1]);
-      data[2*n-m+1] = avg;
-      data[m+1] = -avg;
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    }
-  else
-    {
-/* using "m+m" instead of "2*m" to avoid a nasty bug in Intel's compiler */
-    for (m=0; m<n; ++m) data[m+1] = data[m+m];
-    rfftf (n, data+1, plan->work);
-    data[0] = data[1];
-    data[1] = 0;
-    for (m=2; m<n; m+=2)
-      {
-      data[2*n-m]   =  data[m];
-      data[2*n-m+1] = -data[m+1];
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    }
-  }
-
-void real_plan_backward_c (real_plan plan, double *data)
-  {
-  size_t n=plan->length;
-
-  if (plan->bluestein)
-    {
-    size_t m;
-    data[1]=0;
-    for (m=2; m<n; m+=2)
-      {
-      double avg;
-      avg = 0.5*(data[2*n-m]+data[m]);
-      data[2*n-m] = data[m] = avg;
-      avg = 0.5*(data[2*n-m+1]-data[m+1]);
-      data[2*n-m+1] = avg;
-      data[m+1] = -avg;
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    bluestein (plan->length, data, plan->work, 1);
-    for (m=1; m<2*n; m+=2)
-      data[m]=0;
-    }
-  else
-    {
-    ptrdiff_t m;
-    data[1] = data[0];
-    rfftb (n, data+1, plan->work);
-    for (m=n-1; m>=0; --m)
-      {
-      data[2*m]   = data[m+1];
-      data[2*m+1] = 0.;
-      }
-    }
-  }
diff --git a/libfftpack/ls_fft.h b/libfftpack/ls_fft.h
deleted file mode 100644
index 2d555eb..0000000
--- a/libfftpack/ls_fft.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file ls_fft.h
- *  Interface for the LevelS FFT package.
- *
- *  Copyright (C) 2004 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_LS_FFT_H
-#define PLANCK_LS_FFT_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!\defgroup fftgroup FFT interface
-This package is intended to calculate one-dimensional real or complex FFTs
-with high accuracy and good efficiency even for lengths containing large
-prime factors.
-The code is written in C, but a Fortran wrapper exists as well.
-
-Before any FFT is executed, a plan must be generated for it. Plan creation
-is designed to be fast, so that there is no significant overhead if the
-plan is only used once or a few times.
-
-The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
-double precision incarnation by Hugh C. Pumphrey
-(http://www.netlib.org/fftpack/dp.tgz).
-
-I replaced the iterative sine and cosine calculations in radfg() and radbg()
-by an exact calculation, which slightly improves the transform accuracy for
-real FFTs with lengths containing large prime factors.
-
-Since FFTPACK becomes quite slow for FFT lengths with large prime factors
-(in the worst case of prime lengths it reaches \f$\mathcal{O}(n^2)\f$
-complexity), I implemented Bluestein's algorithm, which computes a FFT of length
-\f$n\f$ by several FFTs of length \f$n_2\ge 2n-1\f$ and a convolution. Since
-\f$n_2\f$ can be chosen to be highly composite, this algorithm is more efficient
-if \f$n\f$ has large prime factors. The longer FFTs themselves are then computed
-using the FFTPACK routines.
-Bluestein's algorithm was implemented according to the description on Wikipedia
-(<a href="http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm">
-http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm</a>).
-
-\b Thread-safety:
-All routines can be called concurrently; all information needed by
-<tt>ls_fft</tt> is stored in the plan variable. However, using the same plan
-variable on multiple threads simultaneously is not supported and will lead to
-data corruption.
-*/
-/*! \{ */
-
-typedef struct
-  {
-  double *work;
-  size_t length, worksize;
-  int bluestein;
-  } complex_plan_i;
-
-/*! The opaque handle type for complex-FFT plans. */
-typedef complex_plan_i * complex_plan;
-
-/*! Returns a plan for a complex FFT with \a length elements. */
-complex_plan make_complex_plan (size_t length);
-/*! Constructs a copy of \a plan. */
-complex_plan copy_complex_plan (complex_plan plan);
-/*! Destroys a plan for a complex FFT. */
-void kill_complex_plan (complex_plan plan);
-/*! Computes a complex forward FFT on \a data, using \a plan.
-    \a Data has the form <tt>r0, i0, r1, i1, ...,
-    r[length-1], i[length-1]</tt>. */
-void complex_plan_forward (complex_plan plan, double *data);
-/*! Computes a complex backward FFT on \a data, using \a plan.
-    \a Data has the form <tt>r0, i0, r1, i1, ...,
-    r[length-1], i[length-1]</tt>. */
-void complex_plan_backward (complex_plan plan, double *data);
-
-typedef struct
-  {
-  double *work;
-  size_t length, worksize;
-  int bluestein;
-  } real_plan_i;
-
-/*! The opaque handle type for real-FFT plans. */
-typedef real_plan_i * real_plan;
-
-/*! Returns a plan for a real FFT with \a length elements. */
-real_plan make_real_plan (size_t length);
-/*! Constructs a copy of \a plan. */
-real_plan copy_real_plan (real_plan plan);
-/*! Destroys a plan for a real FFT. */
-void kill_real_plan (real_plan plan);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
-    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt>
-      (a total of \a length values). */
-void real_plan_forward_fftpack (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, i1, r2, i2, ...</tt>
-    (a total of \a length values);
-    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
-void real_plan_backward_fftpack (real_plan plan, double *data);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming the FFTW halfcomplex storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
-    - on exit, it has the form <tt>r0, r1, r2, ..., i2, i1</tt>. */
-void real_plan_forward_fftw (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming the FFTW halfcomplex storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, r2, ..., i2, i1</tt>.
-    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
-void real_plan_backward_fftw (real_plan plan, double *data);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming a full-complex storage scheme:
-    - on entry, \a data has the form <tt>r0, [ignored], r1, [ignored], ...,
-      r[length-1], [ignored]</tt>;
-    - on exit, it has the form <tt>r0, i0, r1, i1, ...,
-      r[length-1], i[length-1]</tt>. */
-void real_plan_forward_c (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming a full-complex storage scheme:
-    - on entry, \a data has the form <tt>r0, i0, r1, i1, ...,
-      r[length-1], i[length-1]</tt>;
-    - on exit, it has the form <tt>r0, 0, r1, 0, ..., r[length-1], 0</tt>. */
-void real_plan_backward_c (real_plan plan, double *data);
-
-/*! \} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libfftpack/planck.make b/libfftpack/planck.make
deleted file mode 100644
index c171367..0000000
--- a/libfftpack/planck.make
+++ /dev/null
@@ -1,21 +0,0 @@
-PKG:=libfftpack
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libfftpack.a
-OBJ:=fftpack.o bluestein.o ls_fft.o
-OBJ:=$(OBJ:%=$(OD)/%)
-
-ODEP:=$(HDR_$(PKG)) $(HDR_c_utils)
-
-$(OD)/fftpack.o: $(SD)/fftpack_inc.c
-
-$(OBJ): $(ODEP) | $(OD)_mkdir
-$(LIB_$(PKG)): $(OBJ)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
diff --git a/libsharp/planck.make b/libsharp/planck.make
deleted file mode 100644
index 76d534f..0000000
--- a/libsharp/planck.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PKG:=libsharp
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libsharp.a
-BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o sharp_legendre_table.o
-ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
-LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
-ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
-
-ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
-$(OD)/sharp_core.o: $(SD)/sharp_core_inchelper.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c
-$(OD)/sharp.o: $(SD)/sharp_mpi.c
-BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
-
-$(LIB_$(PKG)): $(LIBOBJ)
-
-$(ALLOBJ): $(ODEP) | $(OD)_mkdir
-BIN:=$(BIN:%=$(BINDIR)/%)
-$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
-all_cbin+=$(BIN)
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 1eb8857..b1b9277 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -30,7 +30,7 @@
  */
 
 #include <math.h>
-#include "ls_fft.h"
+#include "pocketfft/pocketfft.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
@@ -82,7 +82,7 @@ typedef struct
   double phi0_;
   dcmplx *shiftarr;
   int s_shift;
-  real_plan plan;
+  rfft_plan plan;
   int norot;
   } ringhelper;
 
@@ -94,7 +94,7 @@ static void ringhelper_init (ringhelper *self)
 
 static void ringhelper_destroy (ringhelper *self)
   {
-  if (self->plan) kill_real_plan(self->plan);
+  if (self->plan) destroy_rfft_plan(self->plan);
   DEALLOC(self->shiftarr);
   ringhelper_init(self);
   }
@@ -111,11 +111,11 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
       for (int m=0; m<=mmax; ++m)
         self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
       }
-  if (!self->plan) self->plan=make_real_plan(nph);
-  if (nph!=(int)self->plan->length)
+  if (!self->plan) self->plan=make_rfft_plan(nph);
+  if (nph!=(int)rfft_length(self->plan))
     {
-    kill_real_plan(self->plan);
-    self->plan=make_real_plan(nph);
+    destroy_rfft_plan(self->plan);
+    self->plan=make_rfft_plan(nph);
     }
   }
 
@@ -323,7 +323,7 @@ static void ringhelper_phase2ring (ringhelper *self,
       }
     }
   data[1]=data[0];
-  real_plan_backward_fftpack (self->plan, &(data[1]));
+  rfft_backward (self->plan, &(data[1]), 1.);
   }
 
 static void ringhelper_ring2phase (ringhelper *self,
@@ -342,7 +342,7 @@ static void ringhelper_ring2phase (ringhelper *self,
   if (flags&SHARP_REAL_HARMONICS)
     wgt *= sqrt_two;
 
-  real_plan_forward_fftpack (self->plan, &(data[1]));
+  rfft_forward (self->plan, &(data[1]), 1.);
   data[0]=data[1];
   data[1]=data[nph+1]=0.;
 
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 6722aee..9c5dd57 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -39,8 +39,5 @@
 #include <complex.h>
 
 #include "sharp_lowlevel.h"
-#include "sharp_legendre.h"
-#include "sharp_legendre_roots.h"
-#include "sharp_legendre_table.h"
 
 #endif
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index dbb44e0..0aed60d 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -34,7 +34,7 @@
 #include "sharp_geomhelpers.h"
 #include "sharp_legendre_roots.h"
 #include "c_utils.h"
-#include "ls_fft.h"
+#include "pocketfft/pocketfft.h"
 #include <stdio.h>
 
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
@@ -161,9 +161,9 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
     weight[2*k  ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
     }
   if ((nrings&1)==0) weight[nrings-1]=0.;
-  real_plan plan = make_real_plan(nrings);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(nrings);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);
 
   for (int m=0; m<(nrings+1)/2; ++m)
     {
@@ -208,9 +208,9 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
   for (int k=1; k<=(n/2-1); ++k)
     weight[2*k-1]=2./(1.-4.*k*k) + dw;
   weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
-  real_plan plan = make_real_plan(n);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(n);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);
   weight[n]=weight[0];
 
   for (int m=0; m<(nrings+1)/2; ++m)
@@ -256,9 +256,9 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
   for (int k=1; k<=(n/2-1); ++k)
     weight[2*k-1]=2./(1.-4.*k*k);
   weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
-  real_plan plan = make_real_plan(n);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(n);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);
   for (int m=0; m<nrings; ++m)
     weight[m]=weight[m+1];
 
diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c
deleted file mode 100644
index 24d69b6..0000000
--- a/libsharp/sharp_legendre.c
+++ /dev/null
@@ -1,1319 +0,0 @@
-/* DO NOT EDIT. md5sum of source: a8c5c18a7a19c378187dbf461d12eb5c *//*
-
-    NOTE NOTE NOTE
-
-    This file is edited in sharp_legendre.c.in which is then preprocessed.
-    Do not make manual  modifications to sharp_legendre.c.
-
-    NOTE NOTE NOTE
-
-*/
-
-
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre.c.in
- *
- *  Copyright (C) 2015 University of Oslo
- *  \author Dag Sverre Seljebotn
- */
-
-#ifndef NO_LEGENDRE
-#if (VLEN==8)
-#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
-/* ...or test it (it probably works) and remove this check */
-#endif
-
-#ifndef SHARP_LEGENDRE_CS
-#define SHARP_LEGENDRE_CS 4
-#endif
-
-#define MAX_CS 6
-#if (SHARP_LEGENDRE_CS > MAX_CS)
-#error (SHARP_LEGENDRE_CS > MAX_CS)
-#endif
-
-#include "sharp_legendre.h"
-#include "sharp_vecsupport.h"
-
-#include <stdlib.h>
-
-
-
-static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(1) * VLEN],
-                                              double out[(1) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-}
-
-static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(2) * VLEN],
-                                              double out[(2) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    x1 = vloadu(xarr + 1 * VLEN);
-    Pm1_1 = vload(1.0);
-    P_1 = x1;
-    b = vload(*bl);
-    y1 = vmul(Pm1_1, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-    vfmaeq(y1, P_1, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq(P_1, W2, R);
-        vfmaeq(y1, P_1, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-    vstoreu(out + 1 * VLEN, y1);
-    
-}
-
-static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(3) * VLEN],
-                                              double out[(3) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    x1 = vloadu(xarr + 1 * VLEN);
-    Pm1_1 = vload(1.0);
-    P_1 = x1;
-    b = vload(*bl);
-    y1 = vmul(Pm1_1, b);
-    
-    x2 = vloadu(xarr + 2 * VLEN);
-    Pm1_2 = vload(1.0);
-    P_2 = x2;
-    b = vload(*bl);
-    y2 = vmul(Pm1_2, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-    vfmaeq(y1, P_1, b);
-    
-    vfmaeq(y2, P_2, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq(P_1, W2, R);
-        vfmaeq(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq(P_2, W2, R);
-        vfmaeq(y2, P_2, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-    vstoreu(out + 1 * VLEN, y1);
-    
-    vstoreu(out + 2 * VLEN, y2);
-    
-}
-
-static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(4) * VLEN],
-                                              double out[(4) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    x1 = vloadu(xarr + 1 * VLEN);
-    Pm1_1 = vload(1.0);
-    P_1 = x1;
-    b = vload(*bl);
-    y1 = vmul(Pm1_1, b);
-    
-    x2 = vloadu(xarr + 2 * VLEN);
-    Pm1_2 = vload(1.0);
-    P_2 = x2;
-    b = vload(*bl);
-    y2 = vmul(Pm1_2, b);
-    
-    x3 = vloadu(xarr + 3 * VLEN);
-    Pm1_3 = vload(1.0);
-    P_3 = x3;
-    b = vload(*bl);
-    y3 = vmul(Pm1_3, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-    vfmaeq(y1, P_1, b);
-    
-    vfmaeq(y2, P_2, b);
-    
-    vfmaeq(y3, P_3, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq(P_1, W2, R);
-        vfmaeq(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq(P_2, W2, R);
-        vfmaeq(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq(P_3, W2, R);
-        vfmaeq(y3, P_3, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-    vstoreu(out + 1 * VLEN, y1);
-    
-    vstoreu(out + 2 * VLEN, y2);
-    
-    vstoreu(out + 3 * VLEN, y3);
-    
-}
-
-static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(5) * VLEN],
-                                              double out[(5) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv P_4, Pm1_4, Pm2_4, x4, y4;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    x1 = vloadu(xarr + 1 * VLEN);
-    Pm1_1 = vload(1.0);
-    P_1 = x1;
-    b = vload(*bl);
-    y1 = vmul(Pm1_1, b);
-    
-    x2 = vloadu(xarr + 2 * VLEN);
-    Pm1_2 = vload(1.0);
-    P_2 = x2;
-    b = vload(*bl);
-    y2 = vmul(Pm1_2, b);
-    
-    x3 = vloadu(xarr + 3 * VLEN);
-    Pm1_3 = vload(1.0);
-    P_3 = x3;
-    b = vload(*bl);
-    y3 = vmul(Pm1_3, b);
-    
-    x4 = vloadu(xarr + 4 * VLEN);
-    Pm1_4 = vload(1.0);
-    P_4 = x4;
-    b = vload(*bl);
-    y4 = vmul(Pm1_4, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-    vfmaeq(y1, P_1, b);
-    
-    vfmaeq(y2, P_2, b);
-    
-    vfmaeq(y3, P_3, b);
-    
-    vfmaeq(y4, P_4, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq(P_1, W2, R);
-        vfmaeq(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq(P_2, W2, R);
-        vfmaeq(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq(P_3, W2, R);
-        vfmaeq(y3, P_3, b);
-        
-        Pm2_4 = Pm1_4; Pm1_4 = P_4;
-        W1 = vmul(x4, Pm1_4);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_4);
-        P_4 = W1;
-        vfmaeq(P_4, W2, R);
-        vfmaeq(y4, P_4, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-    vstoreu(out + 1 * VLEN, y1);
-    
-    vstoreu(out + 2 * VLEN, y2);
-    
-    vstoreu(out + 3 * VLEN, y3);
-    
-    vstoreu(out + 4 * VLEN, y4);
-    
-}
-
-static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
-                                              double xarr[(6) * VLEN],
-                                              double out[(6) * VLEN]) {
-    
-    Tv P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv P_4, Pm1_4, Pm2_4, x4, y4;
-    
-    Tv P_5, Pm1_5, Pm2_5, x5, y5;
-    
-    Tv W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu(xarr + 0 * VLEN);
-    Pm1_0 = vload(1.0);
-    P_0 = x0;
-    b = vload(*bl);
-    y0 = vmul(Pm1_0, b);
-    
-    x1 = vloadu(xarr + 1 * VLEN);
-    Pm1_1 = vload(1.0);
-    P_1 = x1;
-    b = vload(*bl);
-    y1 = vmul(Pm1_1, b);
-    
-    x2 = vloadu(xarr + 2 * VLEN);
-    Pm1_2 = vload(1.0);
-    P_2 = x2;
-    b = vload(*bl);
-    y2 = vmul(Pm1_2, b);
-    
-    x3 = vloadu(xarr + 3 * VLEN);
-    Pm1_3 = vload(1.0);
-    P_3 = x3;
-    b = vload(*bl);
-    y3 = vmul(Pm1_3, b);
-    
-    x4 = vloadu(xarr + 4 * VLEN);
-    Pm1_4 = vload(1.0);
-    P_4 = x4;
-    b = vload(*bl);
-    y4 = vmul(Pm1_4, b);
-    
-    x5 = vloadu(xarr + 5 * VLEN);
-    Pm1_5 = vload(1.0);
-    P_5 = x5;
-    b = vload(*bl);
-    y5 = vmul(Pm1_5, b);
-    
-    
-    b = vload(*(bl + 1));
-    
-    vfmaeq(y0, P_0, b);
-    
-    vfmaeq(y1, P_1, b);
-    
-    vfmaeq(y2, P_2, b);
-    
-    vfmaeq(y3, P_3, b);
-    
-    vfmaeq(y4, P_4, b);
-    
-    vfmaeq(y5, P_5, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload(*(bl + l));
-        R = vload(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq(P_0, W2, R);
-        vfmaeq(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq(P_1, W2, R);
-        vfmaeq(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq(P_2, W2, R);
-        vfmaeq(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq(P_3, W2, R);
-        vfmaeq(y3, P_3, b);
-        
-        Pm2_4 = Pm1_4; Pm1_4 = P_4;
-        W1 = vmul(x4, Pm1_4);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_4);
-        P_4 = W1;
-        vfmaeq(P_4, W2, R);
-        vfmaeq(y4, P_4, b);
-        
-        Pm2_5 = Pm1_5; Pm1_5 = P_5;
-        W1 = vmul(x5, Pm1_5);
-        W2 = W1;
-        W2 = vsub(W2, Pm2_5);
-        P_5 = W1;
-        vfmaeq(P_5, W2, R);
-        vfmaeq(y5, P_5, b);
-        
-
-    }
-    
-    vstoreu(out + 0 * VLEN, y0);
-    
-    vstoreu(out + 1 * VLEN, y1);
-    
-    vstoreu(out + 2 * VLEN, y2);
-    
-    vstoreu(out + 3 * VLEN, y3);
-    
-    vstoreu(out + 4 * VLEN, y4);
-    
-    vstoreu(out + 5 * VLEN, y5);
-    
-}
-
-
-
-static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(1) * VLEN_s],
-                                              float out[(1) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-}
-
-static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(2) * VLEN_s],
-                                              float out[(2) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    x1 = vloadu_s(xarr + 1 * VLEN_s);
-    Pm1_1 = vload_s(1.0);
-    P_1 = x1;
-    b = vload_s(*bl);
-    y1 = vmul_s(Pm1_1, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-    vfmaeq_s(y1, P_1, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul_s(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq_s(P_1, W2, R);
-        vfmaeq_s(y1, P_1, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-    vstoreu_s(out + 1 * VLEN_s, y1);
-    
-}
-
-static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(3) * VLEN_s],
-                                              float out[(3) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv_s P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    x1 = vloadu_s(xarr + 1 * VLEN_s);
-    Pm1_1 = vload_s(1.0);
-    P_1 = x1;
-    b = vload_s(*bl);
-    y1 = vmul_s(Pm1_1, b);
-    
-    x2 = vloadu_s(xarr + 2 * VLEN_s);
-    Pm1_2 = vload_s(1.0);
-    P_2 = x2;
-    b = vload_s(*bl);
-    y2 = vmul_s(Pm1_2, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-    vfmaeq_s(y1, P_1, b);
-    
-    vfmaeq_s(y2, P_2, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul_s(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq_s(P_1, W2, R);
-        vfmaeq_s(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul_s(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq_s(P_2, W2, R);
-        vfmaeq_s(y2, P_2, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-    vstoreu_s(out + 1 * VLEN_s, y1);
-    
-    vstoreu_s(out + 2 * VLEN_s, y2);
-    
-}
-
-static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(4) * VLEN_s],
-                                              float out[(4) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv_s P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv_s P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    x1 = vloadu_s(xarr + 1 * VLEN_s);
-    Pm1_1 = vload_s(1.0);
-    P_1 = x1;
-    b = vload_s(*bl);
-    y1 = vmul_s(Pm1_1, b);
-    
-    x2 = vloadu_s(xarr + 2 * VLEN_s);
-    Pm1_2 = vload_s(1.0);
-    P_2 = x2;
-    b = vload_s(*bl);
-    y2 = vmul_s(Pm1_2, b);
-    
-    x3 = vloadu_s(xarr + 3 * VLEN_s);
-    Pm1_3 = vload_s(1.0);
-    P_3 = x3;
-    b = vload_s(*bl);
-    y3 = vmul_s(Pm1_3, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-    vfmaeq_s(y1, P_1, b);
-    
-    vfmaeq_s(y2, P_2, b);
-    
-    vfmaeq_s(y3, P_3, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul_s(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq_s(P_1, W2, R);
-        vfmaeq_s(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul_s(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq_s(P_2, W2, R);
-        vfmaeq_s(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul_s(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq_s(P_3, W2, R);
-        vfmaeq_s(y3, P_3, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-    vstoreu_s(out + 1 * VLEN_s, y1);
-    
-    vstoreu_s(out + 2 * VLEN_s, y2);
-    
-    vstoreu_s(out + 3 * VLEN_s, y3);
-    
-}
-
-static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(5) * VLEN_s],
-                                              float out[(5) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv_s P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv_s P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv_s P_4, Pm1_4, Pm2_4, x4, y4;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    x1 = vloadu_s(xarr + 1 * VLEN_s);
-    Pm1_1 = vload_s(1.0);
-    P_1 = x1;
-    b = vload_s(*bl);
-    y1 = vmul_s(Pm1_1, b);
-    
-    x2 = vloadu_s(xarr + 2 * VLEN_s);
-    Pm1_2 = vload_s(1.0);
-    P_2 = x2;
-    b = vload_s(*bl);
-    y2 = vmul_s(Pm1_2, b);
-    
-    x3 = vloadu_s(xarr + 3 * VLEN_s);
-    Pm1_3 = vload_s(1.0);
-    P_3 = x3;
-    b = vload_s(*bl);
-    y3 = vmul_s(Pm1_3, b);
-    
-    x4 = vloadu_s(xarr + 4 * VLEN_s);
-    Pm1_4 = vload_s(1.0);
-    P_4 = x4;
-    b = vload_s(*bl);
-    y4 = vmul_s(Pm1_4, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-    vfmaeq_s(y1, P_1, b);
-    
-    vfmaeq_s(y2, P_2, b);
-    
-    vfmaeq_s(y3, P_3, b);
-    
-    vfmaeq_s(y4, P_4, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul_s(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq_s(P_1, W2, R);
-        vfmaeq_s(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul_s(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq_s(P_2, W2, R);
-        vfmaeq_s(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul_s(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq_s(P_3, W2, R);
-        vfmaeq_s(y3, P_3, b);
-        
-        Pm2_4 = Pm1_4; Pm1_4 = P_4;
-        W1 = vmul_s(x4, Pm1_4);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_4);
-        P_4 = W1;
-        vfmaeq_s(P_4, W2, R);
-        vfmaeq_s(y4, P_4, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-    vstoreu_s(out + 1 * VLEN_s, y1);
-    
-    vstoreu_s(out + 2 * VLEN_s, y2);
-    
-    vstoreu_s(out + 3 * VLEN_s, y3);
-    
-    vstoreu_s(out + 4 * VLEN_s, y4);
-    
-}
-
-static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
-                                              float xarr[(6) * VLEN_s],
-                                              float out[(6) * VLEN_s]) {
-    
-    Tv_s P_0, Pm1_0, Pm2_0, x0, y0;
-    
-    Tv_s P_1, Pm1_1, Pm2_1, x1, y1;
-    
-    Tv_s P_2, Pm1_2, Pm2_2, x2, y2;
-    
-    Tv_s P_3, Pm1_3, Pm2_3, x3, y3;
-    
-    Tv_s P_4, Pm1_4, Pm2_4, x4, y4;
-    
-    Tv_s P_5, Pm1_5, Pm2_5, x5, y5;
-    
-    Tv_s W1, W2, b, R;
-    ptrdiff_t l;
-
-    
-    x0 = vloadu_s(xarr + 0 * VLEN_s);
-    Pm1_0 = vload_s(1.0);
-    P_0 = x0;
-    b = vload_s(*bl);
-    y0 = vmul_s(Pm1_0, b);
-    
-    x1 = vloadu_s(xarr + 1 * VLEN_s);
-    Pm1_1 = vload_s(1.0);
-    P_1 = x1;
-    b = vload_s(*bl);
-    y1 = vmul_s(Pm1_1, b);
-    
-    x2 = vloadu_s(xarr + 2 * VLEN_s);
-    Pm1_2 = vload_s(1.0);
-    P_2 = x2;
-    b = vload_s(*bl);
-    y2 = vmul_s(Pm1_2, b);
-    
-    x3 = vloadu_s(xarr + 3 * VLEN_s);
-    Pm1_3 = vload_s(1.0);
-    P_3 = x3;
-    b = vload_s(*bl);
-    y3 = vmul_s(Pm1_3, b);
-    
-    x4 = vloadu_s(xarr + 4 * VLEN_s);
-    Pm1_4 = vload_s(1.0);
-    P_4 = x4;
-    b = vload_s(*bl);
-    y4 = vmul_s(Pm1_4, b);
-    
-    x5 = vloadu_s(xarr + 5 * VLEN_s);
-    Pm1_5 = vload_s(1.0);
-    P_5 = x5;
-    b = vload_s(*bl);
-    y5 = vmul_s(Pm1_5, b);
-    
-    
-    b = vload_s(*(bl + 1));
-    
-    vfmaeq_s(y0, P_0, b);
-    
-    vfmaeq_s(y1, P_1, b);
-    
-    vfmaeq_s(y2, P_2, b);
-    
-    vfmaeq_s(y3, P_3, b);
-    
-    vfmaeq_s(y4, P_4, b);
-    
-    vfmaeq_s(y5, P_5, b);
-    
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload_s(*(bl + l));
-        R = vload_s(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        
-        Pm2_0 = Pm1_0; Pm1_0 = P_0;
-        W1 = vmul_s(x0, Pm1_0);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_0);
-        P_0 = W1;
-        vfmaeq_s(P_0, W2, R);
-        vfmaeq_s(y0, P_0, b);
-        
-        Pm2_1 = Pm1_1; Pm1_1 = P_1;
-        W1 = vmul_s(x1, Pm1_1);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_1);
-        P_1 = W1;
-        vfmaeq_s(P_1, W2, R);
-        vfmaeq_s(y1, P_1, b);
-        
-        Pm2_2 = Pm1_2; Pm1_2 = P_2;
-        W1 = vmul_s(x2, Pm1_2);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_2);
-        P_2 = W1;
-        vfmaeq_s(P_2, W2, R);
-        vfmaeq_s(y2, P_2, b);
-        
-        Pm2_3 = Pm1_3; Pm1_3 = P_3;
-        W1 = vmul_s(x3, Pm1_3);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_3);
-        P_3 = W1;
-        vfmaeq_s(P_3, W2, R);
-        vfmaeq_s(y3, P_3, b);
-        
-        Pm2_4 = Pm1_4; Pm1_4 = P_4;
-        W1 = vmul_s(x4, Pm1_4);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_4);
-        P_4 = W1;
-        vfmaeq_s(P_4, W2, R);
-        vfmaeq_s(y4, P_4, b);
-        
-        Pm2_5 = Pm1_5; Pm1_5 = P_5;
-        W1 = vmul_s(x5, Pm1_5);
-        W2 = W1;
-        W2 = vsub_s(W2, Pm2_5);
-        P_5 = W1;
-        vfmaeq_s(P_5, W2, R);
-        vfmaeq_s(y5, P_5, b);
-        
-
-    }
-    
-    vstoreu_s(out + 0 * VLEN_s, y0);
-    
-    vstoreu_s(out + 1 * VLEN_s, y1);
-    
-    vstoreu_s(out + 2 * VLEN_s, y2);
-    
-    vstoreu_s(out + 3 * VLEN_s, y3);
-    
-    vstoreu_s(out + 4 * VLEN_s, y4);
-    
-    vstoreu_s(out + 5 * VLEN_s, y5);
-    
-}
-
-
-
-
-
-void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) {
-    /* (l - 1) / l, for l >= 2 */
-    ptrdiff_t l;
-    r[0] = 0;
-    r[1] = 1;
-    for (l = 2; l <= lmax; ++l) {
-        r[l] = (double)(l - 1) / (double)l;
-    }
-}
-
-void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) {
-    /* (l - 1) / l, for l >= 2 */
-    ptrdiff_t l;
-    r[0] = 0;
-    r[1] = 1;
-    for (l = 2; l <= lmax; ++l) {
-        r[l] = (float)(l - 1) / (float)l;
-    }
-}
-
-
-/*
-  Compute sum_l b_l P_l(x_i) for all i. 
- */
-
-#define LEN (SHARP_LEGENDRE_CS * VLEN)
-#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
-
-
-void sharp_legendre_transform(double *bl,
-                                   double *recfac,
-                                   ptrdiff_t lmax,
-                                   double *x, double *out, ptrdiff_t nx) {
-    double xchunk[MAX_CS * VLEN], outchunk[MAX_CS * LEN];
-    int compute_recfac;
-    ptrdiff_t i, j, len;
-
-    compute_recfac = (recfac == NULL);
-    if (compute_recfac) {
-        recfac = malloc(sizeof(double) * (lmax + 1));
-        sharp_legendre_transform_recfac(recfac, lmax);
-    }
-
-    for (j = 0; j != LEN; ++j) xchunk[j] = 0;
-
-    for (i = 0; i < nx; i += LEN) {
-        len = (i + (LEN) <= nx) ? (LEN) : (nx - i);
-        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
-        switch ((len + VLEN - 1) / VLEN) {
-          case 6: legendre_transform_vec6(recfac, bl, lmax, xchunk, outchunk); break;
-          case 5: legendre_transform_vec5(recfac, bl, lmax, xchunk, outchunk); break;
-          case 4: legendre_transform_vec4(recfac, bl, lmax, xchunk, outchunk); break;
-          case 3: legendre_transform_vec3(recfac, bl, lmax, xchunk, outchunk); break;
-          case 2: legendre_transform_vec2(recfac, bl, lmax, xchunk, outchunk); break;
-          case 1:
-          case 0:
-              legendre_transform_vec1(recfac, bl, lmax, xchunk, outchunk); break;
-        }
-        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
-    }
-    if (compute_recfac) {
-        free(recfac);
-    }
-}
-
-void sharp_legendre_transform_s(float *bl,
-                                   float *recfac,
-                                   ptrdiff_t lmax,
-                                   float *x, float *out, ptrdiff_t nx) {
-    float xchunk[MAX_CS * VLEN_s], outchunk[MAX_CS * LEN_s];
-    int compute_recfac;
-    ptrdiff_t i, j, len;
-
-    compute_recfac = (recfac == NULL);
-    if (compute_recfac) {
-        recfac = malloc(sizeof(float) * (lmax + 1));
-        sharp_legendre_transform_recfac_s(recfac, lmax);
-    }
-
-    for (j = 0; j != LEN_s; ++j) xchunk[j] = 0;
-
-    for (i = 0; i < nx; i += LEN_s) {
-        len = (i + (LEN_s) <= nx) ? (LEN_s) : (nx - i);
-        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
-        switch ((len + VLEN_s - 1) / VLEN_s) {
-          case 6: legendre_transform_vec6_s(recfac, bl, lmax, xchunk, outchunk); break;
-          case 5: legendre_transform_vec5_s(recfac, bl, lmax, xchunk, outchunk); break;
-          case 4: legendre_transform_vec4_s(recfac, bl, lmax, xchunk, outchunk); break;
-          case 3: legendre_transform_vec3_s(recfac, bl, lmax, xchunk, outchunk); break;
-          case 2: legendre_transform_vec2_s(recfac, bl, lmax, xchunk, outchunk); break;
-          case 1:
-          case 0:
-              legendre_transform_vec1_s(recfac, bl, lmax, xchunk, outchunk); break;
-        }
-        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
-    }
-    if (compute_recfac) {
-        free(recfac);
-    }
-}
-
-
-#endif
\ No newline at end of file
diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in
deleted file mode 100644
index cd65012..0000000
--- a/libsharp/sharp_legendre.c.in
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-
-    NOTE NOTE NOTE
-
-    This file is edited in sharp_legendre.c.in which is then preprocessed.
-    Do not make manual  modifications to sharp_legendre.c.
-
-    NOTE NOTE NOTE
-
-*/
-
-
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre.c.in
- *
- *  Copyright (C) 2015 University of Oslo
- *  \author Dag Sverre Seljebotn
- */
-
-#ifndef NO_LEGENDRE
-#if (VLEN==8)
-#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
-/* ...or test it (it probably works) and remove this check */
-#endif
-
-#ifndef SHARP_LEGENDRE_CS
-#define SHARP_LEGENDRE_CS 4
-#endif
-
-#define MAX_CS 6
-#if (SHARP_LEGENDRE_CS > MAX_CS)
-#error (SHARP_LEGENDRE_CS > MAX_CS)
-#endif
-
-#include "sharp_legendre.h"
-#include "sharp_vecsupport.h"
-
-#include <stdlib.h>
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-/*{ for cs in range(1, 7) }*/
-static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
-                                              {{scalar}} xarr[({{cs}}) * VLEN{{T}}],
-                                              {{scalar}} out[({{cs}}) * VLEN{{T}}]) {
-    /*{ for i in range(cs) }*/
-    Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
-    /*{ endfor }*/
-    Tv{{T}} W1, W2, b, R;
-    ptrdiff_t l;
-
-    /*{ for i in range(cs) }*/
-    x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
-    Pm1_{{i}} = vload{{T}}(1.0);
-    P_{{i}} = x{{i}};
-    b = vload{{T}}(*bl);
-    y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
-    /*{ endfor }*/
-    
-    b = vload{{T}}(*(bl + 1));
-    /*{ for i in range(cs) }*/
-    vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
-    /*{ endfor }*/
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload{{T}}(*(bl + l));
-        R = vload{{T}}(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        /*{ for i in range(cs) }*/
-        Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
-        W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
-        W2 = W1;
-        W2 = vsub{{T}}(W2, Pm2_{{i}});
-        P_{{i}} = W1;
-        vfmaeq{{T}}(P_{{i}}, W2, R);
-        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
-        /*{ endfor }*/
-
-    }
-    /*{ for i in range(cs) }*/
-    vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
-    /*{ endfor }*/
-}
-/*{ endfor }*/
-/*{ endfor }*/
-
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
-    /* (l - 1) / l, for l >= 2 */
-    ptrdiff_t l;
-    r[0] = 0;
-    r[1] = 1;
-    for (l = 2; l <= lmax; ++l) {
-        r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
-    }
-}
-/*{ endfor }*/
-
-/*
-  Compute sum_l b_l P_l(x_i) for all i. 
- */
-
-#define LEN (SHARP_LEGENDRE_CS * VLEN)
-#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-void sharp_legendre_transform{{T}}({{scalar}} *bl,
-                                   {{scalar}} *recfac,
-                                   ptrdiff_t lmax,
-                                   {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
-    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
-    int compute_recfac;
-    ptrdiff_t i, j, len;
-
-    compute_recfac = (recfac == NULL);
-    if (compute_recfac) {
-        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
-        sharp_legendre_transform_recfac{{T}}(recfac, lmax);
-    }
-
-    for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;
-
-    for (i = 0; i < nx; i += LEN{{T}}) {
-        len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
-        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
-        switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
-          case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 1:
-          case 0:
-              legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-        }
-        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
-    }
-    if (compute_recfac) {
-        free(recfac);
-    }
-}
-/*{ endfor }*/
-
-#endif
diff --git a/libsharp/sharp_legendre.h b/libsharp/sharp_legendre.h
deleted file mode 100644
index cfd8aee..0000000
--- a/libsharp/sharp_legendre.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre.h
- *  Interface for the Legendre transform parts of the spherical transform library.
- *
- *  Copyright (C) 2015 University of Oslo
- *  \author Dag Sverre Seljebotn
- */
-
-#ifndef SHARP_LEGENDRE_H
-#define SHARP_LEGENDRE_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef NO_LEGENDRE
-
-void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
-                              double *out, ptrdiff_t nx);
-void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
-                                float *out, ptrdiff_t nx);
-void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
-void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp_legendre_table.c b/libsharp/sharp_legendre_table.c
deleted file mode 100644
index 7fef085..0000000
--- a/libsharp/sharp_legendre_table.c
+++ /dev/null
@@ -1,1467 +0,0 @@
-/*
-
-This file originated as a concatenation of files from libpsht. Further refactoring
-could be carried out to make the code use libsharp conventions instead for SSE etc.;
-
-*/
-
-
-/*
-sse_utils.h
-*/
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sse_utils.h
- *  SSE/SSE2/SSE3-related functionality
- *
- *  Copyright (C) 2010,2011 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-
-#if (defined(__SSE__))
-
-#include <xmmintrin.h>
-
-
-typedef __m128 v4sf; /* vector of 4 floats (SSE1) */
-
-typedef union {
-  float f[4];
-  v4sf v;
-} V4SF;
-
-static inline v4sf build_v4sf (float a, float b, float c, float d)
-  { return _mm_set_ps(d,c,b,a); }
-static inline void read_v4sf (v4sf v, float *a, float *b, float *c, float *d)
-  {
-  V4SF tmp;
-  tmp.v = v;
-  if (a) *a=tmp.f[0];
-  if (b) *b=tmp.f[1];
-  if (c) *c=tmp.f[2];
-  if (d) *d=tmp.f[3];
-  }
-
-
-#endif
-
-#if (defined(__SSE2__))
-
-#include <emmintrin.h>
-
-
-typedef __m128d v2df; /* vector of 2 doubles (SSE2) */
-
-typedef union {
-  double d[2];
-  v2df v;
-} V2DF;
-
-typedef struct {
-  v2df a,b;
-} v2df2;
-typedef struct {
-  V2DF a,b;
-} V2DF2;
-
-#define V2DF_SIGNMASK _mm_set1_pd(-0.0)
-
-static inline v2df build_v2df (double a, double b)
-  { return _mm_set_pd(b,a); }
-static inline void read_v2df (v2df v, double *a, double *b)
-  { _mm_store_sd(a,v); _mm_storeh_pd(b,v); }
-
-static inline int v2df_any_gt (v2df a, v2df b)
-  {
-  return (_mm_movemask_pd(_mm_cmpgt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))!=0);
-  }
-static inline int v2df_all_ge (v2df a, v2df b)
-  {
-  return (_mm_movemask_pd(_mm_cmplt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))==0);
-  }
-static inline V2DF to_V2DF (v2df x)
-  { V2DF X; X.v=x; return X; }
-static inline V2DF2 to_V2DF2 (v2df2 x)
-  { V2DF2 X; X.a.v=x.a; X.b.v=x.b; return X; }
-static inline v2df2 to_v2df2 (V2DF2 X)
-  { v2df2 x; x.a=X.a.v; x.b=X.b.v; return x; }
-static inline v2df2 zero_v2df2(void)
-  { v2df2 x; x.a=x.b=_mm_setzero_pd(); return x; }
-
-
-#endif
-
-#if (defined(__SSE3__))
-
-#include <pmmintrin.h>
-
-#endif
-
-
-
-/*
-ylmgen_c.h
-*/
-/*
- *  This file is part of libpsht.
- *
- *  libpsht is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libpsht is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libpsht; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file ylmgen_c.h
- *  Code for efficient calculation of Y_lm(phi=0,theta)
- *
- *  Copyright (C) 2005-2011 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-typedef double ylmgen_dbl2[2];
-typedef double ylmgen_dbl3[3];
-
-typedef struct
-  {
-  double cth_crit;
-  int mdist_crit;
-  /* members depending on m and m' */
-  int s, m, mlo, mhi, cosPow, sinPow;
-  long double prefactor;
-  ylmgen_dbl3 *fx;
-  int preMinus_p, preMinus_m;
-  } sylmgen_d;
-
-typedef struct
-  {
-  double fsmall, fbig, eps, cth_crit;
-  int lmax, mmax, smax, m_cur, ith, nth, m_crit, spinrec;
-  /*! The index of the first non-negligible Y_lm value. */
-  int *firstl;
-  double *cf, *mfac, *t1fac, *t2fac, *th, *cth, *sth, *logsth;
-  ylmgen_dbl2 *recfac;
-  double *lamfact;
-  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
-  double *ylm;
-  /*! Points to an array of size [0..lmax] containing the lambda_w
-      and lambda_x values for spin>0 transforms. */
-  ylmgen_dbl2 **lambda_wx;
-  long double *logsum, *lc05, *ls05;
-  double *flm1, *flm2, *xl;
-
-  sylmgen_d **sylm;
-
-  int *lwx_uptodate;
-  int ylm_uptodate;
-
-#ifdef __SSE2__
-  int ith1, ith2;
-  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
-  v2df *ylm_sse2;
-  /*! Points to an array of size [0..lmax] containing the lambda_w
-      and lambda_x values for spin>0 transforms. */
-  v2df2 **lambda_wx_sse2;
-  int *lwx_uptodate_sse2;
-  int ylm_uptodate_sse2;
-#endif
-
-  int recfac_uptodate, lamfact_uptodate;
-  } Ylmgen_C;
-
-/*! Creates a generator which will calculate Y_lm(theta,phi=0)
-    up to \a l=l_max and \a m=m_max. It may regard Y_lm whose absolute
-    magnitude is smaller than \a epsilon as zero. If \a spinrec is nonzero,
-    the spin-1 and spin-2 Y_lm will be calculated by recursion from the spin-0
-    ones, otherwise Wigner d matrix elements will be used. */
-static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
-   double epsilon);
-
-/*! Passes am array \a theta of \a nth colatitudes that will be used in
-    subsequent calls. The individual angles will be referenced by their
-    index in the array, starting with 0.
-    \note The array can be freed or reused after the call. */
-static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth);
-
-/*! Deallocates a generator previously initialised by Ylmgen_init(). */
-static void Ylmgen_destroy (Ylmgen_C *gen);
-
-/*! Prepares the object for the calculation at \a theta and \a m. */
-static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m);
-
-/*! Recalculates (if necessary) the Y_lm values. */
-static void Ylmgen_recalc_Ylm (Ylmgen_C *gen);
-/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
-    transforms. */
-/*static void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin);*/
-
-#ifdef __SSE2__
-/*! Prepares the object for the calculation at \a theta, \a theta2 and \a m. */
-static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m);
-
-/*! Recalculates (if necessary) the Y_lm values. */
-static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen);
-/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
-    transforms. */
-/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin);*/
-#endif
-
-/*! Returns a pointer to an array with lmax+1 entries containing normalisation
-    factors that must be applied to Y_lm values computed for \a spin with the
-    given \a spinrec flag. The array must be deallocated (using free()) by the
-    user. */
-/*static double *Ylmgen_get_norm (int lmax, int spin, int spinrec);*/
-
-
-/*
-ylmgen_c.c
-*/
-
-/*
- *  This file is part of libpsht.
- *
- *  libpsht is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libpsht is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libpsht; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Code for efficient calculation of Y_lm(theta,phi=0)
- *
- *  Copyright (C) 2005-2011 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include "c_utils.h"
-
-enum { large_exponent2=90, minscale=-4, maxscale=11 };
-
-/*static void sylmgen_init (sylmgen_d *gen, const Ylmgen_C *ygen, int spin)
-  {
-  int i;
-  UTIL_ASSERT(spin>=1,"incorrect spin");
-  gen->s=spin;
-  gen->m=gen->mlo=gen->mhi=-1234567890;
-  ALLOC(gen->fx,ylmgen_dbl3,ygen->lmax+2);
-
-  for (i=0; i<ygen->lmax+2; ++i)
-    gen->fx[i][0]=gen->fx[i][1]=gen->fx[i][2]=0.;
-
-  gen->cth_crit = 2.;
-  gen->mdist_crit = ygen->lmax+1;
-  }*/
-
-static void sylmgen_destroy (sylmgen_d *gen)
-  { DEALLOC(gen->fx); }
-
-/*static void sylmgen_prepare (sylmgen_d *gen, const Ylmgen_C *ygen, int m_)
-  {
-  int mlo_, mhi_, ms_similar, l;
-
-  if (m_==gen->m) return;
-  UTIL_ASSERT(m_>=0,"incorrect m");
-
-  mlo_=m_; mhi_=gen->s;
-  if (mhi_<mlo_) SWAP(mhi_,mlo_,int);
-  ms_similar = ((gen->mhi==mhi_) && (gen->mlo==mlo_));
-
-  gen->m=m_;
-  gen->mlo = mlo_; gen->mhi = mhi_;
-
-  if (!ms_similar)
-    {
-    for (l=gen->mhi; l<ygen->lmax; ++l)
-      {
-      double t = ygen->flm1[l+gen->m]*ygen->flm1[l-gen->m]
-                *ygen->flm1[l+gen->s]*ygen->flm1[l-gen->s];
-      double lt = 2*l+1;
-      double l1 = l+1;
-      gen->fx[l+1][0]=l1*lt*t;
-      gen->fx[l+1][1]=gen->m*gen->s*ygen->xl[l]*ygen->xl[l+1];
-      t = ygen->flm2[l+gen->m]*ygen->flm2[l-gen->m]
-         *ygen->flm2[l+gen->s]*ygen->flm2[l-gen->s];
-      gen->fx[l+1][2]=t*l1*ygen->xl[l];
-      }
-    gen->prefactor = 0.5L*(ygen->logsum[2*gen->mhi]
-      -ygen->logsum[gen->mhi+gen->mlo]-ygen->logsum[gen->mhi-gen->mlo]);
-    }
-
-  gen->preMinus_p = gen->preMinus_m = 0;
-  if (gen->mhi==gen->m)
-    {
-    gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s;
-    gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1);
-    }
-  else
-    {
-    gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m;
-    gen->preMinus_m = ((gen->mhi+gen->m)&1);
-    }
-    }*/
-
-/*static void sylmgen_recalc (sylmgen_d *gen, const Ylmgen_C *ygen, int ith,
-  ylmgen_dbl2 *res, int *firstl)
-  {
-  const double ln2     = 0.6931471805599453094172321214581766;
-  const double inv_ln2 = 1.4426950408889634073599246810018921;
-  int l=gen->mhi;
-  int lmax = ygen->lmax;
-  ylmgen_dbl3 *fy = gen->fx;
-  const double fsmall = ygen->fsmall, fbig = ygen->fbig, eps = ygen->eps;
-  const double cth = ygen->cth[ith];
-  long double logvalp = inv_ln2*(gen->prefactor
-    + ygen->lc05[ith]*gen->cosPow + ygen->ls05[ith]*gen->sinPow);
-  long double logvalm = inv_ln2*(gen->prefactor
-    + ygen->lc05[ith]*gen->sinPow + ygen->ls05[ith]*gen->cosPow);
-  int scalep = (int)(logvalp/large_exponent2)-minscale;
-  int scalem = (int)(logvalm/large_exponent2)-minscale;
-  double rec1p=0., rec1m=0.;
-  double rec2p = exp(ln2*(double)(logvalp-(scalep+minscale)*large_exponent2));
-  double rec2m = exp(ln2*(double)(logvalm-(scalem+minscale)*large_exponent2));
-  double corfacp,corfacm;
-  double tp,tm;
-
-  if ((abs(gen->m-gen->s)>=gen->mdist_crit)&&(fabs(cth)>=gen->cth_crit))
-    { *firstl=ygen->lmax+1; return; }
-
-  if (gen->preMinus_p)
-    rec2p=-rec2p;
-  if (gen->preMinus_m)
-    rec2m=-rec2m;
-  if (gen->s&1)
-    rec2p=-rec2p;
-
-  / iterate until we reach the realm of IEEE numbers /
-  while((scalem<0)&&(scalep<0))
-    {
-    if (++l>lmax) break;
-    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
-    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
-    if (++l>lmax) break;
-    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
-    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
-
-    while (fabs(rec2p)>fbig)
-      { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
-    while (fabs(rec2m)>fbig)
-      { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
-    }
-
-  corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
-  corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
-
-  if (l<=lmax)
-    {
-    while (1)
-      {
-      if ((fabs(rec2p*corfacp)>eps) || (fabs(rec2m*corfacm)>eps))
-        break;
-      if (++l>lmax) break;
-      rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
-      rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
-      if ((fabs(rec1p*corfacp)>eps) || (fabs(rec1m*corfacm)>eps))
-        { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
-      if (++l>lmax) break;
-      rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
-      rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
-
-      if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
-        {
-        while (fabs(rec2p)>fbig)
-          { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
-        while (fabs(rec2m)>fbig)
-          { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
-        corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
-        corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
-        }
-      }
-    }
-
-  *firstl=l;
-  if (l>lmax)
-    {
-    gen->mdist_crit=abs(gen->m-gen->s);
-    gen->cth_crit= fabs(cth);
-    return;
-    }
-
-  tp = rec2p*corfacp; tm = rec2m*corfacm;
-  res[l][0]=tp+tm;
-  res[l][1]=tm-tp;
-
-  while (1)
-    {
-    if ((fabs(tp)>eps) && (fabs(tm)>eps))
-      break;
-    if (++l>lmax) break;
-    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
-    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
-    tp=rec1p*corfacp; tm=rec1m*corfacm;
-    res[l][0]=tp+tm; res[l][1]=tm-tp;
-    if ((fabs(tp)>eps) && (fabs(tm)>eps))
-      { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
-    if (++l>lmax) break;
-    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
-    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
-    tp=rec2p*corfacp; tm=rec2m*corfacm;
-    res[l][0]=tp+tm; res[l][1]=tm-tp;
-
-    if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
-      {
-      while (fabs(rec2p)>fbig)
-        { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
-      while (fabs(rec2m)>fbig)
-        { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
-      corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
-      corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
-      }
-    }
-
-  rec1p *= corfacp; rec2p *= corfacp;
-  rec1m *= corfacm; rec2m *= corfacm;
-
-  for (;l<lmax-1;l+=2)
-    {
-    rec1p = (cth - fy[l+1][1])*fy[l+1][0]*rec2p - fy[l+1][2]*rec1p;
-    rec1m = (cth + fy[l+1][1])*fy[l+1][0]*rec2m - fy[l+1][2]*rec1m;
-    res[l+1][0] = rec1p+rec1m; res[l+1][1] = rec1m-rec1p;
-    rec2p = (cth - fy[l+2][1])*fy[l+2][0]*rec1p - fy[l+2][2]*rec2p;
-    rec2m = (cth + fy[l+2][1])*fy[l+2][0]*rec1m - fy[l+2][2]*rec2m;
-    res[l+2][0] = rec2p+rec2m; res[l+2][1] = rec2m-rec2p;
-    }
-  while (1)
-    {
-    if (++l>lmax) break;
-    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
-    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
-    res[l][0] = rec1p+rec1m; res[l][1] = rec1m-rec1p;
-    if (++l>lmax) break;
-    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
-    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
-    res[l][0] = rec2p+rec2m; res[l][1] = rec2m-rec2p;
-    }
-    }*/
-
-#ifdef __SSE2__
-
-#define ADVANCE(L,ap,am,bp,bm) \
-  { \
-  v2df f0=_mm_set1_pd(fy[L][0]), \
-       f1=_mm_set1_pd(fy[L][1]), \
-       f2=_mm_set1_pd(fy[L][2]); \
-  ap = _mm_sub_pd( \
-       _mm_mul_pd(_mm_sub_pd(cth,f1),_mm_mul_pd(f0,bp)), \
-       _mm_mul_pd(f2,ap)); \
-  am = _mm_sub_pd( \
-       _mm_mul_pd(_mm_add_pd(cth,f1),_mm_mul_pd(f0,bm)), \
-       _mm_mul_pd(f2,am)); \
-  }
-
-#define RENORMX(r1,r2,corf,sca,scb) \
-  do \
-    { \
-    double rec1a, rec1b, rec2a, rec2b, corfaca, corfacb; \
-    read_v2df (r1, &rec1a, &rec1b); read_v2df (r2, &rec2a, &rec2b); \
-    read_v2df (corf, &corfaca, &corfacb); \
-    while (fabs(rec2a)>fbig) \
-      { \
-      rec1a*=fsmall; rec2a*=fsmall; ++sca; \
-      corfaca = (sca<0) ? 0. : ygen->cf[sca]; \
-      } \
-    while (fabs(rec2b)>fbig) \
-      { \
-      rec1b*=fsmall; rec2b*=fsmall; ++scb; \
-      corfacb = (scb<0) ? 0. : ygen->cf[scb]; \
-      } \
-    r1=build_v2df(rec1a,rec1b); r2=build_v2df(rec2a,rec2b); \
-    corf=build_v2df(corfaca,corfacb); \
-    } \
-  while(0)
-
-#define RENORM \
-  RENORMX(rec1p,rec2p,corfacp,scale1p,scale2p); \
-  RENORMX(rec1m,rec2m,corfacm,scale1m,scale2m)
-
-/*static void sylmgen_recalc_sse2 (sylmgen_d *gen, const Ylmgen_C *ygen,
-  int ith1, int ith2, v2df2 *res, int *firstl)
-  {
-  const double ln2     = 0.6931471805599453094172321214581766;
-  const double inv_ln2 = 1.4426950408889634073599246810018921;
-  int l=gen->mhi;
-  int lmax = ygen->lmax;
-  ylmgen_dbl3 *fy = gen->fx;
-  const double fbig=ygen->fbig, fsmall=ygen->fsmall;
-  v2df eps2    = build_v2df(ygen->eps,ygen->eps);
-  const double cth1=ygen->cth[ith1], cth2=ygen->cth[ith2];
-  v2df cth = build_v2df(cth1,cth2);
-  long double
-    logval1p = inv_ln2*(gen->prefactor
-             + ygen->lc05[ith1]*gen->cosPow + ygen->ls05[ith1]*gen->sinPow),
-    logval2p = inv_ln2*(gen->prefactor
-             + ygen->lc05[ith2]*gen->cosPow + ygen->ls05[ith2]*gen->sinPow),
-    logval1m = inv_ln2*(gen->prefactor
-             + ygen->lc05[ith1]*gen->sinPow + ygen->ls05[ith1]*gen->cosPow),
-    logval2m = inv_ln2*(gen->prefactor
-             + ygen->lc05[ith2]*gen->sinPow + ygen->ls05[ith2]*gen->cosPow);
-
-  int scale1p = (int)(logval1p/large_exponent2)-minscale,
-      scale2p = (int)(logval2p/large_exponent2)-minscale,
-      scale1m = (int)(logval1m/large_exponent2)-minscale,
-      scale2m = (int)(logval2m/large_exponent2)-minscale;
-
-  v2df rec1p =_mm_setzero_pd(), rec1m=_mm_setzero_pd();
-  v2df rec2p = build_v2df(
-    exp(ln2*(double)(logval1p-(scale1p+minscale)*large_exponent2)),
-    exp(ln2*(double)(logval2p-(scale2p+minscale)*large_exponent2)));
-  v2df rec2m = build_v2df(
-    exp(ln2*(double)(logval1m-(scale1m+minscale)*large_exponent2)),
-    exp(ln2*(double)(logval2m-(scale2m+minscale)*large_exponent2)));
-  v2df corfacp=build_v2df((scale1p<0) ? 0. : ygen->cf[scale1p],
-                          (scale2p<0) ? 0. : ygen->cf[scale2p]),
-       corfacm=build_v2df((scale1m<0) ? 0. : ygen->cf[scale1m],
-                          (scale2m<0) ? 0. : ygen->cf[scale2m]);
-  v2df tp,tm;
-
-  if ((abs(gen->m-gen->s)>=gen->mdist_crit)
-     &&(fabs(ygen->cth[ith1])>=gen->cth_crit)
-     &&(fabs(ygen->cth[ith2])>=gen->cth_crit))
-    { *firstl=ygen->lmax+1; return; }
-
-  if (gen->preMinus_p)
-    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
-  if (gen->preMinus_m)
-    rec2m = _mm_xor_pd (rec2m,V2DF_SIGNMASK); / negate /
-  if (gen->s&1)
-    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
-
-  / iterate until we reach the realm of IEEE numbers /
-  while((scale1m<0)&&(scale1p<0)&&(scale2m<0)&&(scale2p<0))
-    {
-    if (++l>lmax) break;
-    ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
-    if (++l>lmax) break;
-    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
-
-    RENORM;
-    }
-
-  if (l<=lmax)
-    {
-    while (1)
-      {
-      if (v2df_any_gt(_mm_mul_pd(rec2p,corfacp),eps2) ||
-          v2df_any_gt(_mm_mul_pd(rec2m,corfacm),eps2))
-        break;
-      if (++l>lmax) break;
-      ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
-      if (v2df_any_gt(_mm_mul_pd(rec1p,corfacp),eps2) ||
-          v2df_any_gt(_mm_mul_pd(rec1m,corfacm),eps2))
-        { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
-      if (++l>lmax) break;
-      ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
-
-      RENORM;
-      }
-    }
-
-  *firstl=l;
-  if (l>lmax)
-    {
-    gen->mdist_crit=abs(gen->m-gen->s);
-    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
-    return;
-    }
-  tp = _mm_mul_pd(rec2p,corfacp); tm = _mm_mul_pd(rec2m,corfacm);
-  res[l].a=_mm_add_pd(tp,tm);
-  res[l].b=_mm_sub_pd(tm,tp);
-
-  while (1)
-    {
-    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
-      break;
-    if (++l>lmax) break;
-    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
-    tp=_mm_mul_pd(rec1p,corfacp); tm=_mm_mul_pd(rec1m,corfacm);
-    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
-    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
-      { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
-    if (++l>lmax) break;
-    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
-    tp=_mm_mul_pd(rec2p,corfacp); tm=_mm_mul_pd(rec2m,corfacm);
-    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
-
-    RENORM;
-    }
-
-  rec1p = _mm_mul_pd(rec1p,corfacp); rec2p = _mm_mul_pd(rec2p,corfacp);
-  rec1m = _mm_mul_pd(rec1m,corfacm); rec2m = _mm_mul_pd(rec2m,corfacm);
-
-  for (;l<lmax-1;l+=2)
-    {
-    ADVANCE(l+1,rec1p,rec1m,rec2p,rec2m)
-    res[l+1].a=_mm_add_pd(rec1p,rec1m); res[l+1].b=_mm_sub_pd(rec1m,rec1p);
-    ADVANCE(l+2,rec2p,rec2m,rec1p,rec1m)
-    res[l+2].a=_mm_add_pd(rec2p,rec2m); res[l+2].b=_mm_sub_pd(rec2m,rec2p);
-    }
-  while (1)
-    {
-    if (++l>lmax) break;
-    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
-    res[l].a=_mm_add_pd(rec1p,rec1m); res[l].b=_mm_sub_pd(rec1m,rec1p);
-    if (++l>lmax) break;
-    ADVANCE(l,rec2p,rec2m,rec1p,rec1m)
-    res[l].a=_mm_add_pd(rec2p,rec2m); res[l].b=_mm_sub_pd(rec2m,rec2p);
-    }
-  }
-*/
-#endif
-
-static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
-  double epsilon)
-  {
-  int m;
-  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
-  const double inv_ln2 = 1.4426950408889634073599246810018921;
-
-  gen->fsmall = ldexp(1.,-large_exponent2);
-  gen->fbig   = ldexp(1., large_exponent2);
-  gen->eps = epsilon;
-  gen->cth_crit = 2.;
-  gen->ith = -1;
-  gen->nth = 0;
-  gen->lmax = l_max;
-  gen->mmax = m_max;
-  gen->smax = s_max;
-  gen->spinrec = spinrec;
-  gen->m_cur = -1;
-  gen->m_crit = gen->mmax+1;
-  gen->firstl = RALLOC(int,gen->smax+1);
-  for (m=0; m<=gen->smax; ++m) gen->firstl[m]=-1;
-  gen->cf = RALLOC(double,maxscale-minscale+1);
-  for (m=0; m<(maxscale-minscale+1); ++m)
-    gen->cf[m] = ldexp(1.,(m+minscale)*large_exponent2);
-  gen->recfac = RALLOC(ylmgen_dbl2,gen->lmax+1);
-  gen->mfac = RALLOC(double,gen->mmax+1);
-  gen->mfac[0] = 1;
-  for (m=1; m<=gen->mmax; ++m)
-    gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-  for (m=0; m<=gen->mmax; ++m)
-    gen->mfac[m] = inv_ln2*log(inv_sqrt4pi*gen->mfac[m]);
-
-  gen->t1fac = RALLOC(double,gen->lmax+1);
-  for (m=0; m<=gen->lmax; ++m)
-    gen->t1fac[m] = sqrt(4.*(m+1)*(m+1)-1.);
-  gen->t2fac = RALLOC(double,gen->lmax+gen->mmax+1);
-  for (m=0; m<=gen->lmax+gen->mmax; ++m)
-    gen->t2fac[m] = 1./sqrt(m+1.);
-
-  gen->lamfact = RALLOC(double,gen->lmax+1);
-  gen->ylm = RALLOC(double,gen->lmax+1);
-  ALLOC(gen->lambda_wx,ylmgen_dbl2 *,gen->smax+1);
-  for (m=0; m<=gen->smax; ++m)
-    gen->lambda_wx[m]=NULL;
-
-  gen->sylm = RALLOC(sylmgen_d *,gen->smax+1);
-  for (m=0; m<=gen->smax; ++m)
-    gen->sylm[m]=NULL;
-
-  gen->ylm_uptodate = 0;
-  gen->lwx_uptodate = RALLOC(int,gen->smax+1);
-  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
-  gen->recfac_uptodate = 0;
-  gen->lamfact_uptodate = 0;
-
-  gen->th = gen->cth = gen->sth = gen->logsth = NULL;
-
-#ifdef __SSE2__
-  gen->ith1 = gen->ith2 = -1;
-  gen->ylm_sse2 = RALLOC(v2df,gen->lmax+1);
-  ALLOC(gen->lambda_wx_sse2,v2df2 *,gen->smax+1);
-  for (m=0; m<=gen->smax; ++m)
-    gen->lambda_wx_sse2[m]=NULL;
-  gen->ylm_uptodate_sse2 = 0;
-  gen->lwx_uptodate_sse2 = RALLOC(int,gen->smax+1);
-  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
-#endif
-
-  ALLOC(gen->logsum,long double,2*gen->lmax+1);
-  gen->lc05 = gen->ls05 = NULL;
-  ALLOC(gen->flm1,double,2*gen->lmax+1);
-  ALLOC(gen->flm2,double,2*gen->lmax+1);
-  ALLOC(gen->xl,double,gen->lmax+1);
-
-  gen->logsum[0] = 0.;
-  for (m=1; m<2*gen->lmax+1; ++m)
-    gen->logsum[m] = gen->logsum[m-1]+logl((long double)m);
-  for (m=0; m<2*gen->lmax+1; ++m)
-    {
-    gen->flm1[m] = sqrt(1./(m+1.));
-    gen->flm2[m] = sqrt(m/(m+1.));
-    }
-
-  gen->xl[0]=0;
-  for (m=1; m<gen->lmax+1; ++m) gen->xl[m]=1./m;
-  }
-
-static void Ylmgen_destroy (Ylmgen_C *gen)
-  {
-  int m;
-
-  DEALLOC(gen->firstl);
-  DEALLOC(gen->cf);
-  DEALLOC(gen->recfac);
-  DEALLOC(gen->mfac);
-  DEALLOC(gen->t1fac);
-  DEALLOC(gen->t2fac);
-  DEALLOC(gen->lamfact);
-  DEALLOC(gen->ylm);
-  DEALLOC(gen->lwx_uptodate);
-  for (m=0; m<=gen->smax; ++m)
-    DEALLOC(gen->lambda_wx[m]);
-  DEALLOC(gen->lambda_wx);
-  for (m=0; m<=gen->smax; ++m)
-    if (gen->sylm[m])
-      {
-      sylmgen_destroy (gen->sylm[m]);
-      DEALLOC(gen->sylm[m]);
-      }
-  DEALLOC(gen->sylm);
-  DEALLOC(gen->th);
-  DEALLOC(gen->cth);
-  DEALLOC(gen->sth);
-  DEALLOC(gen->logsth);
-  DEALLOC(gen->logsum);
-  DEALLOC(gen->lc05);
-  DEALLOC(gen->ls05);
-  DEALLOC(gen->flm1);
-  DEALLOC(gen->flm2);
-  DEALLOC(gen->xl);
-#ifdef __SSE2__
-  DEALLOC(gen->ylm_sse2);
-  for (m=0; m<=gen->smax; ++m)
-    DEALLOC(gen->lambda_wx_sse2[m]);
-  DEALLOC(gen->lambda_wx_sse2);
-  DEALLOC(gen->lwx_uptodate_sse2);
-#endif
-  }
-
-static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth)
-  {
-  const double inv_ln2 = 1.4426950408889634073599246810018921;
-  int m;
-  DEALLOC(gen->th);
-  DEALLOC(gen->cth);
-  DEALLOC(gen->sth);
-  DEALLOC(gen->logsth);
-  DEALLOC(gen->lc05);
-  DEALLOC(gen->ls05);
-  gen->th = RALLOC(double,nth);
-  gen->cth = RALLOC(double,nth);
-  gen->sth = RALLOC(double,nth);
-  gen->logsth = RALLOC(double,nth);
-  gen->lc05 = RALLOC(long double,nth);
-  gen->ls05 = RALLOC(long double,nth);
-  for (m=0; m<nth; ++m)
-    {
-    const double pi = 3.141592653589793238462643383279502884197;
-    double th=theta[m];
-    UTIL_ASSERT ((th>=0.)&&(th<=pi),"bad theta angle specified");
-    /* tiny adjustments to make sure cos and sin (theta/2) are positive */
-    if (th==0.) th=1e-16;
-    if (ABSAPPROX(th,pi,1e-15)) th=pi-1e-15;
-    gen->th[m] = th;
-    gen->cth[m] = cos(th);
-    gen->sth[m] = sin(th);
-    gen->logsth[m] = inv_ln2*log(gen->sth[m]);
-    gen->lc05[m]=logl(cosl(0.5L*th));
-    gen->ls05[m]=logl(sinl(0.5L*th));
-    }
-
-  gen->nth = nth;
-  gen->ith = -1;
-#ifdef __SSE2__
-  gen->ith1 = gen->ith2 = -1;
-#endif
-  }
-
-static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m)
-  {
-  if ((ith==gen->ith) && (m==gen->m_cur)) return;
-
-  gen->ylm_uptodate = 0;
-  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
-
-  gen->ith = ith;
-
-  if (m!=gen->m_cur)
-    {
-    gen->recfac_uptodate = 0;
-    gen->lamfact_uptodate = 0;
-    gen->m_cur = m;
-    }
-  }
-
-static void Ylmgen_recalc_recfac (Ylmgen_C *gen)
-  {
-  double f_old=1;
-  int l, m;
-
-  if (gen->recfac_uptodate) return;
-  gen->recfac_uptodate = 1;
-
-  m = gen->m_cur;
-  for (l=m; l<=gen->lmax; ++l)
-    {
-    gen->recfac[l][0] = gen->t1fac[l]*gen->t2fac[l+m]*gen->t2fac[l-m];
-    gen->recfac[l][1] = gen->recfac[l][0]/f_old;
-    f_old = gen->recfac[l][0];
-    }
-  }
-
-/*static void Ylmgen_recalc_lamfact (Ylmgen_C *gen)
-  {
-  int l, m;
-
-  if (gen->lamfact_uptodate) return;
-  gen->lamfact_uptodate = 1;
-
-  m = gen->m_cur;
-  gen->lamfact[m] = 0;
-  for (l=m+1; l<=gen->lmax; ++l)
-    gen->lamfact[l] = sqrt((2*l+1.)/(2*l-1.) * (l*l-m*m));
-    }*/
-
-#define RENORMALIZE_SCALAR \
-  do \
-    { \
-    while (fabs(lam_2)>fbig) \
-      { lam_1*=fsmall; lam_2*=fsmall; ++scale; } \
-    corfac = (scale<0) ? 0. : gen->cf[scale]; \
-    } \
-  while(0)
-
-static void Ylmgen_recalc_Ylm (Ylmgen_C *gen)
-  {
-  const double ln2 = 0.6931471805599453094172321214581766;
-
-  double logval,lam_1,lam_2,corfac;
-  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
-  ylmgen_dbl2 *recfac = gen->recfac;
-  int lmax=gen->lmax;
-  int scale,l;
-  int m = gen->m_cur;
-  double cth=gen->cth[gen->ith], sth=gen->sth[gen->ith];
-  double *result = gen->ylm;
-
-  if (gen->ylm_uptodate) return;
-  gen->ylm_uptodate=1;
-
-  if (((m>=gen->m_crit)&&(fabs(cth)>=gen->cth_crit)) || ((m>0)&&(sth==0)))
-    { gen->firstl[0]=gen->lmax+1; return; }
-
-  Ylmgen_recalc_recfac(gen);
-
-  logval = gen->mfac[m];
-  if (m>0) logval += m*gen->logsth[gen->ith];
-  scale = (int) (logval/large_exponent2)-minscale;
-  corfac = (scale<0) ? 0. : gen->cf[scale];
-
-  lam_1 = 0;
-  lam_2 = exp(ln2*(logval-(scale+minscale)*large_exponent2));
-  if (m&1) lam_2 = -lam_2;
-
-  l=m;
-  if (scale<0)
-    {
-    while (1)
-      {
-      if (++l>lmax) break;
-      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
-      if (++l>lmax) break;
-      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
-      if (fabs(lam_2)>fbig)
-        {
-        RENORMALIZE_SCALAR;
-        if (scale>=0) break;
-        }
-      }
-    }
-
-  lam_1*=corfac;
-  lam_2*=corfac;
-
-  if (l<=lmax)
-    {
-    while (1)
-      {
-      if (fabs(lam_2)>eps) break;
-      if (++l>lmax) break;
-      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
-      if (fabs(lam_1)>eps)
-        { double x=lam_1; lam_1=lam_2; lam_2=x; break; }
-      if (++l>lmax) break;
-      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
-      }
-    }
-
-  gen->firstl[0]=l;
-  if (l>lmax)
-    { gen->m_crit=m; gen->cth_crit=fabs(cth); return; }
-
-  for(;l<lmax-3;l+=4)
-    {
-    result[l]=lam_2;
-    lam_1 = cth*lam_2*recfac[l][0] - lam_1*recfac[l][1];
-    result[l+1] = lam_1;
-    lam_2 = cth*lam_1*recfac[l+1][0] - lam_2*recfac[l+1][1];
-    result[l+2]=lam_2;
-    lam_1 = cth*lam_2*recfac[l+2][0] - lam_1*recfac[l+2][1];
-    result[l+3] = lam_1;
-    lam_2 = cth*lam_1*recfac[l+3][0] - lam_2*recfac[l+3][1];
-    }
-
-  while (1)
-    {
-    result[l]=lam_2;
-    if (++l>lmax) break;
-    lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
-    result[l] = lam_1;
-    if (++l>lmax) break;
-    lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
-    }
-  }
-
-
-/*
-static void Ylmgen_recalc_lambda_wx1 (Ylmgen_C *gen)
-  {
-  if (gen->lwx_uptodate[1]) return;
-  Ylmgen_recalc_Ylm(gen);
-  gen->firstl[1] = gen->firstl[0];
-  if (gen->firstl[1]>gen->lmax) return;
-  Ylmgen_recalc_lamfact(gen);
-  gen->lwx_uptodate[1] = 1;
-
-  {
-  double cth=gen->cth[gen->ith];
-  double xsth=1./(gen->sth[gen->ith]);
-  double m=gen->m_cur;
-  double m_on_sth = m*xsth;
-  double lam_lm=0;
-  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[1];
-  int l;
-  double ell;
-  for (ell=l=gen->firstl[1]; l<=gen->lmax; ++l, ell+=1.)
-    {
-    double lam_lm1m=lam_lm;
-    lam_lm=gen->ylm[l];
-    lambda_wx[l][0] = xsth*(gen->lamfact[l]*lam_lm1m - ell*cth*lam_lm);
-    lambda_wx[l][1] = m_on_sth*lam_lm;
-    }
-  }
-  }
-
-static void Ylmgen_recalc_lambda_wx2 (Ylmgen_C *gen)
-  {
-  if (gen->lwx_uptodate[2]) return;
-  Ylmgen_recalc_Ylm(gen);
-  gen->firstl[2] = gen->firstl[0];
-  if (gen->firstl[2]>gen->lmax) return;
-  Ylmgen_recalc_lamfact(gen);
-  gen->lwx_uptodate[2] = 1;
-
-  {
-  double cth=gen->cth[gen->ith];
-  double sth=gen->sth[gen->ith];
-  double m=gen->m_cur;
-  double one_on_s2 = 1./(sth*sth);
-  double two_on_s2 = 2*one_on_s2;
-  double two_c_on_s2 = cth * two_on_s2;
-  double m2 = m*m;
-  double two_m_on_s2 = m*two_on_s2;
-  double lam_lm=0;
-  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[2];
-  int l;
-  double ell;
-  for (ell=l=gen->firstl[2]; l<=gen->lmax; ++l, ell+=1.)
-    {
-    double lam_lm1m=lam_lm;
-    lam_lm=gen->ylm[l];
-    {
-    const double t1  = lam_lm1m*gen->lamfact[l];
-    const double a_w = (m2-ell)*two_on_s2 - ell*(ell-1.);
-    const double a_x = cth*(ell-1.)*lam_lm;
-    lambda_wx[l][0] = a_w*lam_lm + t1*two_c_on_s2;
-    lambda_wx[l][1] = two_m_on_s2 * (t1-a_x);
-    }
-    }
-  }
-  }
-
-void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin)
-  {
-  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
-    "invalid spin in Ylmgen_recalc_lambda_wx");
-
-  if (!gen->lambda_wx[spin])
-    gen->lambda_wx[spin]=RALLOC(ylmgen_dbl2,gen->lmax+1);
-
-  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1(gen); return; }
-  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2(gen); return; }
-
-  if (!gen->sylm[spin])
-    {
-    gen->sylm[spin]=RALLOC(sylmgen_d,1);
-    sylmgen_init(gen->sylm[spin],gen,spin);
-    }
-  if (gen->lwx_uptodate[spin]) return;
-  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
-  sylmgen_recalc(gen->sylm[spin],gen,gen->ith,gen->lambda_wx[spin],
-    &gen->firstl[spin]);
-  gen->lwx_uptodate[spin] = 1;
-  }
-*/
-#ifdef __SSE2__
-
-static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m)
-  {
-  if ((ith1==gen->ith1) && (ith2==gen->ith2) && (m==gen->m_cur)) return;
-
-  gen->ylm_uptodate_sse2 = 0;
-  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
-
-  gen->ith1 = ith1; gen->ith2 = ith2;
-
-  if (m!=gen->m_cur)
-    {
-    gen->recfac_uptodate = gen->lamfact_uptodate = 0;
-    gen->m_cur = m;
-    }
-  }
-
-
-#define RENORMALIZE \
-  do \
-    { \
-    double lam1a, lam1b, lam2a, lam2b, corfaca, corfacb; \
-    read_v2df (lam_1, &lam1a, &lam1b); read_v2df (lam_2, &lam2a, &lam2b); \
-    read_v2df (corfac, &corfaca, &corfacb); \
-    while (fabs(lam2a)>fbig) \
-      { \
-      lam1a*=fsmall; lam2a*=fsmall; ++scale1; \
-      corfaca = (scale1<0) ? 0. : gen->cf[scale1]; \
-      } \
-    while (fabs(lam2b)>fbig) \
-      { \
-      lam1b*=fsmall; lam2b*=fsmall; ++scale2; \
-      corfacb = (scale2<0) ? 0. : gen->cf[scale2]; \
-      } \
-    lam_1=build_v2df(lam1a,lam1b); lam_2=build_v2df(lam2a,lam2b); \
-    corfac=build_v2df(corfaca,corfacb); \
-    } \
-  while(0)
-#define GETPRE(prea,preb,lv) \
-  { \
-  prea=_mm_mul_pd(_mm_set1_pd(recfac[lv][0]),cth); \
-  preb=_mm_set1_pd(recfac[lv][1]); \
-  }
-#define NEXTSTEP(prea,preb,prec,pred,reca,recb,lv) \
-  { \
-  preb = _mm_mul_pd(preb,reca); \
-  prea = _mm_mul_pd(prea,recb); \
-  prec = _mm_set1_pd(recfac[lv][0]); \
-  pred = _mm_set1_pd(recfac[lv][1]); \
-  reca = _mm_sub_pd(prea,preb); \
-  prec = _mm_mul_pd(cth,prec); \
-  }
-
-static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen)
-  {
-  const double ln2 = 0.6931471805599453094172321214581766;
-
-  v2df lam_1,lam_2,corfac;
-  double logval1,logval2;
-  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
-  v2df eps2=build_v2df(eps,eps);
-  v2df fbig2=build_v2df(fbig,fbig);
-  ylmgen_dbl2 *recfac = gen->recfac;
-  int lmax=gen->lmax;
-  int scale1,scale2,l;
-  int m = gen->m_cur;
-  double cth1=gen->cth[gen->ith1], cth2=gen->cth[gen->ith2];
-  v2df cth=build_v2df(cth1,cth2);
-  v2df *result = gen->ylm_sse2;
-  v2df pre0,pre1,pre2,pre3;
-
-  if (gen->ylm_uptodate_sse2) return;
-  gen->ylm_uptodate_sse2=1;
-
-  if ((m>=gen->m_crit)&&(fabs(cth1)>=gen->cth_crit)&&(fabs(cth2)>=gen->cth_crit))
-    { gen->firstl[0]=gen->lmax+1; return; }
-
-  Ylmgen_recalc_recfac(gen);
-
-  logval1 = logval2 = gen->mfac[m];
-  if (m>0) logval1 += m*gen->logsth[gen->ith1];
-  if (m>0) logval2 += m*gen->logsth[gen->ith2];
-  scale1 = (int) (logval1/large_exponent2)-minscale;
-  scale2 = (int) (logval2/large_exponent2)-minscale;
-  corfac = build_v2df((scale1<0) ? 0. : gen->cf[scale1],
-                      (scale2<0) ? 0. : gen->cf[scale2]);
-
-  lam_1 = _mm_setzero_pd();
-  lam_2 = build_v2df(exp(ln2*(logval1-(scale1+minscale)*large_exponent2)),
-                     exp(ln2*(logval2-(scale2+minscale)*large_exponent2)));
-  if (m&1) lam_2 = _mm_xor_pd (lam_2,V2DF_SIGNMASK); /* negate */
-
-  l=m;
-  if ((scale1<0) && (scale2<0))
-    {
-    GETPRE(pre0,pre1,l)
-    while (1)
-      {
-      if (++l>lmax) break;
-      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
-      if (++l>lmax) break;
-      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
-      if (v2df_any_gt(lam_2,fbig2))
-        {
-        RENORMALIZE;
-        if ((scale1>=0) || (scale2>=0)) break;
-        }
-      }
-    }
-
-  if (l<=lmax)
-    {
-    GETPRE(pre0,pre1,l)
-    while (1)
-      {
-      v2df t1;
-      result[l]=t1=_mm_mul_pd(lam_2,corfac);
-      if (v2df_any_gt(t1,eps2))
-        break;
-      if (++l>lmax) break;
-      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
-
-      result[l]=t1=_mm_mul_pd(lam_1,corfac);
-      if (v2df_any_gt(t1,eps2))
-        { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
-      if (++l>lmax) break;
-      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
-
-      if (v2df_any_gt(lam_2,fbig2))
-        RENORMALIZE;
-      }
-    }
-
-  gen->firstl[0]=l;
-  if (l>lmax)
-    {
-    gen->m_crit=m;
-    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
-    return;
-    }
-
-  GETPRE(pre0,pre1,l)
-  while (1)
-    {
-    v2df t1;
-    result[l]=t1=_mm_mul_pd(lam_2,corfac);
-    if (v2df_all_ge(t1,eps2))
-      break;
-    if (++l>lmax) return;
-    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
-
-    result[l]=t1=_mm_mul_pd(lam_1,corfac);
-    if (v2df_all_ge(t1,eps2))
-      { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
-    if (++l>lmax) return;
-    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
-
-    if (v2df_any_gt(lam_2,fbig2))
-      RENORMALIZE;
-    }
-
-  lam_1 = _mm_mul_pd (lam_1,corfac);
-  lam_2 = _mm_mul_pd (lam_2,corfac);
-
-  GETPRE(pre0,pre1,l)
-  for(;l<lmax-2;l+=2)
-    {
-    result[l]=lam_2;
-    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l+1)
-    result[l+1]=lam_1;
-    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l+2)
-    }
-
-  while (1)
-    {
-    result[l]=lam_2;
-    if (++l>lmax) break;
-    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
-    result[l] = lam_1;
-    if (++l>lmax) break;
-    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
-    }
-  }
-
-/*static void Ylmgen_recalc_lambda_wx1_sse2 (Ylmgen_C *gen)
-  {
-  if (gen->lwx_uptodate_sse2[1]) return;
-  Ylmgen_recalc_Ylm_sse2(gen);
-  gen->firstl[1] = gen->firstl[0];
-  if (gen->firstl[1]>gen->lmax) return;
-  Ylmgen_recalc_lamfact(gen);
-  gen->lwx_uptodate_sse2[1] = 1;
-
-  {
-  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
-  v2df xsth=build_v2df(1./gen->sth[gen->ith1],1./gen->sth[gen->ith2]);
-  v2df m=build_v2df(gen->m_cur,gen->m_cur);
-  v2df m_on_sth = _mm_mul_pd(m,xsth);
-  v2df lam_lm=_mm_setzero_pd();
-  v2df2 *lambda_wx = gen->lambda_wx_sse2[1];
-  int l;
-  v2df ell=build_v2df(gen->firstl[1],gen->firstl[1]);
-  v2df uno=_mm_set1_pd(1.);
-  for (l=gen->firstl[1]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
-    {
-    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
-    v2df lam_lm1m=lam_lm;
-    lam_lm=gen->ylm_sse2[l];
-    lambda_wx[l].a = _mm_mul_pd(xsth,_mm_sub_pd(_mm_mul_pd(lamfact,lam_lm1m),
-                     _mm_mul_pd(_mm_mul_pd(ell,cth),lam_lm)));
-    lambda_wx[l].b = _mm_mul_pd(m_on_sth,lam_lm);
-    }
-  }
-  }
-
-static void Ylmgen_recalc_lambda_wx2_sse2 (Ylmgen_C *gen)
-  {
-  if (gen->lwx_uptodate_sse2[2]) return;
-  Ylmgen_recalc_Ylm_sse2(gen);
-  gen->firstl[2] = gen->firstl[0];
-  if (gen->firstl[2]>gen->lmax) return;
-  Ylmgen_recalc_lamfact(gen);
-  gen->lwx_uptodate_sse2[2] = 1;
-
-  {
-  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
-  v2df sth=build_v2df(gen->sth[gen->ith1],gen->sth[gen->ith2]);
-  v2df m=build_v2df(gen->m_cur,gen->m_cur);
-  v2df uno=_mm_set1_pd(1.);
-  v2df one_on_s2 = _mm_div_pd(uno,_mm_mul_pd(sth,sth));
-  v2df two_on_s2 = _mm_mul_pd(_mm_set1_pd(2.),one_on_s2);
-  v2df two_c_on_s2 = _mm_mul_pd(cth,two_on_s2);
-  v2df m2 = _mm_mul_pd(m,m);
-  v2df two_m_on_s2 = _mm_mul_pd(m,two_on_s2);
-  v2df lam_lm=_mm_setzero_pd();
-  v2df2 *lambda_wx = gen->lambda_wx_sse2[2];
-  int l;
-  v2df ell=build_v2df(gen->firstl[2],gen->firstl[2]);
-  for (l=gen->firstl[2]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
-    {
-    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
-    v2df lam_lm1m=lam_lm;
-    lam_lm=gen->ylm_sse2[l];
-    {
-    const v2df t1  = _mm_mul_pd(lam_lm1m,lamfact);
-    const v2df ellm1 = _mm_sub_pd(ell,uno);
-    const v2df a_w = _mm_sub_pd
-      (_mm_mul_pd(_mm_sub_pd(m2,ell),two_on_s2),_mm_mul_pd(ell,ellm1));
-    const v2df a_x = _mm_mul_pd(_mm_mul_pd(cth,ellm1),lam_lm);
-    lambda_wx[l].a =
-      _mm_add_pd(_mm_mul_pd(a_w,lam_lm),_mm_mul_pd(t1,two_c_on_s2));
-    lambda_wx[l].b = _mm_mul_pd(two_m_on_s2,_mm_sub_pd(t1,a_x));
-    }
-    }
-  }
-  }*/
-
-/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin)
-  {
-  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
-    "invalid spin in Ylmgen_recalc_lambda_wx_sse2");
-
-  if (!gen->lambda_wx_sse2[spin])
-    gen->lambda_wx_sse2[spin]=RALLOC(v2df2,gen->lmax+1);
-
-  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1_sse2(gen); return; }
-  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2_sse2(gen); return; }
-
-  if (!gen->sylm[spin])
-    {
-    gen->sylm[spin]=RALLOC(sylmgen_d,1);
-    sylmgen_init(gen->sylm[spin],gen,spin);
-    }
-  if (gen->lwx_uptodate_sse2[spin]) return;
-  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
-  sylmgen_recalc_sse2(gen->sylm[spin],gen,gen->ith1,gen->ith2,
-    gen->lambda_wx_sse2[spin],&gen->firstl[spin]);
-  gen->lwx_uptodate_sse2[spin] = 1;
-  }*/
-
-#endif /* __SSE2__ */
-
-/*
-double *Ylmgen_get_norm (int lmax, int spin, int spinrec)
-  {
-  const double pi = 3.141592653589793238462643383279502884197;
-  double *res=RALLOC(double,lmax+1);
-  int l;
-  double spinsign;
-  / sign convention for H=1 (LensPix paper) /
-#if 1
-  spinsign = (spin>0) ? -1.0 : 1.0;
-#else
-  spinsign = 1.0;
-#endif
-
-  if (spin==0)
-    {
-    for (l=0; l<=lmax; ++l)
-      res[l]=1.;
-    return res;
-    }
-
-  if ((!spinrec) || (spin>=3))
-    {
-    spinsign = (spin&1) ? -spinsign : spinsign;
-    for (l=0; l<=lmax; ++l)
-      res[l] = (l<spin) ? 0. : spinsign*0.5*sqrt((2*l+1)/(4*pi));
-    return res;
-    }
-
-  if (spin==1)
-    {
-    for (l=0; l<=lmax; ++l)
-      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+1.)*l));
-    return res;
-    }
-
-  if (spin==2)
-    {
-    for (l=0; l<=lmax; ++l)
-      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+2.)*(l+1.)*l*(l-1.)));
-    return res;
-    }
-
-  UTIL_FAIL ("error in Ylmgen_get_norm");
-  return NULL;
-  }
-*/
-
-
-
-/*
-New high-level wrapper
-*/
-#include "sharp_legendre_table.h"
-#include <stdio.h>
-
-void sharp_normalized_associated_legendre_table(
-  ptrdiff_t m,
-  int spin,
-  ptrdiff_t lmax,
-  ptrdiff_t ntheta,
-  double *theta,
-  ptrdiff_t theta_stride,
-  ptrdiff_t l_stride,
-  ptrdiff_t spin_stride,
-  double *out
-) {
-    if (spin != 0) UTIL_FAIL ("sharp_normalized_associated_legendre_table: only spin=0 has been implemented so far");
-
-    Ylmgen_C ctx;
-    ptrdiff_t itheta, l, lmin;
-
-    Ylmgen_init(&ctx, lmax, lmax, 0, 0, 1e-300);
-    Ylmgen_set_theta(&ctx, theta, ntheta);
-
-    itheta = 0;
-    #ifdef __SSE2__
-    for (; itheta < ntheta - 1; itheta += 2) {
-        Ylmgen_prepare_sse2(&ctx, itheta, itheta + 1, m);
-        Ylmgen_recalc_Ylm_sse2(&ctx);
-        lmin = IMIN(*ctx.firstl, lmax + 1);
-        for (l = m; l < lmin; ++l) {
-            out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
-            out[(itheta + 1) * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
-        }
-        for (l = IMAX(lmin, m); l <= lmax; ++l) {
-            double v1, v2;
-            read_v2df(ctx.ylm_sse2[l], &v1, &v2);
-            out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = v1;
-            out[(itheta + 1) * theta_stride + (l - m) * l_stride + spin * spin_stride] = v2;
-        }
-    }
-    #endif
-    for (; itheta < ntheta; itheta += 1) {
-        Ylmgen_prepare(&ctx, itheta, m);
-        Ylmgen_recalc_Ylm(&ctx);
-        lmin = IMIN(*ctx.firstl, lmax + 1);
-        for (l = m; l < lmin; ++l) {
-            out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
-        }
-        for (l = IMAX(lmin, m); l <= lmax; ++l) {
-            out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = ctx.ylm[l];
-        }
-    }
-    Ylmgen_destroy(&ctx);
-}
diff --git a/libsharp/sharp_legendre_table.h b/libsharp/sharp_legendre_table.h
deleted file mode 100644
index de4cdd6..0000000
--- a/libsharp/sharp_legendre_table.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre_table.h
- *  Interface for computing tables of the normalized associated Legendre transform
- *
- *  Copyright (C) 2017 Dag Sverre Seljebotn
- *  \author Dag Sverre Seljebotn
- *
- *  Note: This code was mainly copied from libpsht; only a small high-level wrapper added
- */
-
-#ifndef SHARP_LEGENDRE_TABLE_H
-#define SHARP_LEGENDRE_TABLE_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef NO_LEGENDRE_TABLE
-
-
-/*! Returns a table of the normalized associated Legendre polynomials. m is a single
-    fixed argument and a table for multiple l and cos(theta) is provided.
-    (Internally, sin(theta) is also used for part of the computation, making theta
-    the most convenient argument.)
-
-    NOTE: Support for spin-weighted Legendre functions is on the TODO-list. Only spin=0
-    is supported now.
-
-    \param m The m-value to compute a table for; must be >= 0
-    \param spin The spin parameter; pass 0 for the regular associated Legendre functions.
-                NOTE: This is present for future compatability, currently only 0 is supported.
-    \param lmax A table will be provided for l = m .. lmax
-    \param ntheta How many theta values to evaluate for
-    \param theta Contiguous 1D array of theta values
-    \param theta_stride See below
-    \param l_stride See below
-    \param spin_stride See below. "ispin" will always be 0 if spin==0, or 0 for positive spin
-                       and 1 for the corresponding negative spin otherwise.
-    \param out Contiguous 3D array that will receive the output. Each output entry
-               is assigned to out[itheta * theta_stride + (l - m) * l_stride + ispin * spin_stride].
- */
-void sharp_normalized_associated_legendre_table(
-  ptrdiff_t m,
-  int spin,
-  ptrdiff_t lmax,
-  ptrdiff_t ntheta,
-  /* contiguous 1D array of theta values to compute for,
-     contains ntheta values */
-  double *theta,
-  /* contiguous 2D array, in "theta-major ordering". Has `ntheta`
-     rows and `ncols` columns. Indexed as out[itheta * ncols + (l - m)].
-     If `ncols > lmax - m` then those entries are not accessed.
-  */
-  ptrdiff_t theta_stride,
-  ptrdiff_t l_stride,
-  ptrdiff_t spin_stride,
-  double *out
-);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/m4/m4_ax_create_pkgconfig_info.m4 b/m4/m4_ax_create_pkgconfig_info.m4
new file mode 100644
index 0000000..308e64f
--- /dev/null
+++ b/m4/m4_ax_create_pkgconfig_info.m4
@@ -0,0 +1,351 @@
+# ============================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_create_pkgconfig_info.html
+# ============================================================================
+#
+# SYNOPSIS
+#
+#   AX_CREATE_PKGCONFIG_INFO [(outputfile, [requires [,libs [,summary [,cflags [, ldflags]]]]])]
+#
+# DESCRIPTION
+#
+#   Defaults:
+#
+#     $1 = $PACKAGE_NAME.pc
+#     $2 = (empty)
+#     $3 = $PACKAGE_LIBS $LIBS (as set at that point in configure.ac)
+#     $4 = $PACKAGE_SUMMARY (or $1 Library)
+#     $5 = $PACKAGE_CFLAGS (as set at the point in configure.ac)
+#     $6 = $PACKAGE_LDFLAGS (as set at the point in configure.ac)
+#
+#     PACKAGE_NAME defaults to $PACKAGE if not set.
+#     PACKAGE_LIBS defaults to -l$PACKAGE_NAME if not set.
+#
+#   The resulting file is called $PACKAGE.pc.in / $PACKAGE.pc
+#
+#   You will find this macro most useful in conjunction with
+#   ax_spec_defaults that can read good initializers from the .spec file. In
+#   consequence, most of the generatable installable stuff can be made from
+#   information being updated in a single place for the whole project.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2008 Sven Verdoolaege <skimo@kotnet.org>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 12
+
+AC_DEFUN([AX_CREATE_PKGCONFIG_INFO],[dnl
+AS_VAR_PUSHDEF([PKGCONFIG_suffix],[ax_create_pkgconfig_suffix])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_libdir],[ax_create_pkgconfig_libdir])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_libfile],[ax_create_pkgconfig_libfile])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_libname],[ax_create_pkgconfig_libname])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_version],[ax_create_pkgconfig_version])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_description],[ax_create_pkgconfig_description])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_requires],[ax_create_pkgconfig_requires])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_pkglibs],[ax_create_pkgconfig_pkglibs])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_libs],[ax_create_pkgconfig_libs])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_ldflags],[ax_create_pkgconfig_ldflags])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_cppflags],[ax_create_pkgconfig_cppflags])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_generate],[ax_create_pkgconfig_generate])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_src_libdir],[ax_create_pkgconfig_src_libdir])dnl
+AS_VAR_PUSHDEF([PKGCONFIG_src_headers],[ax_create_pkgconfig_src_headers])dnl
+
+# we need the expanded forms...
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+AC_MSG_CHECKING(our pkgconfig libname)
+test ".$PKGCONFIG_libname" != "." || \
+PKGCONFIG_libname="ifelse($1,,${PACKAGE_NAME},`basename $1 .pc`)"
+test ".$PKGCONFIG_libname" != "." || \
+PKGCONFIG_libname="$PACKAGE"
+PKGCONFIG_libname=`eval echo "$PKGCONFIG_libname"`
+PKGCONFIG_libname=`eval echo "$PKGCONFIG_libname"`
+AC_MSG_RESULT($PKGCONFIG_libname)
+
+AC_MSG_CHECKING(our pkgconfig version)
+test ".$PKGCONFIG_version" != "." || \
+PKGCONFIG_version="${PACKAGE_VERSION}"
+test ".$PKGCONFIG_version" != "." || \
+PKGCONFIG_version="$VERSION"
+PKGCONFIG_version=`eval echo "$PKGCONFIG_version"`
+PKGCONFIG_version=`eval echo "$PKGCONFIG_version"`
+AC_MSG_RESULT($PKGCONFIG_version)
+
+AC_MSG_CHECKING(our pkgconfig_libdir)
+test ".$pkgconfig_libdir" = "." && \
+pkgconfig_libdir='${libdir}/pkgconfig'
+PKGCONFIG_libdir=`eval echo "$pkgconfig_libdir"`
+PKGCONFIG_libdir=`eval echo "$PKGCONFIG_libdir"`
+PKGCONFIG_libdir=`eval echo "$PKGCONFIG_libdir"`
+AC_MSG_RESULT($pkgconfig_libdir)
+test "$pkgconfig_libdir" != "$PKGCONFIG_libdir" && (
+AC_MSG_RESULT(expanded our pkgconfig_libdir... $PKGCONFIG_libdir))
+AC_SUBST([pkgconfig_libdir])
+
+AC_MSG_CHECKING(our pkgconfig_libfile)
+test ".$pkgconfig_libfile" != "." || \
+pkgconfig_libfile="ifelse($1,,$PKGCONFIG_libname.pc,`basename $1`)"
+PKGCONFIG_libfile=`eval echo "$pkgconfig_libfile"`
+PKGCONFIG_libfile=`eval echo "$PKGCONFIG_libfile"`
+AC_MSG_RESULT($pkgconfig_libfile)
+test "$pkgconfig_libfile" != "$PKGCONFIG_libfile" && (
+AC_MSG_RESULT(expanded our pkgconfig_libfile... $PKGCONFIG_libfile))
+AC_SUBST([pkgconfig_libfile])
+
+AC_MSG_CHECKING(our package / suffix)
+PKGCONFIG_suffix="$program_suffix"
+test ".$PKGCONFIG_suffix" != .NONE || PKGCONFIG_suffix=""
+AC_MSG_RESULT(${PACKAGE_NAME} / ${PKGCONFIG_suffix})
+
+AC_MSG_CHECKING(our pkgconfig description)
+PKGCONFIG_description="ifelse($4,,$PACKAGE_SUMMARY,$4)"
+test ".$PKGCONFIG_description" != "." || \
+PKGCONFIG_description="$PKGCONFIG_libname Library"
+PKGCONFIG_description=`eval echo "$PKGCONFIG_description"`
+PKGCONFIG_description=`eval echo "$PKGCONFIG_description"`
+AC_MSG_RESULT($PKGCONFIG_description)
+
+AC_MSG_CHECKING(our pkgconfig requires)
+PKGCONFIG_requires="ifelse($2,,$PACKAGE_REQUIRES,$2)"
+PKGCONFIG_requires=`eval echo "$PKGCONFIG_requires"`
+PKGCONFIG_requires=`eval echo "$PKGCONFIG_requires"`
+AC_MSG_RESULT($PKGCONFIG_requires)
+
+AC_MSG_CHECKING(our pkgconfig ext libs)
+PKGCONFIG_pkglibs="$PACKAGE_LIBS"
+test ".$PKGCONFIG_pkglibs" != "." || PKGCONFIG_pkglibs="-l$PKGCONFIG_libname"
+PKGCONFIG_libs="ifelse($3,,$PKGCONFIG_pkglibs $LIBS,$3)"
+PKGCONFIG_libs=`eval echo "$PKGCONFIG_libs"`
+PKGCONFIG_libs=`eval echo "$PKGCONFIG_libs"`
+AC_MSG_RESULT($PKGCONFIG_libs)
+
+AC_MSG_CHECKING(our pkgconfig cppflags)
+PKGCONFIG_cppflags="ifelse($5,,$PACKAGE_CFLAGS,$5)"
+PKGCONFIG_cppflags=`eval echo "$PKGCONFIG_cppflags"`
+PKGCONFIG_cppflags=`eval echo "$PKGCONFIG_cppflags"`
+AC_MSG_RESULT($PKGCONFIG_cppflags)
+
+AC_MSG_CHECKING(our pkgconfig ldflags)
+PKGCONFIG_ldflags="ifelse($6,,$PACKAGE_LDFLAGS,$6)" # sixth macro argument is ldflags; fall back to PACKAGE_LDFLAGS
+PKGCONFIG_ldflags=`eval echo "$PKGCONFIG_ldflags"`
+PKGCONFIG_ldflags=`eval echo "$PKGCONFIG_ldflags"`
+AC_MSG_RESULT($PKGCONFIG_ldflags)
+
+test ".$PKGCONFIG_generate" != "." || \
+PKGCONFIG_generate="ifelse($1,,$PKGCONFIG_libname.pc,$1)"
+PKGCONFIG_generate=`eval echo "$PKGCONFIG_generate"`
+PKGCONFIG_generate=`eval echo "$PKGCONFIG_generate"`
+test "$pkgconfig_libfile" != "$PKGCONFIG_generate" && (
+AC_MSG_RESULT(generate the pkgconfig later... $PKGCONFIG_generate))
+
+if test ".$PKGCONFIG_src_libdir" = "." ; then
+PKGCONFIG_src_libdir=`pwd`
+PKGCONFIG_src_libdir=`AS_DIRNAME("$PKGCONFIG_src_libdir/$PKGCONFIG_generate")`
+test ! -d $PKGCONFIG_src_libdir/src || \
+PKGCONFIG_src_libdir="$PKGCONFIG_src_libdir/src"
+case ".$objdir" in
+*libs) PKGCONFIG_src_libdir="$PKGCONFIG_src_libdir/$objdir" ;; esac
+AC_MSG_RESULT(noninstalled pkgconfig -L $PKGCONFIG_src_libdir)
+fi
+
+if test ".$PKGCONFIG_src_headers" = "." ; then
+PKGCONFIG_src_headers=`pwd`
+v="$ac_top_srcdir" ;
+test ".$v" != "." || v="$ax_spec_dir"
+test ".$v" != "." || v="$srcdir"
+case "$v" in /*) PKGCONFIG_src_headers="" ;; esac
+PKGCONFIG_src_headers=`AS_DIRNAME("$PKGCONFIG_src_headers/$v/x")`
+test ! -d $PKGCONFIG_src_headers/incl[]ude || \
+PKGCONFIG_src_headers="$PKGCONFIG_src_headers/incl[]ude"
+AC_MSG_RESULT(noninstalled pkgconfig -I $PKGCONFIG_src_headers)
+fi
+
+
+dnl AC_CONFIG_COMMANDS crap disallows to use $PKGCONFIG_libfile here...
+AC_CONFIG_COMMANDS([$ax_create_pkgconfig_generate],[
+pkgconfig_generate="$ax_create_pkgconfig_generate"
+if test ! -f "$pkgconfig_generate.in"
+then generate="true"
+elif grep ' generated by configure ' $pkgconfig_generate.in >/dev/null
+then generate="true"
+else generate="false";
+fi
+if $generate ; then
+AC_MSG_NOTICE(creating $pkgconfig_generate.in)
+cat > $pkgconfig_generate.in <<AXEOF
+# generated by configure / remove this line to disable regeneration
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+bindir=@bindir@
+libdir=@libdir@
+datarootdir=@datarootdir@
+datadir=@datadir@
+sysconfdir=@sysconfdir@
+includedir=@includedir@
+package=@PACKAGE@
+suffix=@suffix@
+
+Name: @PACKAGE_NAME@
+Description: @PACKAGE_DESCRIPTION@
+Version: @PACKAGE_VERSION@
+Requires: @PACKAGE_REQUIRES@
+Libs: -L\${libdir} @LDFLAGS@ @LIBS@
+Cflags: -I\${includedir} @CPPFLAGS@
+AXEOF
+fi # DONE generate $pkgconfig_generate.in
+AC_MSG_NOTICE(creating $pkgconfig_generate)
+cat >conftest.sed <<AXEOF
+s|@prefix@|${pkgconfig_prefix}|
+s|@exec_prefix@|${pkgconfig_execprefix}|
+s|@bindir@|${pkgconfig_bindir}|
+s|@libdir@|${pkgconfig_libdir}|
+s|@datarootdir@|${pkgconfig_datarootdir}|
+s|@datadir@|${pkgconfig_datadir}|
+s|@sysconfdir@|${pkgconfig_sysconfdir}|
+s|@includedir@|${pkgconfig_includedir}|
+s|@suffix@|${pkgconfig_suffix}|
+s|@PACKAGE@|${pkgconfig_package}|
+s|@PACKAGE_NAME@|${pkgconfig_libname}|
+s|@PACKAGE_DESCRIPTION@|${pkgconfig_description}|
+s|@PACKAGE_VERSION@|${pkgconfig_version}|
+s|@PACKAGE_REQUIRES@|${pkgconfig_requires}|
+s|@LIBS@|${pkgconfig_libs}|
+s|@LDFLAGS@|${pkgconfig_ldflags}|
+s|@CPPFLAGS@|${pkgconfig_cppflags}|
+AXEOF
+sed -f conftest.sed  $pkgconfig_generate.in > $pkgconfig_generate
+if test ! -s $pkgconfig_generate ; then
+    AC_MSG_ERROR([$pkgconfig_generate is empty])
+fi ; rm conftest.sed # DONE generate $pkgconfig_generate
+pkgconfig_uninstalled=`echo $pkgconfig_generate |sed 's/.pc$/-uninstalled.pc/'`
+AC_MSG_NOTICE(creating $pkgconfig_uninstalled)
+cat >conftest.sed <<AXEOF
+s|@prefix@|${pkgconfig_prefix}|
+s|@exec_prefix@|${pkgconfig_execprefix}|
+s|@bindir@|${pkgconfig_bindir}|
+s|@libdir@|${pkgconfig_src_libdir}|
+s|@datarootdir@|${pkgconfig_datarootdir}|
+s|@datadir@|${pkgconfig_datadir}|
+s|@sysconfdir@|${pkgconfig_sysconfdir}|
+s|@includedir@|${pkgconfig_src_headers}|
+s|@suffix@|${pkgconfig_suffix}|
+s|@PACKAGE@|${pkgconfig_package}|
+s|@PACKAGE_NAME@|${pkgconfig_libname}|
+s|@PACKAGE_DESCRIPTION@|${pkgconfig_description}|
+s|@PACKAGE_VERSION@|${pkgconfig_version}|
+s|@PACKAGE_REQUIRES@|${pkgconfig_requires}|
+s|@LIBS@|${pkgconfig_libs}|
+s|@LDFLAGS@|${pkgconfig_ldflags}|
+s|@CPPFLAGS@|${pkgconfig_cppflags}|
+AXEOF
+sed -f conftest.sed $pkgconfig_generate.in > $pkgconfig_uninstalled
+if test ! -s $pkgconfig_uninstalled ; then
+    AC_MSG_ERROR([$pkgconfig_uninstalled is empty])
+fi ; rm conftest.sed # DONE generate $pkgconfig_uninstalled
+           pkgconfig_requires_add=`echo ${pkgconfig_requires}`
+if test ".$pkgconfig_requires_add" != "." ; then
+           pkgconfig_requires_add="pkg-config $pkgconfig_requires_add"
+    else   pkgconfig_requires_add=":" ; fi
+pkgconfig_uninstalled=`echo $pkgconfig_generate |sed 's/.pc$/-uninstalled.sh/'`
+AC_MSG_NOTICE(creating $pkgconfig_uninstalled)
+cat >conftest.sed <<AXEOF
+s|@prefix@|\"${pkgconfig_prefix}\"|
+s|@exec_prefix@|\"${pkgconfig_execprefix}\"|
+s|@bindir@|\"${pkgconfig_bindir}\"|
+s|@libdir@|\"${pkgconfig_src_libdir}\"|
+s|@datarootdir@|\"${pkgconfig_datarootdir}\"|
+s|@datadir@|\"${pkgconfig_datadir}\"|
+s|@sysconfdir@|\"${pkgconfig_sysconfdir}\"|
+s|@includedir@|\"${pkgconfig_src_headers}\"|
+s|@suffix@|\"${pkgconfig_suffix}\"|
+s|@PACKAGE@|\"${pkgconfig_package}\"|
+s|@PACKAGE_NAME@|\"${pkgconfig_libname}\"|
+s|@PACKAGE_DESCRIPTION@|\"${pkgconfig_description}\"|
+s|@PACKAGE_VERSION@|\"${pkgconfig_version}\"|
+s|@PACKAGE_REQUIRES@|\"${pkgconfig_requires}\"|
+s|@LIBS@|\"${pkgconfig_libs}\"|
+s|@LDFLAGS@|\"${pkgconfig_ldflags}\"|
+s|@CPPFLAGS@|\"${pkgconfig_cppflags}\"|
+s>Name:>for option\\; do case \"\$option\" in --list-all|--name) echo >
+s>Description: *>\\;\\; --help) pkg-config --help \\; echo Buildscript Of >
+s>Version: *>\\;\\; --modversion|--version) echo >
+s>Requires:>\\;\\; --requires) echo $pkgconfig_requires_add>
+s>Libs: *>\\;\\; --libs) echo >
+s>Cflags: *>\\;\\; --cflags) echo >
+/--libs)/a\\
+       $pkgconfig_requires_add
+/--cflags)/a\\
+       $pkgconfig_requires_add\\
+;; --variable=*) eval echo '\$'\`echo \$option | sed -e 's/.*=//'\`\\
+;; --uninstalled) exit 0 \\
+;; *) ;; esac done
+AXEOF
+sed -f conftest.sed  $pkgconfig_generate.in > $pkgconfig_uninstalled
+if test ! -s $pkgconfig_uninstalled ; then
+    AC_MSG_ERROR([$pkgconfig_uninstalled is empty])
+fi ; rm conftest.sed # DONE generate $pkgconfig_uninstalled
+],[
+dnl AC_CONFIG_COMMANDS crap, the AS_PUSHVAR defines are invalid here...
+ax_create_pkgconfig_generate="$ax_create_pkgconfig_generate"
+pkgconfig_prefix='$prefix'
+pkgconfig_execprefix='$exec_prefix'
+pkgconfig_bindir='$bindir'
+pkgconfig_libdir='$libdir'
+pkgconfig_includedir='$includedir'
+pkgconfig_datarootdir='$datarootdir'
+pkgconfig_datadir='$datadir'
+pkgconfig_sysconfdir='$sysconfdir'
+pkgconfig_suffix='$ax_create_pkgconfig_suffix'
+pkgconfig_package='$PACKAGE_NAME'
+pkgconfig_libname='$ax_create_pkgconfig_libname'
+pkgconfig_description='$ax_create_pkgconfig_description'
+pkgconfig_version='$ax_create_pkgconfig_version'
+pkgconfig_requires='$ax_create_pkgconfig_requires'
+pkgconfig_libs='$ax_create_pkgconfig_libs'
+pkgconfig_ldflags='$ax_create_pkgconfig_ldflags'
+pkgconfig_cppflags='$ax_create_pkgconfig_cppflags'
+pkgconfig_src_libdir='$ax_create_pkgconfig_src_libdir'
+pkgconfig_src_headers='$ax_create_pkgconfig_src_headers'
+])dnl
+AS_VAR_POPDEF([PKGCONFIG_suffix])dnl
+AS_VAR_POPDEF([PKGCONFIG_libdir])dnl
+AS_VAR_POPDEF([PKGCONFIG_libfile])dnl
+AS_VAR_POPDEF([PKGCONFIG_libname])dnl
+AS_VAR_POPDEF([PKGCONFIG_version])dnl
+AS_VAR_POPDEF([PKGCONFIG_description])dnl
+AS_VAR_POPDEF([PKGCONFIG_requires])dnl
+AS_VAR_POPDEF([PKGCONFIG_pkglibs])dnl
+AS_VAR_POPDEF([PKGCONFIG_libs])dnl
+AS_VAR_POPDEF([PKGCONFIG_ldflags])dnl
+AS_VAR_POPDEF([PKGCONFIG_cppflags])dnl
+AS_VAR_POPDEF([PKGCONFIG_generate])dnl
+AS_VAR_POPDEF([PKGCONFIG_src_libdir])dnl
+AS_VAR_POPDEF([PKGCONFIG_src_headers])dnl
+])
diff --git a/python/fake_pyrex/Pyrex/Distutils/__init__.py b/python/fake_pyrex/Pyrex/Distutils/__init__.py
deleted file mode 100644
index 51c8e16..0000000
--- a/python/fake_pyrex/Pyrex/Distutils/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# work around broken setuptools monkey patching
diff --git a/python/fake_pyrex/Pyrex/Distutils/build_ext.py b/python/fake_pyrex/Pyrex/Distutils/build_ext.py
deleted file mode 100644
index 4f846f6..0000000
--- a/python/fake_pyrex/Pyrex/Distutils/build_ext.py
+++ /dev/null
@@ -1 +0,0 @@
-build_ext = "yes, it's there!"
diff --git a/python/fake_pyrex/Pyrex/__init__.py b/python/fake_pyrex/Pyrex/__init__.py
deleted file mode 100644
index 51c8e16..0000000
--- a/python/fake_pyrex/Pyrex/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# work around broken setuptools monkey patching
diff --git a/python/fake_pyrex/README b/python/fake_pyrex/README
deleted file mode 100644
index cf3f3ff..0000000
--- a/python/fake_pyrex/README
+++ /dev/null
@@ -1,2 +0,0 @@
-This directory is here to fool setuptools into building .pyx files
-even if Pyrex is not installed. See ../setup.py.
\ No newline at end of file
diff --git a/python/libsharp/__init__.py b/python/libsharp/__init__.py
deleted file mode 100644
index dd0fa41..0000000
--- a/python/libsharp/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .libsharp import *
diff --git a/python/libsharp/libsharp.pxd b/python/libsharp/libsharp.pxd
deleted file mode 100644
index 27a4608..0000000
--- a/python/libsharp/libsharp.pxd
+++ /dev/null
@@ -1,92 +0,0 @@
-cdef extern from "sharp.h":
-
-    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
-                                    float *out, ptrdiff_t nx)
-    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
-                                  double *out, ptrdiff_t nx)
-    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
-    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
-    void sharp_legendre_roots(int n, double *x, double *w)
-
-    # sharp_lowlevel.h
-    ctypedef struct sharp_alm_info:
-      # Maximum \a l index of the array
-      int lmax
-      # Number of different \a m values in this object
-      int nm
-      # Array with \a nm entries containing the individual m values
-      int *mval
-      # Combination of flags from sharp_almflags
-      int flags
-      # Array with \a nm entries containing the (hypothetical) indices of
-      #   the coefficients with quantum numbers 0,\a mval[i]
-      long *mvstart
-      # Stride between a_lm and a_(l+1),m
-      long stride
-
-    ctypedef struct sharp_geom_info:
-        pass
-
-    void sharp_make_alm_info (int lmax, int mmax, int stride,
-                             ptrdiff_t *mvstart, sharp_alm_info **alm_info)
-
-    void sharp_make_geom_info (int nrings, int *nph, ptrdiff_t *ofs,
-                               int *stride, double *phi0, double *theta,
-                               double *wgt, sharp_geom_info **geom_info)
-
-    void sharp_destroy_alm_info(sharp_alm_info *info)
-    void sharp_destroy_geom_info(sharp_geom_info *info)
-
-    ptrdiff_t sharp_map_size(sharp_geom_info *info)
-    ptrdiff_t sharp_alm_count(sharp_alm_info *self)
-
-
-    ctypedef enum sharp_jobtype:
-        SHARP_YtW
-        SHARP_Yt
-        SHARP_WY
-        SHARP_Y
-
-    ctypedef enum:
-        SHARP_DP
-        SHARP_ADD
-
-    void sharp_execute(sharp_jobtype type_,
-                       int spin,
-                       void *alm,
-                       void *map,
-                       sharp_geom_info *geom_info,
-                       sharp_alm_info *alm_info,
-                       int ntrans,
-                       int flags,
-                       double *time,
-                       unsigned long long *opcnt) nogil
-
-    ctypedef enum:
-        SHARP_ERROR_NO_MPI
-
-    int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
-        void *alm, void *map, sharp_geom_info *geom_info,
-        sharp_alm_info *alm_info, int ntrans, int flags, double *time,
-        unsigned long long *opcnt) nogil
-
-    void sharp_normalized_associated_legendre_table(int m, int spin, int lmax, int ntheta,
-        double *theta, int theta_stride, int l_stride, int spin_stride, double *out) nogil
-
-
-cdef extern from "sharp_geomhelpers.h":
-    void sharp_make_subset_healpix_geom_info(
-        int nside, int stride, int nrings,
-        int *rings, double *weight, sharp_geom_info **geom_info)
-    void sharp_make_gauss_geom_info(
-        int nrings, int nphi, double phi0,
-        int stride_lon, int stride_lat, sharp_geom_info **geom_info)
-
-cdef extern from "sharp_almhelpers.h":
-    void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
-        sharp_alm_info **alm_info)
-    void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
-        sharp_alm_info **alm_info)
-    void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
-        int nm, const int *ms, sharp_alm_info **alm_info)
-
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
deleted file mode 100644
index dfefc93..0000000
--- a/python/libsharp/libsharp.pyx
+++ /dev/null
@@ -1,324 +0,0 @@
-import numpy as np
-cimport numpy as np
-cimport cython
-
-__all__ = ['legendre_transform', 'legendre_roots', 'sht', 'synthesis', 'adjoint_synthesis',
-           'analysis', 'adjoint_analysis', 'healpix_grid', 'triangular_order', 'rectangular_order',
-           'packed_real_order', 'normalized_associated_legendre_table']
-
-
-def legendre_transform(x, bl, out=None):
-    if out is None:
-        out = np.empty_like(x)
-    if out.shape[0] == 0:
-        return out
-    elif x.dtype == np.float64:
-        if bl.dtype != np.float64:
-            bl = bl.astype(np.float64)
-        return _legendre_transform(x, bl, out=out)
-    elif x.dtype == np.float32:
-        if bl.dtype != np.float32:
-            bl = bl.astype(np.float32)
-        return _legendre_transform_s(x, bl, out=out)
-    else:
-        raise ValueError("unsupported dtype")
-
-
-def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
-    if out.shape[0] != x.shape[0]:
-        raise ValueError('x and out must have same shape')
-    sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
-    return np.asarray(out)
-
-
-def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
-    if out.shape[0] != x.shape[0]:
-        raise ValueError('x and out must have same shape')
-    sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
-    return np.asarray(out)
-
-
-def legendre_roots(n):
-    x = np.empty(n, np.double)
-    w = np.empty(n, np.double)
-    cdef double[::1] x_buf = x, w_buf = w
-    if not (x_buf.shape[0] == w_buf.shape[0] == n):
-        raise AssertionError()
-    if n > 0:
-        sharp_legendre_roots(n, &x_buf[0], &w_buf[0])
-    return x, w
-
-
-JOBTYPE_TO_CONST = {
-    'Y': SHARP_Y,
-    'Yt': SHARP_Yt,
-    'WY': SHARP_WY,
-    'YtW': SHARP_YtW
-}
-
-def sht(jobtype, geom_info ginfo, alm_info ainfo, double[:, :, ::1] input,
-        int spin=0, comm=None, add=False):
-    cdef void *comm_ptr
-    cdef int flags = SHARP_DP | (SHARP_ADD if add else 0)
-    cdef int r
-    cdef sharp_jobtype jobtype_i
-    cdef double[:, :, ::1] output_buf
-    cdef int ntrans = input.shape[0]
-    cdef int ntotcomp = ntrans * input.shape[1]
-    cdef int i, j
-
-    if spin == 0 and input.shape[1] != 1:
-        raise ValueError('For spin == 0, we need input.shape[1] == 1')
-    elif spin != 0 and input.shape[1] != 2:
-        raise ValueError('For spin != 0, we need input.shape[1] == 2')
-
-
-    cdef size_t[::1] ptrbuf = np.empty(2 * ntotcomp, dtype=np.uintp)
-    cdef double **alm_ptrs = <double**>&ptrbuf[0]
-    cdef double **map_ptrs = <double**>&ptrbuf[ntotcomp]
-
-    try:
-        jobtype_i = JOBTYPE_TO_CONST[jobtype]
-    except KeyError:
-        raise ValueError('jobtype must be one of: %s' % ', '.join(sorted(JOBTYPE_TO_CONST.keys())))
-
-    if jobtype_i == SHARP_Y or jobtype_i == SHARP_WY:
-        output = np.empty((input.shape[0], input.shape[1], ginfo.local_size()), dtype=np.float64)
-        output_buf = output
-        for i in range(input.shape[0]):
-            for j in range(input.shape[1]):
-                alm_ptrs[i * input.shape[1] + j] = &input[i, j, 0]
-                map_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
-    else:
-        output = np.empty((input.shape[0], input.shape[1], ainfo.local_size()), dtype=np.float64)
-        output_buf = output
-        for i in range(input.shape[0]):
-            for j in range(input.shape[1]):
-                alm_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
-                map_ptrs[i * input.shape[1] + j] = &input[i, j, 0]
-
-    if comm is None:
-        with nogil:
-            sharp_execute (
-                jobtype_i,
-                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
-                spin=spin, alm=alm_ptrs, map=map_ptrs,
-                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
-    else:
-        from mpi4py import MPI
-        if not isinstance(comm, MPI.Comm):
-            raise TypeError('comm must be an mpi4py communicator')
-        from .libsharp_mpi import _addressof
-        comm_ptr = <void*><size_t>_addressof(comm)
-        with nogil:
-            r = sharp_execute_mpi_maybe (
-                comm_ptr, jobtype_i,
-                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
-                spin=spin, alm=alm_ptrs, map=map_ptrs,
-                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
-        if r == SHARP_ERROR_NO_MPI:
-            raise Exception('MPI requested, but not available')
-
-    return output
-
-
-def synthesis(*args, **kw):
-    return sht('Y', *args, **kw)
-
-def adjoint_synthesis(*args, **kw):
-    return sht('Yt', *args, **kw)
-
-def analysis(*args, **kw):
-    return sht('YtW', *args, **kw)
-
-def adjoint_analysis(*args, **kw):
-    return sht('WY', *args, **kw)
-
-
-#
-# geom_info
-#
-class NotInitializedError(Exception):
-    pass
-
-
-cdef class geom_info:
-    cdef sharp_geom_info *ginfo
-
-    def __cinit__(self, *args, **kw):
-        self.ginfo = NULL
-
-    def local_size(self):
-        if self.ginfo == NULL:
-            raise NotInitializedError()
-        return sharp_map_size(self.ginfo)
-
-    def __dealloc__(self):
-        if self.ginfo != NULL:
-            sharp_destroy_geom_info(self.ginfo)
-        self.ginfo = NULL
-
-
-cdef class healpix_grid(geom_info):
-
-    _weight_cache = {}  # { (nside, 'T'/'Q'/'U') -> numpy array of ring weights cached from file }
-
-    def __init__(self, int nside, stride=1, int[::1] rings=None, double[::1] weights=None):
-        if weights is not None and weights.shape[0] != 2 * nside:
-            raise ValueError('weights must have length 2 * nside')
-        sharp_make_subset_healpix_geom_info(nside, stride,
-                                            nrings=4 * nside - 1 if rings is None else rings.shape[0],
-                                            rings=NULL if rings is None else &rings[0],
-                                            weight=NULL if weights is None else &weights[0],
-                                            geom_info=&self.ginfo)
-
-    @classmethod
-    def load_ring_weights(cls, nside, fields):
-        """
-        Loads HEALPix ring weights from file. The environment variable
-        HEALPIX should be set, and this routine will look in the `data`
-        subdirectory.
-
-        Parameters
-        ----------
-
-        nside: int
-            HEALPix nside parameter
-
-        fields: tuple of str
-            Which weights to extract; pass ('T',) to only get scalar
-            weights back, or ('T', 'Q', 'U') to get all the weights
-
-        Returns
-        -------
-
-        List of NumPy arrays, according to fields parameter.
-
-        """
-        import os
-        from astropy.io import fits
-        data_path = os.path.join(os.environ['HEALPIX'], 'data')
-        fits_field_names = {
-            'T': 'TEMPERATURE WEIGHTS',
-            'Q': 'Q-POLARISATION WEIGHTS',
-            'U': 'U-POLARISATION WEIGHTS'}
-
-        must_load = [field for field in fields if (nside, field) not in cls._weight_cache]
-
-        if must_load:
-            hdulist = fits.open(os.path.join(data_path, 'weight_ring_n%05d.fits' % nside))
-            try:
-                for field in must_load:
-                    w = hdulist[1].data.field(fits_field_names[field]).ravel().astype(np.double)
-                    w += 1
-                    cls._weight_cache[nside, field] = w
-            finally:
-                hdulist.close()
-        return [cls._weight_cache[(nside, field)].copy() for field in fields]
-
-#
-# alm_info
-#
-
-
-cdef class alm_info:
-    cdef sharp_alm_info *ainfo
-
-    def __cinit__(self, *args, **kw):
-        self.ainfo = NULL
-
-    def local_size(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return sharp_alm_count(self.ainfo)
-
-    def mval(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return np.asarray(<int[:self.ainfo.nm]> self.ainfo.mval)
-
-    def mvstart(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return np.asarray(<long[:self.ainfo.nm]> self.ainfo.mvstart)
-
-    def __dealloc__(self):
-        if self.ainfo != NULL:
-            sharp_destroy_alm_info(self.ainfo)
-        self.ainfo = NULL
-
-    @cython.boundscheck(False)
-    def almxfl(self, np.ndarray[double, ndim=3, mode='c'] alm, np.ndarray[double, ndim=2, mode='c'] fl):
-        """Multiply Alm by a Ell based array
-
-
-        Parameters
-        ----------
-        alm : np.ndarray
-            input alm, 3 dimensions = (different signal x polarizations x lm-ordering)
-        fl : np.ndarray
-            either 1 dimension, e.g. gaussian beam, or 2 dimensions e.g. a polarized beam
-
-        Returns
-        -------
-        None, it modifies alms in-place
-
-        """
-        cdef int mvstart = 0
-        cdef bint has_multiple_beams = alm.shape[2] > 1 and fl.shape[1] > 1
-        cdef int f, i_m, m, num_ells, i_l, i_signal, i_pol, i_mv
-
-        for i_m in range(self.ainfo.nm):
-            m = self.ainfo.mval[i_m]
-            f = 1 if (m==0) else 2
-            num_ells = self.ainfo.lmax + 1 - m
-
-            if not has_multiple_beams:
-                for i_signal in range(alm.shape[0]):
-                    for i_pol in range(alm.shape[1]):
-                        for i_l in range(num_ells):
-                            l = m + i_l
-                            for i_mv in range(mvstart + f*i_l, mvstart + f*i_l +f):
-                                alm[i_signal, i_pol, i_mv] *= fl[l, 0]
-            else:
-                for i_signal in range(alm.shape[0]):
-                    for i_pol in range(alm.shape[1]):
-                        for i_l in range(num_ells):
-                            l = m + i_l
-                            for i_mv in range(mvstart + f*i_l, mvstart + f*i_l +f):
-                                alm[i_signal, i_pol, i_mv] *= fl[l, i_pol]
-            mvstart += f * num_ells
-
-cdef class triangular_order(alm_info):
-    def __init__(self, int lmax, mmax=None, stride=1):
-        mmax = mmax if mmax is not None else lmax
-        sharp_make_triangular_alm_info(lmax, mmax, stride, &self.ainfo)
-
-
-cdef class rectangular_order(alm_info):
-    def __init__(self, int lmax, mmax=None, stride=1):
-        mmax = mmax if mmax is not None else lmax
-        sharp_make_rectangular_alm_info(lmax, mmax, stride, &self.ainfo)
-
-
-cdef class packed_real_order(alm_info):
-    def __init__(self, int lmax, stride=1, int[::1] ms=None):
-        sharp_make_mmajor_real_packed_alm_info(lmax=lmax, stride=stride,
-                                               nm=lmax + 1 if ms is None else ms.shape[0],
-                                               ms=NULL if ms is None else &ms[0],
-                                               alm_info=&self.ainfo)
-
-#
-# 
-#
-
-@cython.boundscheck(False)
-def normalized_associated_legendre_table(int lmax, int m, theta):
-    cdef double[::1] theta_ = np.ascontiguousarray(theta, dtype=np.double)
-    out = np.zeros((theta_.shape[0], lmax - m + 1), np.double)
-    cdef double[:, ::1] out_ = out
-    if lmax < m:
-        raise ValueError("lmax < m")
-    with nogil:
-        sharp_normalized_associated_legendre_table(m, 0, lmax, theta_.shape[0], &theta_[0], lmax - m + 1, 1, 1, &out_[0,0])
-    return out
diff --git a/python/libsharp/libsharp_mpi.pyx b/python/libsharp/libsharp_mpi.pyx
deleted file mode 100644
index e819a77..0000000
--- a/python/libsharp/libsharp_mpi.pyx
+++ /dev/null
@@ -1,17 +0,0 @@
-cdef extern from "mpi.h":
-    ctypedef void *MPI_Comm
-
-cdef extern from "Python.h":
-    object PyLong_FromVoidPtr(void*)
-
-cdef extern:
-    ctypedef class mpi4py.MPI.Comm [object PyMPICommObject]:
-        cdef MPI_Comm ob_mpi
-        cdef unsigned flags
-
-# For compatibility with mpi4py <= 1.3.1
-# Newer versions could use the MPI._addressof function
-def _addressof(Comm comm):
-    cdef void *ptr = NULL
-    ptr = <void*>&comm.ob_mpi
-    return PyLong_FromVoidPtr(ptr)
diff --git a/python/libsharp/tests/__init__.py b/python/libsharp/tests/__init__.py
deleted file mode 100644
index 1bb8bf6..0000000
--- a/python/libsharp/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# empty
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
deleted file mode 100644
index 0129b29..0000000
--- a/python/libsharp/tests/test_legendre.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import numpy as np
-from scipy.special import legendre
-from scipy.special import p_roots
-import libsharp
-
-from numpy.testing import assert_allclose
-
-
-def check_legendre_transform(lmax, ntheta):
-    l = np.arange(lmax + 1)
-    if lmax >= 1:
-        sigma = -np.log(1e-3) / lmax / (lmax + 1)
-        bl = np.exp(-sigma*l*(l+1))
-        bl *= (2 * l + 1)
-    else:
-        bl = np.asarray([1], dtype=np.double)
-
-    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
-    x = np.cos(theta)
-
-    # Compute truth using scipy.special.legendre
-    P = np.zeros((ntheta, lmax + 1))
-    for l in range(lmax + 1):
-        P[:, l] = legendre(l)(x)
-    y0 = np.dot(P, bl)
-
-
-    # double-precision
-    y = libsharp.legendre_transform(x, bl)
-
-    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)
-
-    # single-precision
-    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
-    assert_allclose(y, y0, rtol=1e-5, atol=1e-5)
-
-
-def test_legendre_transform():
-    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
-    for ntheta in nthetas_to_try:
-        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
-            yield check_legendre_transform, lmax, ntheta
-
-def check_legendre_roots(n):
-    xs, ws = ([], []) if n == 0 else p_roots(n) # from SciPy
-    xl, wl = libsharp.legendre_roots(n)
-    assert_allclose(xs, xl, rtol=1e-14, atol=1e-14)
-    assert_allclose(ws, wl, rtol=1e-14, atol=1e-14)
-
-def test_legendre_roots():
-    """
-    Test the Legendre root-finding algorithm from libsharp by comparing it with
-    the SciPy version.
-    """
-    yield check_legendre_roots, 0
-    yield check_legendre_roots, 1
-    yield check_legendre_roots, 32
-    yield check_legendre_roots, 33
diff --git a/python/libsharp/tests/test_legendre_table.py b/python/libsharp/tests/test_legendre_table.py
deleted file mode 100644
index eb02df2..0000000
--- a/python/libsharp/tests/test_legendre_table.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from __future__ import print_function
-import numpy as np
-
-from numpy.testing import assert_almost_equal
-from nose.tools import eq_, ok_
-
-from libsharp import normalized_associated_legendre_table
-from scipy.special import sph_harm, p_roots
-
-def test_compare_legendre_table_with_scipy():
-    def test(theta, m, lmax):
-        Plm = normalized_associated_legendre_table(lmax, m, theta)
-
-        Plm_p = sph_harm(m, np.arange(m, lmax + 1), 0, theta)[None, :]
-        if not np.allclose(Plm_p, Plm):
-            print(Plm_p)
-            print(Plm)
-        return ok_, np.allclose(Plm_p, Plm)
-
-    yield test(np.pi/2, 0, 10)
-    yield test(np.pi/4, 0, 10)
-    yield test(3 * np.pi/4, 0, 10)
-    yield test(np.pi/4, 1, 4)
-    yield test(np.pi/4, 2, 4)
-    yield test(np.pi/4, 50, 50)
-    yield test(np.pi/2, 49, 50)
-
-
-def test_legendre_table_wrapper_logic():
-    # tests the SSE 2 logic in the high-level wrapper by using an odd number of thetas
-    theta = np.asarray([np.pi/2, np.pi/4, 3 * np.pi / 4])
-    m = 3
-    lmax = 10
-    Plm = normalized_associated_legendre_table(lmax, m, theta)
-    assert np.allclose(Plm[1, :], normalized_associated_legendre_table(lmax, m, np.pi/4)[0, :])
-    assert np.allclose(Plm[2, :], normalized_associated_legendre_table(lmax, m, 3 * np.pi/4)[0, :])
diff --git a/python/libsharp/tests/test_sht.py b/python/libsharp/tests/test_sht.py
deleted file mode 100644
index 63ccf20..0000000
--- a/python/libsharp/tests/test_sht.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import numpy as np
-from numpy.testing import assert_allclose
-import libsharp
-
-from mpi4py import MPI
-
-
-def test_basic():
-    lmax = 10
-    nside = 8
-    rank = MPI.COMM_WORLD.Get_rank()
-    ms = np.arange(rank, lmax + 1, MPI.COMM_WORLD.Get_size(), dtype=np.int32)
-    
-    order = libsharp.packed_real_order(lmax, ms=ms)
-    grid = libsharp.healpix_grid(nside)
-
-    
-    alm = np.zeros(order.local_size())
-    if rank == 0:
-        alm[0] = 1
-    elif rank == 1:
-        alm[0] = 1
-
-
-    map = libsharp.synthesis(grid, order, np.repeat(alm[None, None, :], 3, 0), comm=MPI.COMM_WORLD)
-    assert np.all(map[2, :] == map[1, :]) and np.all(map[1, :] == map[0, :])
-    map = map[0, 0, :]
-    print(rank, "shape", map.shape)
-    print(rank, "mean", map.mean())
-
-if __name__=="__main__":
-    test_basic()
diff --git a/python/libsharp/tests/test_smoothing_noise_pol_mpi.py b/python/libsharp/tests/test_smoothing_noise_pol_mpi.py
deleted file mode 100644
index 2cdff95..0000000
--- a/python/libsharp/tests/test_smoothing_noise_pol_mpi.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# This test needs to be run with:
-
-# mpirun -np X python test_smoothing_noise_pol_mpi.py
-
-from mpi4py import MPI
-
-import numpy as np
-
-import healpy as hp
-
-import libsharp
-
-mpi = True
-rank = MPI.COMM_WORLD.Get_rank()
-
-nside = 256
-npix = hp.nside2npix(nside)
-
-np.random.seed(100)
-input_map = np.random.normal(size=(3, npix))
-fwhm_deg = 10
-lmax = 512
-
-nrings = 4 * nside - 1  # four missing pixels
-
-if rank == 0:
-    print("total rings", nrings)
-
-n_mpi_processes = MPI.COMM_WORLD.Get_size()
-rings_per_process = nrings // n_mpi_processes + 1
-# ring indices are 1-based
-
-ring_indices_emisphere = np.arange(2*nside, dtype=np.int32) + 1
-local_ring_indices = ring_indices_emisphere[rank::n_mpi_processes]
-
-# to improve performance, simmetric rings north/south need to be in the same rank
-# therefore we use symmetry to create the full ring indexing
-
-if local_ring_indices[-1] == 2 * nside:
-    # has equator ring
-    local_ring_indices = np.concatenate(
-      [local_ring_indices[:-1],
-       nrings - local_ring_indices[::-1] + 1]
-    )
-else:
-    # does not have equator ring
-    local_ring_indices = np.concatenate(
-      [local_ring_indices,
-       nrings - local_ring_indices[::-1] + 1]
-    )
-
-print("rank", rank, "n_rings", len(local_ring_indices))
-
-if not mpi:
-    local_ring_indices = None
-grid = libsharp.healpix_grid(nside, rings=local_ring_indices)
-
-# returns start index of the ring and number of pixels
-startpix, ringpix, _, _, _ = hp.ringinfo(nside, local_ring_indices.astype(np.int64))
-
-local_npix = grid.local_size()
-
-def expand_pix(startpix, ringpix, local_npix):
-    """Turn first pixel index and number of pixel in full array of pixels
-
-    to be optimized with cython or numba
-    """
-    local_pix = np.empty(local_npix, dtype=np.int64)
-    i = 0
-    for start, num in zip(startpix, ringpix):
-        local_pix[i:i+num] = np.arange(start, start+num)
-        i += num
-    return local_pix
-
-local_pix = expand_pix(startpix, ringpix, local_npix)
-
-local_map = input_map[:, local_pix]
-
-local_hitmap = np.zeros(npix)
-local_hitmap[local_pix] = 1
-hp.write_map("hitmap_{}.fits".format(rank), local_hitmap, overwrite=True)
-
-print("rank", rank, "npix", npix, "local_npix", local_npix, "local_map len", len(local_map), "unique pix", len(np.unique(local_pix)))
-
-local_m_indices = np.arange(rank, lmax + 1, MPI.COMM_WORLD.Get_size(), dtype=np.int32)
-if not mpi:
-    local_m_indices = None
-
-order = libsharp.packed_real_order(lmax, ms=local_m_indices) 
-local_nl = order.local_size()
-print("rank", rank, "local_nl", local_nl, "mval", order.mval())
-
-mpi_comm = MPI.COMM_WORLD if mpi else None
-
-# map2alm
-# maps in libsharp are 3D, 2nd dimension is IQU, 3rd is pixel
-
-alm_sharp_I = libsharp.analysis(grid, order,
-                                np.ascontiguousarray(local_map[0].reshape((1, 1, -1))),
-                                spin=0, comm=mpi_comm)
-alm_sharp_P = libsharp.analysis(grid, order,
-                                np.ascontiguousarray(local_map[1:].reshape((1, 2, -1))),
-                                spin=2, comm=mpi_comm)
-
-beam = hp.gauss_beam(fwhm=np.radians(fwhm_deg), lmax=lmax, pol=True)
-
-print("Smooth")
-# smooth in place (zonca implemented this function)
-order.almxfl(alm_sharp_I, np.ascontiguousarray(beam[:, 0:1]))
-order.almxfl(alm_sharp_P, np.ascontiguousarray(beam[:, (1, 2)]))
-
-# alm2map
-
-new_local_map_I = libsharp.synthesis(grid, order, alm_sharp_I, spin=0, comm=mpi_comm)
-new_local_map_P = libsharp.synthesis(grid, order, alm_sharp_P, spin=2, comm=mpi_comm)
-
-# Transfer map to first process for writing
-
-local_full_map = np.zeros(input_map.shape, dtype=np.float64)
-local_full_map[0, local_pix] = new_local_map_I
-local_full_map[1:, local_pix] = new_local_map_P
-
-output_map = np.zeros(input_map.shape, dtype=np.float64) if rank == 0 else None
-mpi_comm.Reduce(local_full_map, output_map, root=0, op=MPI.SUM)
-
-if rank == 0:
-    # hp.write_map("sharp_smoothed_map.fits", output_map, overwrite=True)
-    # hp_smoothed = hp.alm2map(hp.map2alm(input_map, lmax=lmax), nside=nside) # transform only
-    hp_smoothed = hp.smoothing(input_map, fwhm=np.radians(fwhm_deg), lmax=lmax)
-    std_diff = (hp_smoothed-output_map).std()
-    print("Std of difference between libsharp and healpy", std_diff)
-    # hp.write_map(
-    #     "healpy_smoothed_map.fits",
-    #     hp_smoothed,
-    #     overwrite=True
-    # )
-    assert std_diff < 1e-5
diff --git a/python/setup.py b/python/setup.py
deleted file mode 100644
index 788d7a6..0000000
--- a/python/setup.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#! /usr/bin/env python
-
-descr   = """Spherical Harmionic transforms package
-
-Python API for the libsharp spherical harmonic transforms library
-"""
-
-import os
-import sys
-
-DISTNAME            = 'libsharp'
-DESCRIPTION         = 'libsharp library for fast Spherical Harmonic Transforms'
-LONG_DESCRIPTION    = descr
-MAINTAINER          = 'Dag Sverre Seljebotn',
-MAINTAINER_EMAIL    = 'd.s.seljebotn@astro.uio.no',
-URL                 = 'http://sourceforge.net/projects/libsharp/'
-LICENSE             = 'GPL'
-DOWNLOAD_URL        = "http://sourceforge.net/projects/libsharp/"
-VERSION             = '0.1'
-
-# Add our fake Pyrex at the end of the Python search path
-# in order to fool setuptools into allowing compilation of
-# pyx files to C files. Importing Cython.Distutils then
-# makes Cython the tool of choice for this rather than
-# (the possibly nonexisting) Pyrex.
-project_path = os.path.split(__file__)[0]
-sys.path.append(os.path.join(project_path, 'fake_pyrex'))
-
-from setuptools import setup, find_packages, Extension
-from Cython.Build import cythonize
-import numpy as np
-
-libsharp = os.environ.get('LIBSHARP', None)
-libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
-libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
-
-if libsharp_include is None or libsharp_lib is None:
-    sys.stderr.write('Please set LIBSHARP environment variable to the install directly of libsharp, '
-                     'this script will refer to the lib and include sub-directories. Alternatively '
-                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
-    sys.exit(1)
-
-if __name__ == "__main__":
-    setup(install_requires = ['numpy'],
-          packages = find_packages(),
-          test_suite="nose.collector",
-          # Well, technically zipping the package will work, but since it's
-          # all compiled code it'll just get unzipped again at runtime, which
-          # is pointless:
-          zip_safe = False,
-          name = DISTNAME,
-          version = VERSION,
-          maintainer = MAINTAINER,
-          maintainer_email = MAINTAINER_EMAIL,
-          description = DESCRIPTION,
-          license = LICENSE,
-          url = URL,
-          download_url = DOWNLOAD_URL,
-          long_description = LONG_DESCRIPTION,
-          classifiers =
-            [ 'Development Status :: 3 - Alpha',
-              'Environment :: Console',
-              'Intended Audience :: Developers',
-              'Intended Audience :: Science/Research',
-              'License :: OSI Approved :: GNU General Public License (GPL)',
-              'Topic :: Scientific/Engineering'],
-          ext_modules = cythonize([
-              Extension("libsharp.libsharp",
-                        ["libsharp/libsharp.pyx"],
-                        libraries=["sharp", "fftpack", "c_utils"],
-                        include_dirs=[libsharp_include, np.get_include()],
-                        library_dirs=[libsharp_lib],
-                        extra_link_args=["-fopenmp"],
-              ),
-              Extension("libsharp.libsharp_mpi",
-                        ["libsharp/libsharp_mpi.pyx"],
-                        libraries=["sharp", "fftpack", "c_utils"],
-                        include_dirs=[libsharp_include, np.get_include()],
-                        library_dirs=[libsharp_lib],
-                        extra_link_args=["-fopenmp"],
-              ),
-              ]),
-          )
diff --git a/runjinja.py b/runjinja.py
deleted file mode 100755
index fb06737..0000000
--- a/runjinja.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT.
-"""
-
-import sys
-import hashlib
-from jinja2 import Template, Environment
-
-env = Environment(block_start_string='/*{',
-                  block_end_string='}*/',
-                  variable_start_string='{{',
-                  variable_end_string='}}')
-
-extra_vars = dict(len=len)
-input = sys.stdin.read()
-sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input.encode()).hexdigest())
-sys.stdout.write(env.from_string(input).render(**extra_vars))

From 2affc388adda87918f90749c7741c10a87cf3530 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 15 Oct 2018 18:43:55 +0200
Subject: [PATCH 02/85] support external pocketfft

---
 Makefile.am  | 10 ++++++----
 configure.ac | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 0d40b92..3050371 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,6 +3,8 @@ ACLOCAL_AMFLAGS = -I m4
 lib_LTLIBRARIES = libsharp.la
 
 src_sharp = \
+  c_utils/c_utils.c \
+  c_utils/c_utils.h \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_announce.c \
@@ -34,13 +36,13 @@ EXTRA_DIST = \
 
 libsharp_la_SOURCES = $(src_sharp)
 
-#check_PROGRAMS = ffttest
-#ffttest_SOURCES = ffttest.c
-#ffttest_LDADD = libpocketfft.la -lm
+check_PROGRAMS = sharp_testsuite
+sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
+sharp_testsuite_LDADD = libsharp.la
 
 #TESTS = ffttest
 
-AM_CFLAGS = -I$(top_srcdir)
+AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
 
 pkgconfigdir = $(libdir)/pkgconfig
 nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
diff --git a/configure.ac b/configure.ac
index 9d8e203..acad8ef 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,12 +69,49 @@ AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"]
 AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"])
 AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"])
 
+# adding the lib to the files to link
+LIBS="-lm"
+LIBS="-lpocketfft $LIBS"
+# introduce the optional configure parameter for a non-standard install prefix of XXX
+AC_ARG_WITH([pocketfft],
+    [AS_HELP_STRING([--with-pocketfft=prefix],
+        [try this for a non-standard install prefix of the pocketfft library])],
+    [POCKETFFTPATHSET=1],
+    [POCKETFFTPATHSET=0])
+
+# if optional parameter used, extend path flags for compliler and linker
+if test $POCKETFFTPATHSET = 1 ; then
+    # extend the compiler and linker flags according to the path set
+    AM_CFLAGS="$AM_CFLAGS -I$with_pocketfft/include"
+    AM_LDFLAGS="$AM_LDFLAGS -L$with_pocketfft/lib"
+fi
+
+##########################################################################
+# check for pocketfft
+##########################################################################
+OLD_CFLAGS=$CFLAGS;
+OLD_LDFLAGS=$LDFLAGS;
+CFLAGS="$AM_CFLAGS $CFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
+AC_CHECK_HEADERS([pocketfft/pocketfft.h],
+        [pocketfft_header_found=yes; break;])
+
+AS_IF([test "x$pocketfft_header_found" != "xyes"],
+        [AC_MSG_ERROR([Unable to find pocketfft header])])
+
+AC_SEARCH_LIBS([make_rfft_plan],[pocketfft],,AC_MSG_ERROR([pocketfft not found]))
+CFLAGS=$OLD_CFLAGS
+LDFLAGS=$OLD_LDFLAGS
+
 AC_PROG_LIBTOOL
 
 dnl
 dnl Create pkgconfig .pc file.
 dnl
 AX_CREATE_PKGCONFIG_INFO(,,,,[])
+AC_SUBST([LIBS])
+AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_LDFLAGS])
 
 AC_CONFIG_FILES([Makefile])
 AC_OUTPUT

From f30d99cb2fe46ee2df55843b308386878681b13b Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 26 Oct 2018 10:34:02 +0200
Subject: [PATCH 03/85] heavy tweaking

---
 Makefile.am                |    4 +
 configure.ac               |   34 +-
 libsharp/sharp_core.c      |  213 +---
 libsharp/sharp_core_avx.c  |   14 +
 libsharp/sharp_core_inc0.c |  242 +++++
 libsharp/sharp_vecutil.h   |    6 -
 pocketfft/pocketfft.c      | 2060 ++++++++++++++++++++++++++++++++++++
 pocketfft/pocketfft.h      |   34 +
 8 files changed, 2370 insertions(+), 237 deletions(-)
 create mode 100644 libsharp/sharp_core_avx.c
 create mode 100644 libsharp/sharp_core_inc0.c
 create mode 100644 pocketfft/pocketfft.c
 create mode 100644 pocketfft/pocketfft.h

diff --git a/Makefile.am b/Makefile.am
index 3050371..5cd60d4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,10 +5,13 @@ lib_LTLIBRARIES = libsharp.la
 src_sharp = \
   c_utils/c_utils.c \
   c_utils/c_utils.h \
+  pocketfft/pocketfft.c \
+  pocketfft/pocketfft.h \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_announce.c \
   libsharp/sharp_core.c \
+  libsharp/sharp_core_avx.c \
   libsharp/sharp_geomhelpers.c \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
@@ -30,6 +33,7 @@ include_HEADERS = \
   libsharp/sharp_cxx.h
 
 EXTRA_DIST = \
+  libsharp/sharp_core_inc0.c \
   libsharp/sharp_core_inc.c \
   libsharp/sharp_core_inc2.c \
   libsharp/sharp_core_inchelper.c
diff --git a/configure.ac b/configure.ac
index acad8ef..34626bc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,6 +2,8 @@ AC_INIT([libsharp], [1.0.0])
 AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
 AM_MAINTAINER_MODE([enable])
 
+AC_OPENMP
+
 dnl
 dnl Needed for linking on Windows.
 dnl Protect with m4_ifdef because AM_PROG_AR is required in
@@ -68,40 +70,10 @@ AX_CHECK_COMPILE_FLAG([-fno-trapping-math],[CFLAGS="$CFLAGS -fno-trapping-math"]
 AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"])
 AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"])
 AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"])
+CFLAGS="$CFLAGS $OPENMP_CFLAGS"
 
 # adding the lib to the files to link
 LIBS="-lm"
-LIBS="-lpocketfft $LIBS"
-# introduce the optional configure parameter for a non-standard install prefix of XXX
-AC_ARG_WITH([pocketfft],
-    [AS_HELP_STRING([--with-pocketfft=prefix],
-        [try this for a non-standard install prefix of the pocketfft library])],
-    [POCKETFFTPATHSET=1],
-    [POCKETFFTPATHSET=0])
-
-# if optional parameter used, extend path flags for compliler and linker
-if test $POCKETFFTPATHSET = 1 ; then
-    # extend the compiler and linker flags according to the path set
-    AM_CFLAGS="$AM_CFLAGS -I$with_pocketfft/include"
-    AM_LDFLAGS="$AM_LDFLAGS -L$with_pocketfft/lib"
-fi
-
-##########################################################################
-# check for pocketfft
-##########################################################################
-OLD_CFLAGS=$CFLAGS;
-OLD_LDFLAGS=$LDFLAGS;
-CFLAGS="$AM_CFLAGS $CFLAGS"
-LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-AC_CHECK_HEADERS([pocketfft/pocketfft.h],
-        [pocketfft_header_found=yes; break;])
-
-AS_IF([test "x$pocketfft_header_found" != "xyes"],
-        [AC_MSG_ERROR([Unable to find pocketfft header])])
-
-AC_SEARCH_LIBS([make_rfft_plan],[pocketfft],,AC_MSG_ERROR([pocketfft not found]))
-CFLAGS=$OLD_CFLAGS
-LDFLAGS=$OLD_LDFLAGS
 
 AC_PROG_LIBTOOL
 
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7cd2f17..f052555 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -29,212 +29,25 @@
  *  \author Martin Reinecke
  */
 
-#include <complex.h>
-#include <math.h>
-#include <string.h>
-#include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
-#include "sharp_ylmgen_c.h"
-#include "sharp.h"
-#include "sharp_core.h"
-#include "c_utils.h"
+#define ARCH _default
+#include "sharp_core_inc0.c"
+#undef ARCH
 
-typedef complex double dcmplx;
-
-// must be in the range [0;6]
-#define MAXJOB_SPECIAL 2
-
-#define XCONCAT2(a,b) a##_##b
-#define CONCAT2(a,b) XCONCAT2(a,b)
-#define XCONCAT3(a,b,c) a##_##b##_##c
-#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
-
-#define nvec 1
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 2
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 3
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 4
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 5
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 6
-#include "sharp_core_inchelper.c"
-#undef nvec
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim);
+#endif
 
 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-  int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
-  if (njobs<=MAXJOB_SPECIAL)
-    {
-    switch (njobs*16+nv)
-      {
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-      case 0x11:
-        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x12:
-        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x13:
-        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x14:
-        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x15:
-        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x16:
-        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-      case 0x21:
-        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x22:
-        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x23:
-        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x24:
-        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x25:
-        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x26:
-        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-      case 0x31:
-        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x32:
-        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x33:
-        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x34:
-        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x35:
-        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x36:
-        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-      case 0x41:
-        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x42:
-        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x43:
-        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x44:
-        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x45:
-        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x46:
-        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-      case 0x51:
-        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x52:
-        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x53:
-        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x54:
-        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x55:
-        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x56:
-        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-      case 0x61:
-        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x62:
-        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x63:
-        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x64:
-        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x65:
-        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x66:
-        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-      }
-    }
-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+  __builtin_cpu_init();
+  if (__builtin_cpu_supports("avx"))
+    inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   else
-    {
-    switch (nv)
-      {
-      case 1:
-        CONCAT2(inner_loop,1)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 2:
-        CONCAT2(inner_loop,2)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 3:
-        CONCAT2(inner_loop,3)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 4:
-        CONCAT2(inner_loop,4)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 5:
-        CONCAT2(inner_loop,5)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 6:
-        CONCAT2(inner_loop,6)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      }
-    }
 #endif
-  UTIL_FAIL("Incorrect vector parameters");
+    inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   }
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
new file mode 100644
index 0000000..dc6ee48
--- /dev/null
+++ b/libsharp/sharp_core_avx.c
@@ -0,0 +1,14 @@
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+// if we arrive here, we can benefit from an additional AVX version
+#warning entering gcc and x86_64 specific code branch
+
+#define ARCH _avx
+#define __AVX__
+#pragma GCC push_options
+#pragma GCC target("avx")
+#include "sharp_core_inc0.c"
+#pragma GCC pop_options
+#undef __AVX__
+#undef ARCH
+
+#endif
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
new file mode 100644
index 0000000..8590d2c
--- /dev/null
+++ b/libsharp/sharp_core_inc0.c
@@ -0,0 +1,242 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core_inc0.c
+ *  Computational core
+ *
+ *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <string.h>
+#include "sharp_vecsupport.h"
+#include "sharp_complex_hacks.h"
+#include "sharp_ylmgen_c.h"
+#include "sharp.h"
+#include "sharp_core.h"
+#include "c_utils.h"
+
+typedef complex double dcmplx;
+
+// Transform counts (job->ntrans) up to this value get their own inner_loop_<nv>_<njobs> case in the dispatcher; must be in the range [0;6]
+#define MAXJOB_SPECIAL 2
+
+#define XCONCATX(a,b) a##b // raw paste, no separator
+#define CONCATX(a,b) XCONCATX(a,b) // extra level so that macro arguments (e.g. ARCH) are expanded before pasting
+#define XCONCAT2(a,b) a##_##b // raw paste with '_' separator
+#define CONCAT2(a,b) XCONCAT2(a,b) // expanding variant: a_b
+#define XCONCAT3(a,b,c) a##_##b##_##c // raw paste of three parts with '_' separators
+#define CONCAT3(a,b,c) XCONCAT3(a,b,c) // expanding variant: a_b_c
+
+#define nvec 1 // sharp_core_inchelper.c is included once for each nvec value 1..6
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+#define nvec 2
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+#define nvec 3
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+#define nvec 4
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+#define nvec 5
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+#define nvec 6
+#include "sharp_core_inchelper.c"
+#undef nvec
+
+void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim)
+  { // Dispatcher: forwards all arguments to the inner_loop_<nv>[_<njobs>] variant selected by the job; fails if none matches.
+  int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX; // transform count, and the nv value masked out of the job flags
+  if (njobs<=MAXJOB_SPECIAL)
+    {
+    switch (njobs*16+nv) // key reads as 0x<njobs><nv> in hex; target is inner_loop_<nv>_<njobs>
+      {
+#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
+      case 0x11:
+        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x12:
+        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x13:
+        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x14:
+        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x15:
+        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x16:
+        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
+      case 0x21:
+        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x22:
+        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x23:
+        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x24:
+        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x25:
+        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x26:
+        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
+      case 0x31:
+        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x32:
+        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x33:
+        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x34:
+        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x35:
+        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x36:
+        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
+      case 0x41:
+        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x42:
+        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x43:
+        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x44:
+        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x45:
+        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x46:
+        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
+      case 0x51:
+        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x52:
+        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x53:
+        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x54:
+        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x55:
+        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x56:
+        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
+      case 0x61:
+        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x62:
+        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x63:
+        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x64:
+        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x65:
+        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+      case 0x66:
+        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+        return;
+#endif
+      }
+    }
+#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
+  else // more transforms than MAXJOB_SPECIAL: use the inner_loop_<nv> variants that take the transform count as an extra argument
+    {
+    switch (nv)
+      {
+      case 1:
+        CONCAT2(inner_loop,1)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      case 2:
+        CONCAT2(inner_loop,2)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      case 3:
+        CONCAT2(inner_loop,3)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      case 4:
+        CONCAT2(inner_loop,4)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      case 5:
+        CONCAT2(inner_loop,5)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      case 6:
+        CONCAT2(inner_loop,6)
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
+        return;
+      }
+    }
+#endif
+  UTIL_FAIL("Incorrect vector parameters"); // only reached when no case above matched (every match returns)
+  }
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index f6161ca..24a2e94 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -46,12 +46,6 @@
 
 #endif
 
-#if (VLEN==1)
-#define VLEN_s 1
-#else
-#define VLEN_s (2*VLEN)
-#endif
-
 #ifndef USE_FMA4
 #ifdef __FMA4__
 #define USE_FMA4 1
diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c
new file mode 100644
index 0000000..562ebc9
--- /dev/null
+++ b/pocketfft/pocketfft.c
@@ -0,0 +1,2060 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*
+ *  Main implementation file.
+ *
+ *  Copyright (C) 2004-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "pocketfft.h"
+
+#define RALLOC(type,num) \
+  ((type *)malloc((num)*sizeof(type)))
+#define DEALLOC(ptr) \
+  do { free(ptr); (ptr)=NULL; } while(0)
+
+#define SWAP(a,b,type) \
+  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
+
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
+#else
+#define NOINLINE
+#define WARN_UNUSED_RESULT
+#endif
+
+#if 0
+static void fracsincos(size_t m, size_t n, double *restrict res)
+  {
+  static const long double twopi=6.283185307179586476925286766559006L;
+  long double arg = twopi*(long double)m/((long double)n);
+  res[0] = (double)cosl(arg); res[1] = (double)sinl(arg);
+  }
+#endif
+
+// adapted from https://stackoverflow.com/questions/42792939/
+// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
+static void my_sincosm1pi (double a, double *restrict res)
+  { // Writes approximations of cos(pi*a)-1 to res[0] and of sin(pi*a) to res[1].
+  double s = a * a; // both polynomials below are evaluated in a^2 (Horner scheme using fma)
+  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
+  double r =     -1.0369917389758117e-4;
+  r = fma (r, s,  1.9294935641298806e-3);
+  r = fma (r, s, -2.5806887942825395e-2);
+  r = fma (r, s,  2.3533063028328211e-1);
+  r = fma (r, s, -1.3352627688538006e+0);
+  r = fma (r, s,  4.0587121264167623e+0);
+  r = fma (r, s, -4.9348022005446790e+0);
+  double c = r*s; // no constant term: the result is cos(pi*a)-1, not cos(pi*a)
+  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
+  r =             4.6151442520157035e-4;
+  r = fma (r, s, -7.3700183130883555e-3);
+  r = fma (r, s,  8.2145868949323936e-2);
+  r = fma (r, s, -5.9926452893214921e-1);
+  r = fma (r, s,  2.5501640398732688e+0);
+  r = fma (r, s, -5.1677127800499516e+0);
+  s = s * a; // s is now a^3
+  r = r * s; // higher-order part of the sine series
+  s = fma (a, 3.1415926535897931e+0, r); // add the leading pi*a term last
+  res[0] = c;
+  res[1] = s;
+  }
+
+NOINLINE static void calc_first_octant(size_t den, double * restrict res)
+  { // Fills res[2*i], res[2*i+1] with cos and sin of 2*pi*i/den for i in [0; (den+4)>>3).
+  size_t n = (den+4)>>3; // number of (cos,sin) pairs to produce
+  if (n==0) return;
+  res[0]=1.; res[1]=0.;
+  if (n==1) return;
+  size_t l1=(size_t)sqrt(n); // block length: only about sqrt(n) entries are evaluated directly
+  for (size_t i=1; i<l1; ++i)
+    my_sincosm1pi((2.*i)/den,&res[2*i]); // first block temporarily holds (cos-1, sin); the 1 is added at the end
+  size_t start=l1;
+  while(start<n)
+    {
+    double cs[2];
+    my_sincosm1pi((2.*start)/den,cs); // direct evaluation at the start of each further block
+    res[2*start] = cs[0]+1.;
+    res[2*start+1] = cs[1];
+    size_t end = l1;
+    if (start+end>n) end = n-start; // last block may be shorter
+    for (size_t i=1; i<end; ++i)
+      {
+      double csx[2]={res[2*i], res[2*i+1]}; // (cos-1, sin) of the offset angle from the first block
+      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.; // cos(a+b) written in terms of cos(a)-1 and cos(b)-1
+      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]; // sin(a+b) written in terms of cos(a)-1 and cos(b)-1
+      }
+    start += l1;
+    }
+  for (size_t i=1; i<l1; ++i)
+    res[2*i] += 1.; // convert the first block from cos-1 to cos now that it is no longer needed as input
+  }
+
+NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
+  { // Builds the leading (n+2)>>2 pairs of res from an octant table computed for denominator 2*n.
+  double * restrict p = res+n; // the octant table is generated inside res itself, starting at offset n
+  calc_first_octant(n<<1, p);
+  size_t ndone=(n+2)>>2; // number of output pairs
+  size_t i=0, idx1=0, idx2=2*ndone-2; // idx1 walks up from the front, idx2 walks down from the last output pair
+  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+    {
+    res[idx1]   = p[2*i]; // even table entry i is copied unchanged
+    res[idx1+1] = p[2*i+1];
+    res[idx2]   = p[2*i+3]; // odd table entry i+1 goes to the mirrored slot with its two components swapped
+    res[idx2+1] = p[2*i+2];
+    }
+  if (i!=ndone) // odd ndone: one unpaired entry remains
+    {
+    res[idx1  ] = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    }
+  }
+
+NOINLINE static void calc_first_half(size_t n, double * restrict res)
+  { // Builds the leading (n+1)>>1 pairs of res from an octant table computed for denominator 4*n.
+  int ndone=(n+1)>>1; // NOTE(review): size_t is narrowed to int here and for "in" below - confirm n always fits
+  double * p = res+n-1; // the octant table is generated inside res itself, starting at offset n-1
+  calc_first_octant(n<<2, p);
+  int i4=0, in=n, i=0; // i4 is kept equal to 4*i and serves as the index into the 4*n table
+  for (; i4<=in-i4; ++i, i4+=4) // octant 0
+    {
+    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
+    }
+  for (; i4-in <= 0; ++i, i4+=4) // octant 1
+    {
+    int xm = in-i4;
+    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; // components swapped relative to the table entry
+    }
+  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
+    {
+    int xm = i4-in;
+    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; // swapped, first component negated
+    }
+  for (; i<ndone; ++i, i4+=4) // octant 3
+    {
+    int xm = 2*in-i4;
+    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; // first component negated
+    }
+  }
+
+NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
+  { // Extends res in place: pairs below index n>>2 are mirrored into the slots above it with components swapped.
+  const double hsqt2 = 0.707106781186547524400844362104849; // sqrt(2)/2
+  size_t quart = n>>2;
+  if ((n&7)==0) // n divisible by 8: the pair at index quart has both components equal to sqrt(2)/2
+    res[quart] = res[quart+1] = hsqt2;
+  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
+    {
+    res[j  ] = res[i+1]; // mirrored entry: the two components trade places
+    res[j+1] = res[i  ];
+    }
+  }
+
+NOINLINE static void fill_first_half(size_t n, double * restrict res)
+  { // Extends res in place: pairs below index n>>1 are mirrored into the slots above it with the first component negated.
+  size_t half = n>>1;
+  if ((n&3)==0) // n divisible by 4: the pair at index half is exactly (0, 1)
+    { res[half] = 0.; res[half+1] = 1.; }
+  for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
+    {
+    res[j  ] = -res[i  ]; // first component changes sign
+    res[j+1] =  res[i+1]; // second component is kept
+    }
+  }
+
+NOINLINE static void fill_second_half(size_t n, double * restrict res)
+  { // Extends res in place: pairs below index n are mirrored into the slots above it with the second component negated.
+  if ((n&1)==0) // even n: the pair at index n is exactly (-1, 0)
+    { res[n] = -1.; res[n+1] = 0.; }
+  for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
+    {
+    res[j  ] =  res[i  ]; // first component is kept
+    res[j+1] = -res[i+1]; // second component changes sign
+    }
+  }
+
+NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+  { // Fills res with the pairs produced by the calc_*/fill_* helpers; presumably (cos, sin) of 2*pi*i/n for i in [0; n) - confirm against callers.
+  if ((n&3)==0) // n divisible by 4: compute one octant, then mirror up to the half
+    {
+    calc_first_octant(n, res);
+    fill_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else if ((n&1)==0) // n even but not divisible by 4: compute one quadrant, then mirror up to the half
+    {
+    calc_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else // odd n: the first half is computed without mirroring
+    calc_first_half(n, res);
+  fill_second_half(n, res); // all cases: the second half is derived from the first
+  }
+
+
+static size_t largest_prime_factor (size_t n)
+  { // Returns the largest prime factor of n by trial division (returns 1 for n==1).
+  size_t res=1;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n) // strip all factors of 2; NOTE(review): this condition never becomes false for n==0
+    { res=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01); // the 0.01 guards against sqrt rounding just below an exact integer root
+  for (size_t x=3; x<=limit; x+=2)
+  while (((tmp=(n/x))*x)==n) // x divides n
+    {
+    res=x;
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01); // the search bound shrinks together with n
+    }
+  if (n>1) res=n; // whatever is left has no divisor up to its square root, so it is the largest prime factor
+
+  return res;
+  }
+
+static double cost_guess (size_t n)
+  { // Returns a heuristic cost for length n: n times the sum of its prime factors, with factors above 5 weighted by lfp.
+  const double lfp=1.1; // penalty for non-hardcoded larger factors
+  size_t ni=n; // keep the original length; n is consumed by the factorization below
+  double result=0.;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n) // each factor of 2 contributes 2; NOTE(review): this condition never becomes false for n==0
+    { result+=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while ((tmp=(n/x))*x==n) // x divides n
+    {
+    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) result+=(n<=5) ? n : lfp*n; // remaining cofactor, weighted by the same rule
+
+  return result*ni;
+  }
+
+/* returns the smallest number >= n whose only prime factors are 2, 3, 5, 7 and 11 (n itself if n<=6) */
+static size_t good_size(size_t n)
+  {
+  if (n<=6) return n;
+
+  size_t bestfac=2*n; // initial upper bound; it only ever decreases below
+  for (size_t f2=1; f2<bestfac; f2*=2) // each nesting level multiplies in powers of one more prime
+    for (size_t f23=f2; f23<bestfac; f23*=3)
+      for (size_t f235=f23; f235<bestfac; f235*=5)
+        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
+          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
+            if (f235711>=n) bestfac=f235711; // the loop condition already ensures the candidate is below the current best
+  return bestfac;
+  }
+
+typedef struct cmplx { // complex number as a pair of doubles
+  double r,i; // real and imaginary part
+} cmplx;
+
+#define NFCT 25 // capacity of the per-plan factor list; cfftp_factorize fails when more factors are needed
+typedef struct cfftp_fctdata
+  {
+  size_t fct; // the factor (radix) handled by one pass
+  cmplx *tw, *tws; // tw is handed to the pass routines as "wa"; tws is handed only to passg, as "csarr"
+  } cfftp_fctdata;
+
+typedef struct cfftp_plan_i
+  {
+  size_t length, nfct; // transform length and number of valid entries in fct[]
+  cmplx *mem; // NOTE(review): presumably the backing storage for the tw/tws tables - confirm in the plan setup code
+  cfftp_fctdata fct[NFCT];
+  } cfftp_plan_i;
+typedef struct cfftp_plan_i * cfftp_plan;
+
+#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
+#define MPC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; }
+#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
+#define SCALEC(a,b) { a.r*=b; a.i*=b; }
+#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
+/* a = b*c */
+#define MULPMC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+#define MULMPC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+
+#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
+/* a = b*c */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
+/* a *= b */
+#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
+
+NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-2 pass: reads cc, writes ch; sign selects whether the twiddles in wa are applied as stored or conjugated.
+  const size_t cdim=2; // consumed by the CC() indexing macro
+
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k)) // i==0 is handled without a twiddle
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k)) // sum goes straight to the output, difference to t
+        MULPMSIGNC (CH(i,k,1),WA(0,i),t) // difference times the twiddle (conjugated when sign is -1)
+        }
+      }
+  }
+
+#define PREP3(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2; \
+        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
+        CH(idx,k,0).r=t0.r+t1.r; \
+        CH(idx,k,0).i=t0.i+t1.i;
+#define PARTSTEP3a(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+#define PARTSTEP3(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass3 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-3 pass: reads cc, writes ch; sign flips the imaginary part of the constant and of the twiddles.
+  const size_t cdim=3; // consumed by the CC() indexing macro
+  const double tw1r=-0.5, tw1i= sign * 0.86602540378443864676; // cos(2*pi/3) and sign*sin(2*pi/3)
+
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      { // extra scope: PREP3 declares t0,t1,t2, which are declared again inside the loop below
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i) // i==0 is handled without twiddles
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3(1,2,tw1r,tw1i) // same butterfly, followed by the twiddle multiplications
+        }
+      }
+  }
+
+NOINLINE static void pass4 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-4 pass: reads cc, writes ch; the sign>0 and sign<=0 variants differ only in PMC vs MPC and MULPMC vs MULMPC.
+  const size_t cdim=4; // consumed by the CC() indexing macro
+
+  if (ido==1) // no twiddle factors are applied in this case
+    if (sign>0)
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1, t2, t3, t4;
+        PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+        PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+        CONJFLIPC(t4) // t4 = (-t4.i, t4.r)
+        PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+        PMC (CH(0,k,1),CH(0,k,3),t1,t4)
+        }
+    else
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1, t2, t3, t4;
+        PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+        PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+        CONJFLIPC(t4)
+        PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+        MPC (CH(0,k,1),CH(0,k,3),t1,t4) // MPC instead of PMC: outputs 1 and 3 trade roles for this direction
+        }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      { // i==0 is handled without twiddles
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      CONJFLIPC(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMSIGNC (CH(0,k,1),CH(0,k,3),t1,t4) // sign-aware form, covers both directions in one statement
+      }
+      if (sign>0)
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx c2, c3, c4, t1, t2, t3, t4;
+          cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+          PMC(t2,t1,cc0,cc2)
+          PMC(t3,t4,cc1,cc3)
+          CONJFLIPC(t4)
+          cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+          PMC(CH(i,k,0),c3,t2,t3)
+          PMC (c2,c4,t1,t4)
+          MULPMC (CH(i,k,1),wa0,c2) // twiddles applied as stored
+          MULPMC (CH(i,k,2),wa1,c3)
+          MULPMC (CH(i,k,3),wa2,c4)
+          }
+      else
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx c2, c3, c4, t1, t2, t3, t4;
+          cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+          PMC(t2,t1,cc0,cc2)
+          PMC(t3,t4,cc1,cc3)
+          CONJFLIPC(t4)
+          cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+          PMC(CH(i,k,0),c3,t2,t3)
+          MPC (c2,c4,t1,t4)
+          MULMPC (CH(i,k,1),wa0,c2) // twiddles applied conjugated
+          MULMPC (CH(i,k,2),wa1,c3)
+          MULMPC (CH(i,k,3),wa2,c4)
+          }
+      }
+  }
+
+#define PREP5(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
+        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
+        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
+        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
+        CH(idx,k,0).i=t0.i+t1.i+t2.i;
+
+#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+#define PARTSTEP5(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass5 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-5 pass: reads cc, writes ch; sign flips the imaginary parts of the constants and of the twiddles.
+  const size_t cdim=5; // consumed by the CC() indexing macro
+  const double tw1r= 0.3090169943749474241, // cos(2*pi/5)
+               tw1i= sign * 0.95105651629515357212, // sign*sin(2*pi/5)
+               tw2r= -0.8090169943749474241, // cos(4*pi/5)
+               tw2i= sign * 0.58778525229247312917; // sign*sin(4*pi/5)
+
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i) // the explicit +/- on the last arguments is required: the macro pastes them as operator plus operand
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      { // extra scope: PREP5 declares t0..t4, which are declared again inside the loop below
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5(1,4,tw1r,tw2r,+tw1i,+tw2i) // same butterflies, followed by the twiddle multiplications
+        PARTSTEP5(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+
+#define PREP7(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
+        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
+        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
+        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
+
+#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
+        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
+        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
+        { \
+        cmplx da,db; \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-7 pass: reads cc, writes ch; sign flips the imaginary parts of the constants and of the twiddles.
+  const size_t cdim=7; // consumed by the CC() indexing macro
+  const double tw1r= 0.623489801858733530525, // cos(2*pi/7)
+               tw1i= sign * 0.7818314824680298087084, // sign*sin(2*pi/7)
+               tw2r= -0.222520933956314404289, // cos(4*pi/7)
+               tw2i= sign * 0.9749279121818236070181, // sign*sin(4*pi/7)
+               tw3r= -0.9009688679024191262361, // cos(6*pi/7)
+               tw3i= sign * 0.4338837391175581204758; // sign*sin(6*pi/7)
+
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) // the explicit +/- on the y arguments is required: the macro pastes them as operator plus operand
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      { // extra scope: PREP7 declares t1..t7, which are declared again inside the loop below
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP7(i)
+        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) // same butterflies, followed by the twiddle multiplications
+        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+        }
+      }
+  }
+
+#define PREP11(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
+        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
+        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
+        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
+        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
+        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
+
+#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
+        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
+        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        { \
+        cmplx da,db; \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  { // Radix-11 pass: reads cc, writes ch; sign flips the imaginary parts of the constants and of the twiddles.
+  const size_t cdim=11; // consumed by the CC() indexing macro
+  const double tw1r =        0.8412535328311811688618, // twXr / twXi: cos and sign*sin of X*2*pi/11
+               tw1i = sign * 0.5406408174555975821076,
+               tw2r =        0.4154150130018864255293,
+               tw2i = sign * 0.9096319953545183714117,
+               tw3r =       -0.1423148382732851404438,
+               tw3i = sign * 0.9898214418809327323761,
+               tw4r =       -0.6548607339452850640569,
+               tw4i = sign * 0.755749574354258283774,
+               tw5r =       -0.9594929736144973898904,
+               tw5i = sign * 0.2817325568414296977114;
+
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) // the explicit +/- on the y arguments is required by the macro
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      { // extra scope: PREP11 declares t1..t11, which are declared again inside the loop below
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP11(i)
+        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) // same butterflies, followed by the twiddle multiplications
+        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+        }
+      }
+  }
+
+#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CX2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
+  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
+  const cmplx * restrict csarr, const int sign)
+  { // Generic pass for factor ip: uses ch as scratch and leaves the result in cc; returns 0, or -1 if the temporary allocation fails.
+  const size_t cdim=ip; // consumed by the CC() indexing macro
+  size_t ipph = (ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  cmplx * restrict wal=RALLOC(cmplx,ip); // per-call copy of csarr with the imaginary parts multiplied by sign
+  if (!wal) return -1;
+  wal[0]=(cmplx){1.,0.};
+  for (size_t i=1; i<ip; ++i)
+    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
+
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // sums and differences of the input pairs (j, ip-j)
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=0; i<ido; ++i)
+        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      {
+      cmplx tmp = CH(i,k,0);
+      for (size_t j=1; j<ipph; ++j)
+        ADDC(tmp,tmp,CH(i,k,j))
+      CX(i,k,0) = tmp; // output 0 is the plain sum; from here on cc is being overwritten
+      }
+  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
+    {
+    // j=0
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
+      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
+      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
+      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
+      }
+
+    size_t iwal=2*l; // running index j*l into wal, wrapped by subtracting ip
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-1; j+=2, jc-=2) // two terms per iteration
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip; // NOTE(review): the wrap test is ">" not ">=", so this relies on iwal never being exactly ip - confirm
+      cmplx xwal=wal[iwal];
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal2=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
+        }
+      }
+    for (; j<ipph; ++j, --jc) // remaining terms, one at a time
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
+        }
+      }
+    }
+  DEALLOC(wal); // wal is not used past this point
+
+  // shuffling and twiddling
+  if (ido==1) // no twiddle factors are applied in this case
+    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        cmplx t1=CX2(ik,j), t2=CX2(ik,jc); // copies are needed because PMC writes to the same two slots
+        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
+        }
+  else
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
+        PMC(CX(0,k,j),CX(0,k,jc),t1,t2) // i==0 is handled without twiddles
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx x1, x2;
+          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
+          size_t idij=(j-1)*(ido-1)+i-1; // same index the WA(j-1,i) macro would produce
+          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
+          idij=(jc-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
+          }
+        }
+    }
+  return 0;
+  }
+
+#undef CH2
+#undef CX2
+#undef CX
+
+WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
+  const int sign)
+  { // Runs every pass of the plan on c in place and multiplies the result by fct; returns 0, or -1 on allocation failure.
+  if (plan->length==1) return 0; // NOTE(review): fct is not applied on this early return - confirm that is intended
+  size_t len=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  cmplx *ch = RALLOC(cmplx, len); // second buffer; the passes ping-pong between c and ch
+  if (!ch) return -1;
+  cmplx *p1=c, *p2=ch; // p1: input of the next pass, p2: its output
+
+  for(size_t k1=0; k1<nf; k1++)
+    {
+    size_t ip=plan->fct[k1].fct;
+    size_t l2=ip*l1;
+    size_t ido = len/l2;
+    if     (ip==4)  pass4 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==2)  pass2 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==3)  pass3 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==5)  pass5 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else
+      {
+      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
+        { DEALLOC(ch); return -1; }
+      SWAP(p1,p2,cmplx *); // passg leaves its result in p1, so swap once here to cancel the swap below
+      }
+    SWAP(p1,p2,cmplx *); // the output of this pass becomes the input of the next
+    l1=l2;
+    }
+  if (p1!=c) // the final result is in ch (p1 can only be c or ch): copy it back, scaling on the way if requested
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r = ch[i].r*fct;
+        c[i].i = ch[i].i*fct;
+        }
+    else
+      memcpy (c,p1,len*sizeof(cmplx));
+    }
+  else // the final result is already in c: only the scaling remains
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r *= fct;
+        c[i].i *= fct;
+        }
+  DEALLOC(ch);
+  return 0;
+  }
+
+// Retire helper macros of the complex-valued code above (PMC and MULPMSIGNC,
+// for instance, appear in the generic pass). WA, CC and CH are defined again
+// further down for the real-valued radix kernels.
+#undef PMSIGNC
+#undef MULPMC
+#undef MULMPC
+#undef MULPMSIGNC
+#undef MULPMSIGNCEQ
+
+#undef WA
+#undef CC
+#undef CH
+#undef CONJFLIPC
+#undef SCALEC
+#undef ADDC
+#undef MPC
+#undef PMC
+
+/* Runs pass_all with sign -1 on c, which is reinterpreted as an array of
+ * cmplx. Returns pass_all's status (0 on success). */
+WARN_UNUSED_RESULT
+static int cfftp_forward(cfftp_plan plan, double c[], double fct)
+  {
+  cmplx *data=(cmplx *)c;
+  return pass_all(plan, data, fct, -1);
+  }
+
+/* Runs pass_all with sign +1 on c, which is reinterpreted as an array of
+ * cmplx. Returns pass_all's status (0 on success). */
+WARN_UNUSED_RESULT
+static int cfftp_backward(cfftp_plan plan, double c[], double fct)
+  {
+  cmplx *data=(cmplx *)c;
+  return pass_all(plan, data, fct, 1);
+  }
+
+/* Split plan->length into radix factors and store them in plan->fct[].fct.
+ *
+ * Factors of 4 are pulled out first, then at most one factor of 2 (which is
+ * moved to the front of the list), then odd divisors found by trial division
+ * in increasing order. Whatever remains above 1 becomes the last factor.
+ *
+ * Returns 0 on success (plan->nfct is set), -1 if more than NFCT factors
+ * would be needed. */
+WARN_UNUSED_RESULT
+static int cfftp_factorize (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  // only divisors up to sqrt(length) need testing; the bound shrinks as
+  // factors are removed
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1)
+    {
+    // the leftover factor needs a slot as well: never write past fct[NFCT-1]
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=length;
+    }
+  plan->nfct=nfct;
+  return 0;
+  }
+
+/* Number of cmplx twiddle entries the plan needs: (ip-1)*(ido-1) for each
+ * factor ip, plus ip more for every factor above 11. */
+static size_t cfftp_twsize (cfftp_plan plan)
+  {
+  const size_t n=plan->length;
+  size_t total=0, stride=1;
+  for (size_t idx=0; idx<plan->nfct; ++idx)
+    {
+    const size_t radix=plan->fct[idx].fct;
+    stride*=radix;                 // product of this and all earlier factors
+    const size_t ido=n/stride;
+    total+=(radix-1)*(ido-1);
+    if (radix>11)
+      total+=radix;
+    }
+  return total;
+  }
+
+/* Fill plan->mem with the twiddle factors of every pass and point each
+ * plan->fct[k].tw (and .tws for radices above 11) at its slice.
+ *
+ * The slices consumed here add up to cfftp_twsize(plan) entries, so plan->mem
+ * must already be that large.
+ * Returns 0 on success, -1 if the temporary table cannot be allocated. */
+WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
+  {
+  const size_t n=plan->length;
+  double *table = RALLOC(double, 2*n);
+  if (!table) return -1;
+  // table[2*m], table[2*m+1]: the pair sincos_2pibyn produced for index m
+  // (presumably cos/sin of 2*pi*m/n - confirm against its definition)
+  sincos_2pibyn(n, table);
+
+  size_t offset=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    const size_t ip=plan->fct[k].fct;
+    const size_t ido=n/(l1*ip);
+    const size_t row=ido-1;          // entries per twiddle row
+
+    plan->fct[k].tw=plan->mem+offset;
+    offset+=(ip-1)*row;
+    for (size_t j=1; j<ip; ++j)
+      for (size_t i=1; i<ido; ++i)
+        {
+        const size_t src=2*j*l1*i;
+        const size_t dst=(j-1)*row+i-1;
+        plan->fct[k].tw[dst].r = table[src];
+        plan->fct[k].tw[dst].i = table[src+1];
+        }
+
+    if (ip>11)
+      {
+      // extra table of ip entries, stepping through table in units of l1*ido
+      plan->fct[k].tws=plan->mem+offset;
+      offset+=ip;
+      const size_t step=2*l1*ido;
+      for (size_t j=0; j<ip; ++j)
+        {
+        plan->fct[k].tws[j].r = table[j*step];
+        plan->fct[k].tws[j].i = table[j*step+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(table);
+  return 0;
+  }
+
+/* Allocate and initialise a complex plan for the given length.
+ *
+ * Returns NULL for length 0, on allocation failure, or if factorisation or
+ * twiddle computation fails. A length-1 plan carries no factors and no
+ * twiddle storage. Release the result with destroy_cfftp_plan. */
+static cfftp_plan make_cfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+
+  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
+  if (!plan) return NULL;
+
+  // start from an empty plan
+  plan->mem=0;
+  plan->nfct=0;
+  plan->length=length;
+  for (size_t k=0; k<NFCT; ++k)
+    plan->fct[k]=(cfftp_fctdata){0,0,0};
+
+  if (length>1)
+    {
+    if (cfftp_factorize(plan)!=0)
+      { DEALLOC(plan); return NULL; }
+    // NOTE(review): cfftp_twsize yields 0 for e.g. length 2 (ido==1, ip<=11),
+    // so this relies on RALLOC returning non-NULL for zero entries - confirm.
+    size_t tws=cfftp_twsize(plan);
+    plan->mem=RALLOC(cmplx,tws);
+    if (!plan->mem)
+      { DEALLOC(plan); return NULL; }
+    if (cfftp_comp_twiddle(plan)!=0)
+      { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+/* Release a plan obtained from make_cfftp_plan: first its twiddle storage,
+ * then the plan object itself. plan is dereferenced, so it must not be NULL. */
+static void destroy_cfftp_plan (cfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+/* One factor of a real-valued plan. */
+typedef struct rfftp_fctdata
+  {
+  size_t fct;        // the radix, filled in by rfftp_factorize
+  double *tw, *tws;  // twiddle tables; rfftp_comp_twiddle points them into the plan's mem
+  } rfftp_fctdata;
+
+/* Real-valued plan: transform length, its factors and their twiddle storage. */
+typedef struct rfftp_plan_i
+  {
+  size_t length, nfct;     // transform length and number of used fct[] entries
+  double *mem;             // single allocation backing all tw/tws tables
+  rfftp_fctdata fct[NFCT]; // factor list, at most NFCT entries
+  } rfftp_plan_i;
+typedef struct rfftp_plan_i * rfftp_plan;
+
+// Helper macros for the real-valued radix kernels below. They expect locals
+// named wa, cc, ch, ido, l1 and cdim in the function that uses them.
+// WA: entry i of twiddle row x; each row holds ido-1 doubles
+#define WA(x,i) wa[(i)+(x)*(ido-1)]
+// PM: a receives the sum, b the difference of c and d
+#define PM(a,b,c,d) { a=c+d; b=c-d; }
+/* (a+ib) = conj(c+id) * (e+if) */
+#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
+
+// Layout for radf2..radf5: cc uses l1, ch uses cdim as the middle extent
+#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
+
+/* Radix-2 kernel of the forward real transform; rfftp_forward selects it for
+ * a factor of 2.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 2)
+ * cc:      input, read through CC(i,k,j) with j in {0,1}
+ * ch:      output, written through CH(i,j,k)
+ * wa:      twiddle table, read through WA(0,.) in the i>=2 loop only */
+NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  // row 0: the sum goes to CH(0,0,k), the difference to CH(ido-1,1,k)
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
+  // even ido: the last input row is transferred without twiddles
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(    0,1,k) = -CC(ido-1,k,1);
+      CH(ido-1,0,k) =  CC(ido-1,k,0);
+      }
+  if (ido<=2) return;
+  // remaining rows are handled in (i-1,i) pairs;
+  // (tr2,ti2) = conj(WA(0,.)) * CC(.,k,1), see MULPM
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, ti2;
+      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
+      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
+      }
+  }
+
+/* Radix-3 kernel of the forward real transform; rfftp_forward selects it for
+ * a factor of 3.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 3)
+ * cc:      input, read through CC(i,k,j) with j in {0,1,2}
+ * ch:      output, written through CH(i,j,k)
+ * wa:      twiddle table, rows WA(0,.) and WA(1,.) */
+NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  // taur = -1/2 and taui = sqrt(3)/2, i.e. cos and sin of 120 degrees
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2=CC(0,k,1)+CC(0,k,2);
+    CH(0,0,k) = CC(0,k,0)+cr2;
+    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
+    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
+    }
+  if (ido==1) return;
+  // remaining rows are handled in (i-1,i) pairs, mirrored to (ic-1,ic)
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double di2, di3, dr2, dr3;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
+      double cr2=dr2+dr3; // c add
+      double ci2=di2+di3;
+      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
+      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
+      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
+      double ti2 = CC(i  ,k,0)+taur*ci2;
+      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
+      double ti3 = taui*(dr3-dr2);
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
+      }
+  }
+
+/* Radix-4 kernel of the forward real transform; rfftp_forward selects it for
+ * a factor of 4.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 4)
+ * cc:      input, read through CC(i,k,j) with j in 0..3
+ * ch:      output, written through CH(i,j,k)
+ * wa:      twiddle table, rows WA(0,.) to WA(2,.) */
+NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  // hsqt2 = sqrt(2)/2
+  static const double hsqt2=0.70710678118654752440;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1,tr2;
+    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
+    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
+    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
+    }
+  // even ido: the last input row uses the fixed factor hsqt2 instead of wa
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
+      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
+      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
+      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
+      }
+  if (ido<=2) return;
+  // remaining rows are handled in (i-1,i) pairs;
+  // (crN,ciN) = conj(WA(N-2,.)) * CC(.,k,N-1), see MULPM
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      PM(tr1,tr4,cr4,cr2)
+      PM(ti1,ti4,ci2,ci4)
+      PM(tr2,tr3,CC(i-1,k,0),cr3)
+      PM(ti2,ti3,CC(i  ,k,0),ci3)
+      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
+      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
+      }
+  }
+
+/* Radix-5 kernel of the forward real transform; rfftp_forward selects it for
+ * a factor of 5.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 5)
+ * cc:      input, read through CC(i,k,j) with j in 0..4
+ * ch:      output, written through CH(i,j,k)
+ * wa:      twiddle table, rows WA(0,.) to WA(3,.) */
+NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  // (tr11,ti11) = cos/sin of 72 degrees, (tr12,ti12) = cos/sin of 144 degrees
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2, cr3, ci4, ci5;
+    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
+    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
+    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
+    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
+    CH(0,2,k)=ti11*ci5+ti12*ci4;
+    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
+    CH(0,4,k)=ti12*ci5-ti11*ci4;
+    }
+  if (ido==1) return;
+  // remaining rows are handled in (i-1,i) pairs;
+  // (drN,diN) = conj(WA(N-2,.)) * CC(.,k,N-1), see MULPM
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
+         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+      size_t ic=ido-i;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
+      PM(cr2,ci5,dr5,dr2)
+      PM(ci2,cr5,di2,di5)
+      PM(cr3,ci4,dr4,dr3)
+      PM(ci3,cr4,di3,di4)
+      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
+      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
+      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
+      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
+      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
+      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
+      // MULPM is reused here purely as a 2x2 combination with (ti11,ti12)
+      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
+      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
+      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
+      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
+      }
+  }
+
+#undef CC
+#undef CH
+// Index macros for radfg. Relative to radf2..radf5 the layouts are traded:
+// here CC uses cdim and CH uses l1 as the middle extent. C1 indexes cc the
+// way CH indexes ch; C2/CH2 are two-index forms with row length idl1.
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+/* Generic-radix kernel of the forward real transform; rfftp_forward uses it
+ * for every factor other than 2, 3, 4 and 5.
+ *
+ * ido, ip, l1: extents used by the index macros; ip is the radix (cdim=ip)
+ * cc:    input, also used as workspace; the final stages write the result
+ *        back into cc (through CC), so the output ends up in cc, not in ch
+ * ch:    scratch array of the same size
+ * wa:    twiddle table, rows of ido-1 doubles read as (wa[idij], wa[idij+1])
+ * csarr: table read in pairs csarr[2*m], csarr[2*m+1] for m < ip */
+NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  // stage 1 (in cc): apply the twiddles to rows i>=1 and combine each
+  // column j with its partner jc=ip-j
+  if (ido>1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
+      {
+      size_t is=(j-1)*(ido-1),
+             is2=(jc-1)*(ido-1);
+      for (size_t k=0; k<l1; ++k)                            // 113
+        {
+        size_t idij=is;
+        size_t idij2=is2;
+        for (size_t i=1; i<=ido-2; i+=2)                      // 112
+          {
+          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
+                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
+          double x1=wa[idij]*t1 + wa[idij+1]*t2,
+                 x2=wa[idij]*t2 - wa[idij+1]*t1,
+                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
+                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
+          C1(i  ,k,j ) = x1+x3;
+          C1(i  ,k,jc) = x2-x4;
+          C1(i+1,k,j ) = x2+x4;
+          C1(i+1,k,jc) = x3-x1;
+          idij+=2;
+          idij2+=2;
+          }
+        }
+      }
+    }
+
+  // row 0 is combined without twiddles
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
+    for (size_t k=0; k<l1; ++k)                              // 122
+      {
+      double t1=C1(0,k,j), t2=C1(0,k,jc);
+      C1(0,k,j ) = t1+t2;
+      C1(0,k,jc) = t2-t1;
+      }
+
+//everything in C
+//memset(ch,0,ip*l1*ido*sizeof(double));
+
+  // stage 2 (cc -> ch): for each l, accumulate the csarr-weighted sums over
+  // j; iang is advanced by l per term and kept below ip. The j loop is
+  // unrolled by 4, then by 2, then runs singly for the remainder.
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
+    {
+    for (size_t ik=0; ik<idl1; ++ik)                         // 124
+      {
+      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
+      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
+      }
+    size_t iang = 2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-3; j+=4,jc-=4)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
+                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
+                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
+        }
+      }
+    for (; j<ipph-1; j+=2,jc-=2)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
+        }
+      }
+    for (; j<ipph; ++j,--jc)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar=csarr[2*iang], ai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar*C2(ik,j );
+        CH2(ik,lc) += ai*C2(ik,jc);
+        }
+      }
+    }
+  // column 0 of ch: plain sum of columns 0..ipph-1 of cc
+  for (size_t ik=0; ik<idl1; ++ik)                         // 101
+    CH2(ik,0) = C2(ik,0);
+  for (size_t j=1; j<ipph; ++j)                              // 129
+    for (size_t ik=0; ik<idl1; ++ik)                         // 128
+      CH2(ik,0) += C2(ik,j);
+
+// everything in CH at this point!
+//memset(cc,0,ip*l1*ido*sizeof(double));
+
+  // stage 3 (ch -> cc): write the result back in the CC layout
+  for (size_t k=0; k<l1; ++k)                                // 131
+    for (size_t i=0; i<ido; ++i)                             // 130
+      CC(i,0,k) = CH(i,k,0);
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)                              // 136
+      {
+      CC(ido-1,j2,k) = CH(0,k,j);
+      CC(0,j2+1,k) = CH(0,k,jc);
+      }
+    }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
+    {
+    size_t j2=2*j-1;
+    for(size_t k=0; k<l1; ++k)                               // 139
+      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
+        {
+        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
+        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
+        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
+        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
+        }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CH
+#undef CC
+// Index macros for radb2..radb5: ch uses l1, cc uses cdim as the middle extent
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+
+/* Radix-2 kernel of the backward real transform; rfftp_backward selects it
+ * for a factor of 2.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 2)
+ * cc:      input, read through CC(i,j,k) with j in {0,1}
+ * ch:      output, written through CH(i,k,j)
+ * wa:      twiddle table, read through WA(0,.) in the i>=2 loop only */
+NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  // row 0: sum and difference of CC(0,0,k) and CC(ido-1,1,k)
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
+  // even ido: the last output row is a doubled copy, without twiddles
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
+      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
+      }
+  if (ido<=2) return;
+  // remaining rows in (i-1,i) pairs; the MULPM argument order below stores
+  // (real,imag) = (CH(i-1,k,1),CH(i,k,1)) = WA(0,.) * (tr2,ti2)
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ti2, tr2;
+      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
+      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
+      }
+  }
+
+/* Radix-3 kernel of the backward real transform; rfftp_backward selects it
+ * for a factor of 3.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 3)
+ * cc:      input, read through CC(i,j,k) with j in {0,1,2}
+ * ch:      output, written through CH(i,k,j)
+ * wa:      twiddle table, rows WA(0,.) and WA(1,.) */
+NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  // taur = -1/2 and taui = sqrt(3)/2, i.e. cos and sin of 120 degrees
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr2=2.*CC(ido-1,1,k);
+    double cr2=CC(0,0,k)+taur*tr2;
+    CH(0,k,0)=CC(0,0,k)+tr2;
+    double ci3=2.*taui*CC(0,2,k);
+    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
+    }
+  if (ido==1) return;
+  // remaining rows are handled in (i-1,i) pairs, reading the mirrored
+  // partner rows (ic-1,ic)
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
+      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
+      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
+      double ci2=CC(i  ,0,k)+taur*ti2;
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
+      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
+      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
+      double di2, di3, dr2, dr3;
+      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
+      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      }
+  }
+
+/* Radix-4 kernel of the backward real transform; rfftp_backward selects it
+ * for a factor of 4.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 4)
+ * cc:      input, read through CC(i,j,k) with j in 0..3
+ * ch:      output, written through CH(i,k,j)
+ * wa:      twiddle table, rows WA(0,.) to WA(2,.) */
+NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double sqrt2=1.41421356237309504880;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1, tr2;
+    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
+    double tr3=2.*CC(ido-1,1,k);
+    double tr4=2.*CC(0,2,k);
+    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
+    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
+    }
+  // even ido: the last output row uses the fixed factor sqrt2 instead of wa
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double tr1,tr2,ti1,ti2;
+      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
+      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
+      CH(ido-1,k,0)=tr2+tr2;
+      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
+      CH(ido-1,k,2)=ti2+ti2;
+      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
+      }
+  if (ido<=2) return;
+  // remaining rows in (i-1,i) pairs; each MULPM stores
+  // (CH(i-1,k,N),CH(i,k,N)) = WA(N-1,.) * (crN+1,ciN+1) as (real,imag)
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      size_t ic=ido-i;
+      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
+      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
+      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
+      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
+      PM (CH(i-1,k,0),cr3,tr2,tr3)
+      PM (CH(i  ,k,0),ci3,ti2,ti3)
+      PM (cr4,cr2,tr1,tr4)
+      PM (ci2,ci4,ti1,ti4)
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
+      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
+      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
+      }
+  }
+
+/* Radix-5 kernel of the backward real transform; rfftp_backward selects it
+ * for a factor of 5.
+ *
+ * ido, l1: extents used by the CC/CH index macros (cdim is fixed to 5)
+ * cc:      input, read through CC(i,j,k) with j in 0..4
+ * ch:      output, written through CH(i,k,j)
+ * wa:      twiddle table, rows WA(0,.) to WA(3,.) */
+NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  // (tr11,ti11) = cos/sin of 72 degrees, (tr12,ti12) = cos/sin of 144 degrees
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  // row 0 needs no twiddles
+  for (size_t k=0; k<l1; k++)
+    {
+    double ti5=CC(0,2,k)+CC(0,2,k);
+    double ti4=CC(0,4,k)+CC(0,4,k);
+    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
+    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
+    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
+    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
+    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
+    double ci4, ci5;
+    // MULPM is used here purely as a 2x2 combination with (ti11,ti12)
+    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
+    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
+    }
+  if (ido==1) return;
+  // remaining rows are handled in (i-1,i) pairs, reading the mirrored
+  // partner rows (ic-1,ic); the final MULPMs apply the twiddles
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
+      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
+      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
+      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
+      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
+      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
+      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
+      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
+      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
+      double ci4, ci5, cr5, cr4;
+      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
+      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
+      PM(dr4,dr3,cr3,ci4)
+      PM(di3,di4,ci3,cr4)
+      PM(dr5,dr2,cr2,ci5)
+      PM(di2,di5,ci2,cr5)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
+      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
+      }
+  }
+
+#undef CC
+#undef CH
+// Index macros for radbg: CC uses cdim and CH uses l1 as the middle extent,
+// C1 indexes cc the way CH indexes ch, and C2/CH2 are two-index forms with
+// row length idl1.
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+/* Generic-radix kernel of the backward real transform; rfftp_backward uses
+ * it for every factor other than 2, 3, 4 and 5.
+ *
+ * ido, ip, l1: extents used by the index macros; ip is the radix (cdim=ip)
+ * cc:    input, read through CC; afterwards overwritten as workspace
+ * ch:    receives the result (the last stages write through CH)
+ * wa:    twiddle table, rows of ido-1 doubles read as (wa[idij], wa[idij+1])
+ * csarr: table of ip pairs, read as csarr[2*m], csarr[2*m+1] with m < ip */
+NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/ 2;
+  size_t idl1 = ido*l1;
+
+  // stage 1 (cc -> ch): unpack the CC layout into columns of ch
+  for (size_t k=0; k<l1; ++k)        // 102
+    for (size_t i=0; i<ido; ++i)     // 101
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = 2*CC(ido-1,j2,k);
+      CH(0,k,jc) = 2*CC(0,j2+1,k);
+      }
+    }
+
+  if (ido!=1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
+      {
+      size_t j2=2*j-1;
+      for (size_t k=0; k<l1; ++k)
+        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
+          {
+          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
+          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
+          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
+          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
+          }
+      }
+    }
+
+  // stage 2 (ch -> cc): for each l, accumulate the csarr-weighted sums over
+  // j. iang is advanced by l per term and wrapped with ">=" so that it always
+  // stays below ip: csarr holds ip pairs, so index 2*ip would be out of range.
+  // This matches the wrap used by radfg. The j loop is unrolled by 4, then by
+  // 2, then runs singly for the remainder.
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
+    {
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
+      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
+      }
+    size_t iang=2*l;
+    size_t j=3,jc=ip-3;
+    for(; j<ipph-3; j+=4,jc-=4)
+      {
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
+                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
+                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
+        }
+      }
+    for(; j<ipph-1; j+=2,jc-=2)
+      {
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
+        }
+      }
+    for(; j<ipph; ++j,--jc)
+      {
+      iang+=l; if(iang>=ip) iang-=ip;
+      double war=csarr[2*iang], wai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += war*CH2(ik,j );
+        C2(ik,lc) += wai*CH2(ik,jc);
+        }
+      }
+    }
+  // column 0 of ch: add columns 1..ipph-1 onto it
+  for (size_t j=1; j<ipph; ++j)
+    for (size_t ik=0; ik<idl1; ++ik)
+      CH2(ik,0) += CH2(ik,j);
+
+  // stage 3 (cc -> ch): recombine each column j with its partner jc=ip-j
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
+      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
+      }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
+        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
+        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
+        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
+        }
+
+// All in CH
+
+  // stage 4 (in ch): multiply rows i>=1 of every column j>=1 by the twiddles
+  for (size_t j=1; j<ip; ++j)
+    {
+    size_t is = (j-1)*(ido-1);
+    for (size_t k=0; k<l1; ++k)
+      {
+      size_t idij = is;
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        double t1=CH(i,k,j), t2=CH(i+1,k,j);
+        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
+        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
+        idij+=2;
+        }
+      }
+    }
+  }
+// End of the real-valued radix kernels: drop their helper macros.
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CC
+#undef CH
+#undef PM
+#undef MULPM
+#undef WA
+
+/* Store fct*p1[0..n-1] in c[0..n-1].
+ *
+ * When p1 and c are the same array the scaling happens in place, and nothing
+ * is touched if fct is 1. When they differ and fct is 1 the data is copied
+ * verbatim. */
+static void copy_and_norm(double *c, double *p1, size_t n, double fct)
+  {
+  const int scale = (fct!=1.);
+
+  if (p1==c)
+    {
+    // result already in place
+    if (scale)
+      for (size_t i=0; i<n; ++i)
+        c[i] *= fct;
+    return;
+    }
+
+  if (!scale)
+    {
+    memcpy (c,p1,n*sizeof(double));
+    return;
+    }
+
+  for (size_t i=0; i<n; ++i)
+    c[i] = fct*p1[i];
+  }
+
+/* Forward real transform of c (plan->length doubles), in place.
+ *
+ * The factors are applied from the last entry of plan->fct to the first, and
+ * the result is multiplied by fct.
+ * Returns 0 on success, -1 if the scratch buffer cannot be allocated. */
+WARN_UNUSED_RESULT
+static int rfftp_forward(rfftp_plan plan, double c[], double fct)
+  {
+  const size_t n=plan->length;
+  if (n==1) return 0;
+
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+
+  // src holds the current data, dst receives the output of the next pass
+  double *src=c, *dst=ch;
+  size_t l1=n;
+  for (size_t k=plan->nfct; k-->0; )
+    {
+    const size_t ip=plan->fct[k].fct;
+    const size_t ido=n/l1;
+    l1/=ip;
+    switch (ip)
+      {
+      case 4: radf4(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 2: radf2(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 3: radf3(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 5: radf5(ido, l1, src, dst, plan->fct[k].tw); break;
+      default:
+        radfg(ido, ip, l1, src, dst, plan->fct[k].tw, plan->fct[k].tws);
+        // radfg writes its result back into src: cancel the swap below
+        SWAP (src,dst,double *);
+        break;
+      }
+    SWAP (src,dst,double *);
+    }
+  copy_and_norm(c,src,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+/* Backward real transform of c (plan->length doubles), in place.
+ *
+ * The factors are applied in the order stored in plan->fct, and the result
+ * is multiplied by fct.
+ * Returns 0 on success, -1 if the scratch buffer cannot be allocated. */
+WARN_UNUSED_RESULT
+static int rfftp_backward(rfftp_plan plan, double c[], double fct)
+  {
+  const size_t n=plan->length;
+  if (n==1) return 0;
+
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+
+  // src holds the current data, dst receives the output of the next pass
+  double *src=c, *dst=ch;
+  const size_t nfac=plan->nfct;
+  size_t l1=1;
+  for (size_t k=0; k<nfac; ++k)
+    {
+    const size_t ip=plan->fct[k].fct;
+    const size_t ido=n/(ip*l1);
+    switch (ip)
+      {
+      case 4: radb4(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 2: radb2(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 3: radb3(ido, l1, src, dst, plan->fct[k].tw); break;
+      case 5: radb5(ido, l1, src, dst, plan->fct[k].tw); break;
+      default:
+        // unlike radfg in the forward direction, radbg leaves its result in
+        // dst, so no extra swap is needed here
+        radbg(ido, ip, l1, src, dst, plan->fct[k].tw, plan->fct[k].tws);
+        break;
+      }
+    SWAP (src,dst,double *);
+    l1*=ip;
+    }
+  copy_and_norm(c,src,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+/* Split plan->length into radix factors and store them in plan->fct[].fct.
+ *
+ * Factors of 4 are pulled out first, then at most one factor of 2 (which is
+ * moved to the front of the list), then odd divisors found by trial division
+ * in increasing order. Whatever remains above 1 becomes the last factor.
+ *
+ * Returns 0 on success (plan->nfct is set), -1 if more than NFCT factors
+ * would be needed. */
+WARN_UNUSED_RESULT
+static int rfftp_factorize (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  // only divisors up to sqrt(length) need testing; the bound shrinks as
+  // factors are removed
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1)
+    {
+    // the leftover factor needs a slot as well: never write past fct[NFCT-1]
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=length;
+    }
+  plan->nfct=nfct;
+  return 0;
+  }
+
+/* Number of doubles reserved for the plan's twiddle tables: (ip-1)*(ido-1)
+ * for each factor ip, plus 2*ip more for every factor above 5. */
+static size_t rfftp_twsize(rfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>5) twsize+=2*ip;
+    l1*=ip;
+    }
+  return twsize;
+  }
+
+/* Fill plan->mem with the twiddle factors and point each plan->fct[k].tw
+ * (all factors but the last) and .tws (factors above 5) at its slice.
+ *
+ * The slices consumed here never exceed rfftp_twsize(plan) doubles, so
+ * plan->mem must already be that large.
+ * Returns 0 on success, -1 if the temporary table cannot be allocated. */
+WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan)
+  {
+  const size_t n=plan->length;
+  double *table = RALLOC(double, 2*n);
+  if (!table) return -1;
+  // table[2*m], table[2*m+1]: the pair sincos_2pibyn produced for index m
+  // (presumably cos/sin of 2*pi*m/n - confirm against its definition)
+  sincos_2pibyn(n, table);
+
+  double *next=plan->mem;          // first unused slot of the twiddle storage
+  const size_t last=plan->nfct-1;
+  size_t l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    const size_t ip=plan->fct[k].fct;
+    const size_t ido=n/(l1*ip);
+    if (k<last) // last factor doesn't need twiddles
+      {
+      double *tw=next;
+      plan->fct[k].tw=tw;
+      next+=(ip-1)*(ido-1);
+      for (size_t j=1; j<ip; ++j)
+        {
+        double *row=tw+(j-1)*(ido-1);
+        for (size_t i=1; i<=(ido-1)/2; ++i)
+          {
+          row[2*i-2] = table[2*j*l1*i];
+          row[2*i-1] = table[2*j*l1*i+1];
+          }
+        }
+      }
+    if (ip>5) // special factors required by *g functions
+      {
+      double *tws=next;
+      plan->fct[k].tws=tws;
+      next+=2*ip;
+      const size_t step=n/ip;
+      for (size_t i=0; i<ip; ++i)
+        {
+        tws[2*i  ] = table[2*i*step];
+        tws[2*i+1] = table[2*i*step+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(table);
+  return 0;
+  }
+
+/* Allocate and initialise a real-valued plan for the given length.
+ *
+ * Returns NULL for length 0, on allocation failure, or if factorisation or
+ * twiddle computation fails. A length-1 plan carries no factors and no
+ * twiddle storage. Release the result with destroy_rfftp_plan. */
+static rfftp_plan make_rfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+
+  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
+  if (!plan) return NULL;
+
+  // start from an empty plan
+  plan->mem=NULL;
+  plan->nfct=0;
+  plan->length=length;
+  for (size_t k=0; k<NFCT; ++k)
+    plan->fct[k]=(rfftp_fctdata){0,0,0};
+
+  if (length>1)
+    {
+    if (rfftp_factorize(plan)!=0)
+      { DEALLOC(plan); return NULL; }
+    size_t tws=rfftp_twsize(plan);
+    plan->mem=RALLOC(double,tws);
+    if (!plan->mem)
+      { DEALLOC(plan); return NULL; }
+    if (rfftp_comp_twiddle(plan)!=0)
+      { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+/* Release a plan obtained from make_rfftp_plan: first its twiddle storage,
+ * then the plan object itself. plan is dereferenced, so it must not be NULL. */
+static void destroy_rfftp_plan (rfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+/* Plan used by the fftblue_* routines. */
+typedef struct fftblue_plan_i
+  {
+  size_t n, n2;     // requested length, and good_size(2*n-1) used for the inner FFT
+  cfftp_plan plan;  // complex plan of length n2
+  double *mem;      // single allocation backing bk and bkf
+  double *bk, *bkf; // bk: n complex values; bkf: n2 complex values (follows bk in mem)
+  } fftblue_plan_i;
+typedef struct fftblue_plan_i * fftblue_plan;
+
+/* Allocate and initialise an fftblue plan for the given length.
+ *
+ * Sets up the coefficient table bk (n complex values), its zero-padded,
+ * normalised and forward-transformed counterpart bkf (n2 complex values) and
+ * the inner complex plan of length n2 = good_size(2*n-1).
+ *
+ * Returns NULL for length 0, on allocation failure, or if the inner plan
+ * cannot be created or run. Release the result with destroy_fftblue_plan. */
+static fftblue_plan make_fftblue_plan (size_t length)
+  {
+  // as in make_cfftp_plan / make_rfftp_plan; also keeps n*2-1 from wrapping
+  if (length==0) return NULL;
+  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
+  if (!plan) return NULL;
+  plan->n = length;
+  plan->n2 = good_size(plan->n*2-1);
+  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  plan->bk  = plan->mem;
+  plan->bkf = plan->bk+2*plan->n;
+
+/* initialize b_k */
+  double *tmp = RALLOC(double,4*plan->n);
+  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  sincos_2pibyn(2*plan->n,tmp);
+  plan->bk[0] = 1;
+  plan->bk[1] = 0;
+
+  // coeff runs through 1+3+...+(2m-1) = m*m, reduced modulo 2n
+  size_t coeff=0;
+  for (size_t m=1; m<plan->n; ++m)
+    {
+    coeff+=2*m-1;
+    if (coeff>=2*plan->n) coeff-=2*plan->n;
+    plan->bk[2*m  ] = tmp[2*coeff  ];
+    plan->bk[2*m+1] = tmp[2*coeff+1];
+    }
+  // the table is not needed past this point; freeing it here keeps it out of
+  // the error paths below
+  DEALLOC(tmp);
+
+  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+  double xn2 = 1./plan->n2;
+  plan->bkf[0] = plan->bk[0]*xn2;
+  plan->bkf[1] = plan->bk[1]*xn2;
+  // entries 1..n-1 are mirrored to the end of the array (indices n2-1 downwards)
+  for (size_t m=2; m<2*plan->n; m+=2)
+    {
+    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
+    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
+    }
+  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
+    plan->bkf[m]=0.;
+  plan->plan=make_cfftp_plan(plan->n2);
+  if (!plan->plan)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
+    {
+    // the inner plan exists at this point and must be released as well
+    destroy_cfftp_plan(plan->plan);
+    DEALLOC(plan->mem);
+    DEALLOC(plan);
+    return NULL;
+    }
+
+  return plan;
+  }
+
+/* Release a plan obtained from make_fftblue_plan: the inner complex plan,
+ * the coefficient storage and finally the plan object itself. */
+static void destroy_fftblue_plan (fftblue_plan plan)
+  {
+  destroy_cfftp_plan(plan->plan);
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+/* Transform the n complex values in c (interleaved re/im) in place, using
+ * the plan's bk/bkf tables and its inner complex plan of length n2.
+ *
+ * isign: >0 multiplies by bk and convolves with conj(bkf); otherwise the
+ *        conjugated counterparts are used
+ * fct:   scaling factor, handed to the inner forward transform
+ *
+ * Returns 0 on success, -1 if the work array cannot be allocated or one of
+ * the inner transforms fails. */
+WARN_UNUSED_RESULT
+static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
+  {
+  const size_t n=plan->n, n2=plan->n2;
+  const double *bk=plan->bk, *bkf=plan->bkf;
+  const int positive=(isign>0);
+
+  double *akf = RALLOC(double, 2*n2);
+  if (!akf) return -1;
+
+/* initialize a_k and FFT it */
+  for (size_t m=0; m<2*n; m+=2)
+    {
+    const double cr=c[m], ci=c[m+1], br=bk[m], bi=bk[m+1];
+    if (positive)
+      { // a = c * b
+      akf[m]   = cr*br - ci*bi;
+      akf[m+1] = cr*bi + ci*br;
+      }
+    else
+      { // a = c * conj(b)
+      akf[m]   = cr*br + ci*bi;
+      akf[m+1] =-cr*bi + ci*br;
+      }
+    }
+  // zero-pad up to n2 complex values
+  for (size_t m=2*n; m<2*n2; ++m)
+    akf[m]=0;
+
+  if (cfftp_forward (plan->plan,akf,fct)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* do the convolution */
+  for (size_t m=0; m<2*n2; m+=2)
+    {
+    const double ar=akf[m], ai=akf[m+1], br=bkf[m], bi=bkf[m+1];
+    if (positive)
+      { // a *= conj(bkf)
+      akf[m  ] =  ar*br + ai*bi;
+      akf[m+1] = -ar*bi + ai*br;
+      }
+    else
+      { // a *= bkf
+      akf[m  ] = ar*br - ai*bi;
+      akf[m+1] = ar*bi + ai*br;
+      }
+    }
+
+/* inverse FFT */
+  if (cfftp_backward (plan->plan,akf,1.)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* multiply by b_k */
+  for (size_t m=0; m<2*n; m+=2)
+    {
+    const double br=bk[m], bi=bk[m+1], ar=akf[m], ai=akf[m+1];
+    if (positive)
+      { // c = b * a
+      c[m]   = br*ar - bi*ai;
+      c[m+1] = bi*ar + br*ai;
+      }
+    else
+      { // c = conj(b) * a
+      c[m]   = br*ar + bi*ai;
+      c[m+1] =-bi*ar + br*ai;
+      }
+    }
+  DEALLOC(akf);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,1,fct); } /* backward = fftblue_fft with isign=+1; result is passed through */
+
+WARN_UNUSED_RESULT
+static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,-1,fct); } /* forward = fftblue_fft with isign=-1; result is passed through */
+
+WARN_UNUSED_RESULT
+static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
+  { /* Expands the n values of c into a 2*n-double buffer, runs fftblue_fft(isign=1), keeps the even entries; 0 or -1. */
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  tmp[0]=c[0];
+  tmp[1]=0.;
+  memcpy (tmp+2,c+1, (n-1)*sizeof(double)); /* c[1..n-1] -> tmp[2..n] */
+  if ((n&1)==0) tmp[n+1]=0.; /* tmp[n+1] is not written by the copy above; cleared for even n */
+  for (size_t m=2; m<n; m+=2) /* mirror pairs into the upper half, negating the odd-indexed entry */
+    {
+    tmp[2*n-m]=tmp[m];
+    tmp[2*n-m+1]=-tmp[m+1];
+    }
+  if (fftblue_fft(plan,tmp,1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  for (size_t m=0; m<n; ++m) /* only the even-indexed results are copied back */
+    c[m] = tmp[2*m];
+  DEALLOC(tmp);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
+  { /* Interleaves c with zeros, runs fftblue_fft(isign=-1), packs the result back into c; returns 0 or -1. */
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  for (size_t m=0; m<n; ++m)
+    {
+    tmp[2*m] = c[m];
+    tmp[2*m+1] = 0.;
+    }
+  if (fftblue_fft(plan,tmp,-1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  c[0] = tmp[0];
+  memcpy (c+1, tmp+2, (n-1)*sizeof(double)); /* tmp[1] is skipped; tmp[2..n] -> c[1..n-1] */
+  DEALLOC(tmp);
+  return 0;
+  }
+
+typedef struct cfft_plan_i
+  { /* make_cfft_plan() zeroes both members and then sets exactly one of them */
+  cfftp_plan packplan;
+  fftblue_plan blueplan;
+  } cfft_plan_i;
+
+cfft_plan make_cfft_plan (size_t length)
+  { /* Returns a new plan, or NULL if length is 0 or an allocation / sub-plan creation fails. */
+  if (length==0) return NULL;
+  cfft_plan plan = RALLOC(cfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) /* these lengths always get the cfftp plan */
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = cost_guess(length); /* estimate for length itself */
+  double comp2 = 2*cost_guess(good_size(2*length-1)); /* twice the estimate for good_size(2*length-1) */
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+/* Releases a plan created by make_cfft_plan(); a NULL plan is a no-op, like free(NULL). */
+void destroy_cfft_plan (cfft_plan plan)
+  {
+  if (!plan) return; /* make_cfft_plan() may have returned NULL */
+  if (plan->blueplan) destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan) destroy_cfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct)
+  { /* Forwards to the sub-plan that is set and returns its result. */
+  if (plan->packplan)
+    return cfftp_backward(plan->packplan,c,fct);
+  // otherwise plan->blueplan is used without a further check
+  return cfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct)
+  { /* Forwards to the sub-plan that is set and returns its result. */
+  if (plan->packplan)
+    return cfftp_forward(plan->packplan,c,fct);
+  // otherwise plan->blueplan is used without a further check
+  return cfftblue_forward(plan->blueplan,c,fct);
+  }
+
+typedef struct rfft_plan_i
+  { /* make_rfft_plan() zeroes both members and then sets exactly one of them */
+  rfftp_plan packplan;
+  fftblue_plan blueplan;
+  } rfft_plan_i;
+
+rfft_plan make_rfft_plan (size_t length)
+  { /* Returns a new plan, or NULL if length is 0 or an allocation / sub-plan creation fails. */
+  if (length==0) return NULL;
+  rfft_plan plan = RALLOC(rfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) /* these lengths always get the rfftp plan */
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = 0.5*cost_guess(length); /* half the estimate used by make_cfft_plan() for the same length */
+  double comp2 = 2*cost_guess(good_size(2*length-1)); /* twice the estimate for good_size(2*length-1) */
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+/* Releases a plan created by make_rfft_plan(); a NULL plan is a no-op, like free(NULL). */
+void destroy_rfft_plan (rfft_plan plan)
+  {
+  if (!plan) return; /* make_rfft_plan() may have returned NULL */
+  if (plan->blueplan) destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan) destroy_rfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+size_t rfft_length(rfft_plan plan)
+  { /* Returns the length stored in whichever sub-plan is set. */
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+size_t cfft_length(cfft_plan plan)
+  { /* Returns the length stored in whichever sub-plan is set. */
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct)
+  { /* Forwards to the sub-plan that is set and returns its result. */
+  if (plan->packplan)
+    return rfftp_backward(plan->packplan,c,fct);
+  else // plan->blueplan is used without a further check
+    return rfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct)
+  { /* Forwards to the sub-plan that is set and returns its result. */
+  if (plan->packplan)
+    return rfftp_forward(plan->packplan,c,fct);
+  else // plan->blueplan is used without a further check
+    return rfftblue_forward(plan->blueplan,c,fct);
+  }
diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h
new file mode 100644
index 0000000..9eb3985
--- /dev/null
+++ b/pocketfft/pocketfft.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*! \file pocketfft.h
+ *  Public interface of the pocketfft library
+ *
+ *  Copyright (C) 2008-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef POCKETFFT_H
+#define POCKETFFT_H
+
+#include <stdlib.h>
+
+struct cfft_plan_i;
+typedef struct cfft_plan_i * cfft_plan; /* handle to a struct that is only forward-declared in this header */
+cfft_plan make_cfft_plan (size_t length); /* returns NULL for length 0 or when plan creation fails */
+void destroy_cfft_plan (cfft_plan plan);
+int cfft_backward(cfft_plan plan, double c[], double fct); /* transforms c in place; returns the sub-plan's status */
+int cfft_forward(cfft_plan plan, double c[], double fct); /* transforms c in place; returns the sub-plan's status */
+size_t cfft_length(cfft_plan plan); /* length the plan was created for */
+
+struct rfft_plan_i;
+typedef struct rfft_plan_i * rfft_plan; /* handle to a struct that is only forward-declared in this header */
+rfft_plan make_rfft_plan (size_t length); /* returns NULL for length 0 or when plan creation fails */
+void destroy_rfft_plan (rfft_plan plan);
+int rfft_backward(rfft_plan plan, double c[], double fct); /* transforms c in place; returns the sub-plan's status */
+int rfft_forward(rfft_plan plan, double c[], double fct); /* transforms c in place; returns the sub-plan's status */
+size_t rfft_length(rfft_plan plan); /* length the plan was created for */
+
+#endif

From dce3c2b430a4e48adeea9fafcf3f24ab9efc63bd Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 26 Oct 2018 11:58:32 +0200
Subject: [PATCH 04/85] fixes

---
 libsharp/sharp_announce.c |  4 ++--
 libsharp/sharp_core.c     | 15 +++++++++++++--
 libsharp/sharp_core.h     |  2 ++
 libsharp/sharp_core_avx.c |  8 ++++----
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/libsharp/sharp_announce.c b/libsharp/sharp_announce.c
index bc6ee50..7027167 100644
--- a/libsharp/sharp_announce.c
+++ b/libsharp/sharp_announce.c
@@ -40,7 +40,7 @@
 #endif
 
 #include "sharp_announce.h"
-#include "sharp_vecutil.h"
+#include "sharp_core.h"
 
 static void OpenMP_status(void)
   {
@@ -70,7 +70,7 @@ static void MPI_status(void)
   }
 
 static void vecmath_status(void)
-  { printf("Supported vector length: %d\n",VLEN); }
+  { printf("Supported vector length: %d\n",sharp_veclen()); }
 
 void sharp_announce (const char *name)
   {
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f052555..8d75893 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -33,7 +33,7 @@
 #include "sharp_core_inc0.c"
 #undef ARCH
 
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
 void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim);
@@ -43,7 +43,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
   __builtin_cpu_init();
   if (__builtin_cpu_supports("avx"))
     inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
@@ -51,3 +51,14 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
 #endif
     inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   }
+
+int sharp_veclen(void)
+  { /* Returns 4 if the runtime AVX branch is compiled in and the CPU reports AVX; otherwise the compile-time VLEN. */
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+  __builtin_cpu_init();
+  if (__builtin_cpu_supports("avx"))
+    return 4; /* same check under which inner_loop() calls inner_loop_avx() */
+  else
+#endif
+    return VLEN;
+  }
diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h
index 1e86488..f641125 100644
--- a/libsharp/sharp_core.h
+++ b/libsharp/sharp_core.h
@@ -43,6 +43,8 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim);
 
+int sharp_veclen(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
index dc6ee48..a250b49 100644
--- a/libsharp/sharp_core_avx.c
+++ b/libsharp/sharp_core_avx.c
@@ -1,14 +1,14 @@
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
 // if we arrive here, we can benefit from an additional AVX version
-#warning entering gcc and x86_64 specific code branch
+// #warning entering gcc and x86_64 specific code branch
 
 #define ARCH _avx
-#define __AVX__
+//#define __AVX__
 #pragma GCC push_options
 #pragma GCC target("avx")
 #include "sharp_core_inc0.c"
 #pragma GCC pop_options
-#undef __AVX__
+//#undef __AVX__
 #undef ARCH
 
 #endif

From 18c82762c3a4262d363c4feadeb79fbb693d00b9 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 26 Oct 2018 14:36:25 +0200
Subject: [PATCH 05/85] updates

---
 c_utils/c_utils.c              |   6 +-
 c_utils/c_utils.h              |   8 +-
 libsharp/sharp.c               | 195 +++++++++++++++++++++------------
 libsharp/sharp_almhelpers.c    |   2 +-
 libsharp/sharp_almhelpers.h    |   2 +-
 libsharp/sharp_complex_hacks.h |  31 +++++-
 libsharp/sharp_core_inc.c      |  93 ++++++++++------
 libsharp/sharp_core_inc0.c     |   3 +-
 libsharp/sharp_core_inc2.c     | 193 +++++++++++++++++---------------
 libsharp/sharp_cxx.h           |  70 ++++++++++--
 libsharp/sharp_geomhelpers.c   |   6 +-
 libsharp/sharp_testsuite.c     |   8 +-
 libsharp/sharp_vecsupport.h    |  14 ++-
 libsharp/sharp_ylmgen_c.c      |   9 +-
 libsharp/sharp_ylmgen_c.h      |   3 +-
 15 files changed, 424 insertions(+), 219 deletions(-)

diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c
index 96bd765..9344a6d 100644
--- a/c_utils/c_utils.c
+++ b/c_utils/c_utils.c
@@ -25,7 +25,7 @@
 /*
  *  Convenience functions
  *
- *  Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg)
 
 /* This function tries to avoid allocations with a total size close to a high
    power of two (called the "critical stride" here), by adding a few more bytes
-   if necssary. This lowers the probability that two arrays differ by a multiple
+   if necessary. This lowers the probability that two arrays differ by a multiple
    of the critical stride in their starting address, which in turn lowers the
    risk of cache line contention. */
 static size_t manipsize(size_t sz)
@@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz)
   {
   void *res;
   if (sz==0) return NULL;
-  res = _mm_malloc(manipsize(sz),16);
+  res = _mm_malloc(manipsize(sz),32);
   UTIL_ASSERT(res,"_mm_malloc() failed");
   return res;
   }
diff --git a/c_utils/c_utils.h b/c_utils/c_utils.h
index 0503449..01c64ad 100644
--- a/c_utils/c_utils.h
+++ b/c_utils/c_utils.h
@@ -25,7 +25,7 @@
 /*! \file c_utils.h
  *  Convenience functions
  *
- *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
  *  \author Martin Reinecke
  *  \note This file should only be included from .c files, NOT from .h files.
  */
@@ -144,4 +144,10 @@ void util_free_ (void *ptr);
 }
 #endif
 
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#else
+#define NOINLINE
+#endif
+
 #endif
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index b1b9277..884a644 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -25,11 +25,12 @@
 /*! \file sharp.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2016 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
 #include <math.h>
+#include <string.h>
 #include "pocketfft/pocketfft.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
@@ -63,7 +64,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
   *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
   }
 
-int sharp_get_mlim (int lmax, int spin, double sth, double cth)
+NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
   {
   double ofs=lmax*0.01;
   if (ofs<100.) ofs=100.;
@@ -83,12 +84,13 @@ typedef struct
   dcmplx *shiftarr;
   int s_shift;
   rfft_plan plan;
+  int length;
   int norot;
   } ringhelper;
 
 static void ringhelper_init (ringhelper *self)
   {
-  static ringhelper rh_null = { 0, NULL, 0, NULL, 0 };
+  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
   *self = rh_null;
   }
 
@@ -99,7 +101,7 @@ static void ringhelper_destroy (ringhelper *self)
   ringhelper_init(self);
   }
 
-static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
   {
   self->norot = (fabs(phi0)<1e-14);
   if (!(self->norot))
@@ -110,12 +112,15 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
       self->phi0_ = phi0;
       for (int m=0; m<=mmax; ++m)
         self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
+//      double *tmp=(double *) self->shiftarr;
+//      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
       }
   if (!self->plan) self->plan=make_rfft_plan(nph);
-  if (nph!=(int)rfft_length(self->plan))
+  if (nph!=(int)self->length)
     {
     destroy_rfft_plan(self->plan);
     self->plan=make_rfft_plan(nph);
+    self->length=nph;
     }
   }
 
@@ -127,6 +132,7 @@ static int ringinfo_compare (const void *xa, const void *xb)
 static int ringpair_compare (const void *xa, const void *xb)
   {
   const sharp_ringpair *a=xa, *b=xb;
+//  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
   if (a->r1.nph==b->r1.nph)
     return (a->r1.phi0 < b->r1.phi0) ? -1 :
       ((a->r1.phi0 > b->r1.phi0) ? 1 :
@@ -261,6 +267,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info)
    distribution are permissible. */
 static int sharp_get_mmax (int *mval, int nm)
   {
+  //FIXME: if gaps are allowed, we have to search the maximum m in the array
   int *mcheck=RALLOC(int,nm);
   SET_ARRAY(mcheck,0,nm,0);
   for (int i=0; i<nm; ++i)
@@ -274,7 +281,7 @@ static int sharp_get_mmax (int *mval, int nm)
   return nm-1;
   }
 
-static void ringhelper_phase2ring (ringhelper *self,
+NOINLINE static void ringhelper_phase2ring (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
   int pstride, int flags)
   {
@@ -288,13 +295,19 @@ static void ringhelper_phase2ring (ringhelper *self,
 
   if (nph>=2*mmax+1)
     {
-    for (int m=0; m<=mmax; ++m)
-      {
-      dcmplx tmp = phase[m*pstride]*wgt;
-      if(!self->norot) tmp*=self->shiftarr[m];
-      data[2*m]=creal(tmp);
-      data[2*m+1]=cimag(tmp);
-      }
+    if (self->norot)
+      for (int m=0; m<=mmax; ++m)
+        {
+        data[2*m]=creal(phase[m*pstride])*wgt;
+        data[2*m+1]=cimag(phase[m*pstride])*wgt;
+        }
+    else
+      for (int m=0; m<=mmax; ++m)
+        {
+        dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
+        data[2*m]=creal(tmp)*wgt;
+        data[2*m+1]=cimag(tmp)*wgt;
+        }
     for (int m=2*(mmax+1); m<nph+2; ++m)
       data[m]=0.;
     }
@@ -326,7 +339,7 @@ static void ringhelper_phase2ring (ringhelper *self,
   rfft_backward (self->plan, &(data[1]), 1.);
   }
 
-static void ringhelper_ring2phase (ringhelper *self,
+NOINLINE static void ringhelper_ring2phase (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
   int pstride, int flags)
   {
@@ -376,7 +389,7 @@ static void ringhelper_ring2phase (ringhelper *self,
     phase[m*pstride]=0.;
   }
 
-static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
+NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
   int flags)
   {
   if (flags & SHARP_NO_FFT)
@@ -386,50 +399,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
       if (flags&SHARP_DP)
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       else
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       }
     }
   else
     {
-    for (int j=0;j<ginfo->npairs;++j)
+    if (flags&SHARP_DP)
       {
-      if (flags&SHARP_DP)
+      for (int j=0;j<ginfo->npairs;++j)
         {
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+        double *dmap=(double *)map;
+        if (ginfo->pair[j].r1.stride==1)
+          memset(&dmap[ginfo->pair[j].r1.ofs],0,
+            ginfo->pair[j].r1.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
+            dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
+        if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
+          memset(&dmap[ginfo->pair[j].r2.ofs],0,
+            ginfo->pair[j].r2.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
+            dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
-      else
+      }
+    else
+      {
+      for (int j=0;j<ginfo->npairs;++j)
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       }
     }
   }
 
-static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
+NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
+  int flags)
   {
 #define CLEARLOOP(real_t,body)             \
       {                                    \
@@ -465,7 +483,7 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
     }
   }
 
-static void init_output (sharp_job *job)
+NOINLINE static void init_output (sharp_job *job)
   {
   if (job->flags&SHARP_ADD) return;
   if (job->type == SHARP_MAP2ALM)
@@ -473,21 +491,21 @@ static void init_output (sharp_job *job)
       clear_alm (job->ainfo,job->alm[i],job->flags);
   else
     for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      fill_map (job->ginfo,job->map[i],0.,job->flags);
+      clear_map (job->ginfo,job->map[i],job->flags);
   }
 
-static void alloc_phase (sharp_job *job, int nm, int ntheta)
+NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
   {
   if (job->type==SHARP_MAP2ALM)
     {
-    if ((nm&1023)==0) nm+=3; // hack to avoid critical strides
     job->s_m=2*job->ntrans*job->nmaps;
+    if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
     job->s_th=job->s_m*nm;
     }
   else
     {
-    if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides
     job->s_th=2*job->ntrans*job->nmaps;
+    if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
     job->s_m=job->s_th*ntheta;
     }
   job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
@@ -502,22 +520,28 @@ static void alloc_almtmp (sharp_job *job, int lmax)
 static void dealloc_almtmp (sharp_job *job)
   { DEALLOC(job->almtmp); }
 
-static void alm2almtmp (sharp_job *job, int lmax, int mi)
+NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
   {
 
-#define COPY_LOOP(real_t, source_t, expr_of_x)                      \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)            \
+#define COPY_LOOP(real_t, source_t, expr_of_x)              \
+  {                                                         \
+  for (int l=m; l<lmin; ++l)                                \
+    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
+      job->almtmp[job->ntrans*job->nalm*l+i] = 0;           \
+  for (int l=lmin; l<=lmax; ++l)                            \
     for (int i=0; i<job->ntrans*job->nalm; ++i)             \
       {                                                     \
-        source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
-        job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
-      }
+      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
+      job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x;   \
+      }                                                     \
+  }
 
   if (job->type!=SHARP_MAP2ALM)
     {
     ptrdiff_t ofs=job->ainfo->mvstart[mi];
     int stride=job->ainfo->stride;
     int m=job->ainfo->mval[mi];
+    int lmin=(m<job->spin) ? job->spin : m;
     /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
        coefficients by sqrt_one_half; here we must compensate to avoid scaling
        m=0 */
@@ -562,17 +586,17 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
       }
     }
   else
-    SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi],
-              job->ntrans*job->nalm*(lmax+1),0.);
+    memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
+      job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
 
 #undef COPY_LOOP
   }
 
-static void almtmp2alm (sharp_job *job, int lmax, int mi)
+NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
   {
 
 #define COPY_LOOP(real_t, target_t, expr_of_x)               \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)             \
+  for (int l=lmin; l<=lmax; ++l)                             \
     for (int i=0; i<job->ntrans*job->nalm; ++i)              \
       {                                                      \
         dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i];   \
@@ -583,6 +607,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
   ptrdiff_t ofs=job->ainfo->mvstart[mi];
   int stride=job->ainfo->stride;
   int m=job->ainfo->mval[mi];
+  int lmin=(m<job->spin) ? job->spin : m;
   /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
      coefficients by sqrt_two; here we must compensate to avoid scaling
      m=0 */
@@ -629,27 +654,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
   }
 
-static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
+  const double *ringtmp, int rstride)
   {
-  double **dmap = (double **)job->map;
-  float  **fmap = (float  **)job->map;
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      if (job->flags & SHARP_DP)
-        dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1];
+  if (job->flags & SHARP_DP)
+    {
+    double **dmap = (double **)job->map;
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      {
+      double *restrict p1=&dmap[i][ri->ofs];
+      const double *restrict p2=&ringtmp[i*rstride+1];
+      if (ri->stride==1)
+        {
+        if (job->flags&SHARP_ADD)
+          for (int m=0; m<ri->nph; ++m)
+            p1[m] += p2[m];
+        else
+          memcpy(p1,p2,ri->nph*sizeof(double));
+        }
       else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m*ri->stride] += p2[m];
+      }
+    }
+  else
+    {
+    float  **fmap = (float  **)job->map;
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
         fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
+    }
   }
 
-static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
+  double *ringtmp, int rstride)
   {
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ?
-        ((double *)(job->map[i]))[ri->ofs+m*ri->stride] :
-        ((float  *)(job->map[i]))[ri->ofs+m*ri->stride];
+  if (job->flags & SHARP_DP)
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      {
+      double *restrict p1=&ringtmp[i*rstride+1],
+             *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
+      if (ri->stride==1)
+        memcpy(p1,p2,ri->nph*sizeof(double));
+      else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m] = p2[m*ri->stride];
+      }
+  else
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
+        ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
   }
 
 static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@@ -693,7 +747,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
   }
 
 //FIXME: set phase to zero if not SHARP_MAP2ALM?
-static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type != SHARP_MAP2ALM) return;
   int pstride = job->s_m;
@@ -738,7 +792,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
     }
   }
 
-static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type == SHARP_MAP2ALM) return;
   int pstride = job->s_m;
@@ -783,7 +837,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
     }
   }
 
-static void sharp_execute_job (sharp_job *job)
+NOINLINE static void sharp_execute_job (sharp_job *job)
   {
   double timer=wallTime();
   job->opcnt=0;
@@ -800,6 +854,7 @@ static void sharp_execute_job (sharp_job *job)
   int nchunks, chunksize;
   get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
     &chunksize);
+//FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
 
 /* chunk loop */
diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c
index 6a98309..12ce600 100644
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2008-2013 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 3bff317..67016d7 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
  *  SHARP helper function for the creation of a_lm data structures
  *
- *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
index 9227ca9..86d1153 100644
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@@ -25,7 +25,7 @@
 /*  \file sharp_complex_hacks.h
  *  support for converting vector types and complex numbers
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -51,6 +51,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
   complex double * restrict c1, complex double * restrict c2)
   { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } /* same sums as vhsum_cmplx2, accumulated into cc[0] and cc[1] */
+
 #endif
 
 #if (VLEN==2)
@@ -94,6 +98,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
 #endif
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { vhsum_cmplx2(a,b,c,d,cc,cc+1); } /* forwards to vhsum_cmplx2 with c1=cc and c2=cc+1 */
+
 #endif
 
 #if (VLEN==4)
@@ -130,6 +138,23 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
 #endif
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { /* NOTE(review): appears to add sum(a)+i*sum(b) to cc[0] and sum(c)+i*sum(d) to cc[1] - confirm lane order against the intrinsics docs */
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
+  tmp1=vadd(tmp3,tmp4);
+#ifdef UNSAFE_CODE
+  _mm256_storeu_pd((double *)cc,
+    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1)); /* accesses cc[0..1] as four consecutive doubles */
+#else
+  union {Tv v; complex double c[2]; } u; /* reads the vector back as two complex doubles */
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1];
+#endif
+  }
+
 #endif
 
 #if (VLEN==8)
@@ -144,6 +169,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
   *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { vhsum_cmplx2(a,b,c,d,cc,cc+1); } /* forwards to vhsum_cmplx2 with c1=cc and c2=cc+1 */
+
 #endif
 
 #endif
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 747658c..8a36bfe 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core_inc.c
  *  Type-dependent code for the computational core
  *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -73,8 +73,8 @@ static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
 static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
   double maxval)
   {
-  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
+  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
   for (int i=0;i<nvec; ++i)
     {
     Tm mask = vgt(vabs(val->v[i]),vfmax);
@@ -94,35 +94,58 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
     }
   }
 
-static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
-  Tb * restrict ress)
+NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
+  Tb * restrict resd, Tb * restrict ress)
   {
-  Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
-
-  Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
-
-  do
+  Tv vminv=vload(powlimit[npow]);
+  Tm mask = vlt(vabs(val.v[0]),vminv);
+  for (int i=1;i<nvec; ++i)
+    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
+  if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
     {
-    if (npow&1)
+    Tb res=Y(Tbconst)(1.);
+    do
       {
+      if (npow&1)
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vmuleq(val.v[i],val.v[i]);
+          }
+      else
+        for (int i=0; i<nvec; ++i)
+          vmuleq(val.v[i],val.v[i]);
+      }
+    while(npow>>=1);
+    *resd=res;
+    *ress=Y(Tbconst)(0.);
+    }
+  else
+    {
+    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
+    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+    do
+      {
+      if (npow&1)
+        {
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vaddeq(scale.v[i],scaleint.v[i]);
+          }
+        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
+        }
       for (int i=0; i<nvec; ++i)
         {
-        vmuleq(res.v[i],val.v[i]);
-        vaddeq(scale.v[i],scaleint.v[i]);
+        vmuleq(val.v[i],val.v[i]);
+        vaddeq(scaleint.v[i],scaleint.v[i]);
         }
-      Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
+      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
       }
-    for (int i=0; i<nvec; ++i)
-      {
-      vmuleq(val.v[i],val.v[i]);
-      vaddeq(scaleint.v[i],scaleint.v[i]);
-      }
-    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+    while(npow>>=1);
+    *resd=res;
+    *ress=scale;
     }
-  while(npow>>=1);
-
-  *resd=res;
-  *ress=scale;
   }
 
 static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
@@ -179,13 +202,13 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
   *corfac=corf.b;
   }
 
-static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
+NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
   Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
   const sharp_Ylmgen_C * restrict gen)
   {
   int l=gen->m;
   Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
-  Y(mypow) (sth,l,&lam_2,&scale);
+  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
   Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
   Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
 
@@ -193,12 +216,12 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
+      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(gen->rf[l].f[1])*lam_1.v[i];
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
     if (Y(rescale)(&lam_1,&lam_2,&scale))
       below_limit = Y(TballLt)(scale,sharp_limscale);
     l+=2;
@@ -213,10 +236,8 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
   Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
   for (int i=0; i<nvec; ++i)
     {
-    rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
-                vmul(fx2,rxp->v[i]));
-    rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
-                vmul(fx2,rxm->v[i]));
+    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
+    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
     }
   }
 
@@ -240,8 +261,10 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
     }
 
   Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-  Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
-  Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
+  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
 
   Tb rec2p, rec2m, scalep, scalem;
   Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 8590d2c..7a34e40 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core_inc0.c
  *  Computational core
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -34,7 +34,6 @@
 #include <string.h>
 #include "sharp_vecsupport.h"
 #include "sharp_complex_hacks.h"
-#include "sharp_ylmgen_c.h"
 #include "sharp.h"
 #include "sharp_core.h"
 #include "c_utils.h"
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
index 5c9b4ab..9a2e26b 100644
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@@ -25,11 +25,11 @@
 /*! \file sharp_core_inc2.c
  *  Type-dependent code for the computational core
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
-static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
   Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax NJ1)
@@ -77,29 +77,32 @@ if (njobs>1)
   }
   while (l<lmax)
     {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
       {
       Tv ar=vload(creal(alm[njobs*l+j])),
          ai=vload(cimag(alm[njobs*l+j]));
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
-        }
-      ar=vload(creal(alm[njobs*(l+1)+j]));
-      ai=vload(cimag(alm[njobs*(l+1)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
-        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
+        p1[j].i.v[i] += lam_2.v[i]*ai;
         }
       }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l+1].f[1])*lam_2.v[i];
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
+         ai=vload(cimag(alm[njobs*(l+1)+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        p2[j].r.v[i] += lam_1.v[i]*ar;
+        p2[j].i.v[i] += lam_1.v[i]*ai;
+        }
+      }
     l+=2;
     }
   if (l==lmax)
@@ -109,64 +112,57 @@ if (njobs>1)
       Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
+        p1[j].i.v[i] += lam_2.v[i]*ai;
         }
       }
     }
   }
 
-static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
+NOINLINE static void Z(map2alm_kernel) (const Tb cth,
+  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
   NJ1)
   {
   while (l<lmax)
     {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
         }
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
-        }
-      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l+1].f[1])*lam_2.v[i];
+    for (int j=0; j<njobs; ++j)
+      for (int i=0; i<nvec; ++i)
+        {
+        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
+        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
+        }
     l+=2;
     }
   if (l==lmax)
     {
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     }
   }
 
-static void Z(calc_alm2map) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
   Y(Tbri) * restrict p2 NJ1)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
@@ -219,12 +215,12 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
   }
 
-static void Z(calc_map2alm) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2 NJ1)
+  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
   {
   int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
   int l=gen->m;
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
@@ -234,40 +230,31 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   Tb corfac;
   Y(getCorfac)(scale,&corfac,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
   int full_ieee = Y(TballGe)(scale,sharp_minscale);
   while (!full_ieee)
     {
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p1[j].r.v[i]);
-        vfmaeq(tim,tmp,p1[j].i.v[i]);
+        Tv tmp=lam_2.v[i]*corfac.v[i];
+        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     if (++l>lmax) return;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l-1].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p2[j].r.v[i]);
-        vfmaeq(tim,tmp,p2[j].i.v[i]);
+        Tv tmp=lam_1.v[i]*corfac.v[i];
+        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     if (++l>lmax) return;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l-1].f[1])*lam_2.v[i];
     if (Y(rescale)(&lam_1,&lam_2,&scale))
       {
       Y(getCorfac)(scale,&corfac,gen->cf);
@@ -276,7 +263,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
     }
 
   Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
   }
 
 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
@@ -317,8 +304,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
        acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
     for (int i=0; i<nvec; ++i)
       {
-      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
-      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
+      Tv lw1=r2p.v[i]+r2m.v[i];
+      Tv lx2=r1m.v[i]-r1p.v[i];
       vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
       vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
       vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
@@ -326,8 +313,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
       }
     for (int i=0; i<nvec; ++i)
       {
-      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
-      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
+      Tv lx1=r2m.v[i]-r2p.v[i];
+      Tv lw2=r1p.v[i]+r1m.v[i];
       vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
       vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
       vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
@@ -359,11 +346,11 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
       vfmaeq(acr,py[j].qi.v[i],lx);
       vfmseq(aci,py[j].qr.v[i],lx);
       }
-    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
+    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
     }
   }
 
-static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax NJ1)
@@ -374,10 +361,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
        fx2=vload(fx[l+1].f[2]);
     for (int i=0; i<nvec; ++i)
       {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
+      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
+      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
       }
     Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
       &alm[2*njobs*(l+1)] NJ2);
@@ -385,10 +370,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
       {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
+      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
+      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
       }
     l+=2;
     }
@@ -396,7 +379,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
   }
 
-static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
   const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
   NJ1)
@@ -429,7 +412,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
     Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
   }
 
-static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2 NJ1)
   {
@@ -475,7 +458,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
     lmax NJ2);
   }
 
-static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
+NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
   const sharp_Ylmgen_C * restrict gen, sharp_job *job,
   const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
   {
@@ -539,7 +522,7 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     }
   }
 
-static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax NJ1)
@@ -572,7 +555,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
   }
 
-static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2 NJ1)
   {
@@ -621,7 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
 
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
-static void Z(inner_loop) (sharp_job *job, const int *ispair,
+NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
   {
@@ -722,10 +705,30 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
         }
       break;
       }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
     case SHARP_MAP2ALM:
       {
       if (job->spin==0)
         {
+        Tv atmp[2*njobs*(gen->lmax+1)];
+        memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
           Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
@@ -751,8 +754,15 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
               }
             }
           if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
           }
+        {
+        int istart=m*njobs, istop=(gen->lmax+1)*njobs;
+        for(; istart<istop-2; istart+=2)
+          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
+        for(; istart<istop; istart++)
+          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
+        }
         }
       else
         {
@@ -800,4 +810,13 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
     }
   }
 
+static void Z(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  { /* dispatcher: map2alm jobs go to the m2a loop, all others to a2m */
+  if (job->type==SHARP_MAP2ALM)
+    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
+  else Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
+  }
+
 #undef VZERO
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index f8b2365..2c37505 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -25,13 +25,14 @@
 /*! \file sharp_cxx.h
  *  Spherical transform library
  *
- *  Copyright (C) 2012-2015 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
 #ifndef PLANCK_SHARP_CXX_H
 #define PLANCK_SHARP_CXX_H
 
+#include <complex>
 #include "sharp_lowlevel.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
@@ -107,19 +108,30 @@ template<typename T> class sharp_cxxjob: public sharp_base
   private:
     static void *conv (T *ptr)
       { return reinterpret_cast<void *>(ptr); }
+    static void *conv (std::complex<T> *ptr)
+      { return reinterpret_cast<void *>(ptr); }
     static void *conv (const T *ptr)
       { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
+    static void *conv (const std::complex<T> *ptr)
+      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
 
   public:
-    void alm2map (const T *alm, T *map, bool add)
+    void alm2map (const T *alm, T *map, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
         flags,0,0);
       }
-    void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
-      int spin, bool add)
+    void alm2map (const std::complex<T> *alm, T *map, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
+        flags,0,0);
+      }
+    void alm2map_spin (const T *alm1, const T *alm2,
+      T *map1, T *map2, int spin, bool add) const
       {
       void *aptr[2], *mptr[2];
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
@@ -127,21 +139,65 @@ template<typename T> class sharp_cxxjob: public sharp_base
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
       }
-    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
+    void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
+      T *map1, T *map2, int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
       {
       void *aptr=conv(alm), *mptr[2];
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
       }
-    void map2alm (const T *map, T *alm, bool add)
+    void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
+      const
+      {
+      void *aptr=conv(alm), *mptr[2];
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_adjoint (const T *map, T *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm (const T *map, T *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm (const T *map, std::complex<T> *alm, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
       }
     void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
-      int spin, bool add)
+      int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
+      std::complex<T> *alm2, int spin, bool add) const
       {
       void *aptr[2], *mptr[2];
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index 0aed60d..8efb8a0 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -25,9 +25,8 @@
 /*! \file sharp_geomhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2012 Max-Planck-Society<br>
- *  Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
- *  \author Martin Reinecke \author Pavel Holoborodko
+ *  Copyright (C) 2006-2018 Max-Planck-Society<br>
+ *  \author Martin Reinecke
  */
 
 #include <math.h>
@@ -35,7 +34,6 @@
 #include "sharp_legendre_roots.h"
 #include "c_utils.h"
 #include "pocketfft/pocketfft.h"
-#include <stdio.h>
 
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
   const int *rings, const double *weight, sharp_geom_info **geom_info)
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 2b1c7af..f02f9fd 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -23,7 +23,7 @@
  */
 
 /*  \file sharp_testsuite.c
- * 
+ *
  *  Copyright (C) 2012-2013 Max-Planck-Society
  *  \author Martin Reinecke
  */
@@ -50,9 +50,9 @@ typedef complex double dcmplx;
 
 int ntasks, mytask;
 
-static double drand (double min, double max, int *state)
+static double drand (double min, double max, unsigned *state)
   {
-  *state = (((*state) * 1103515245) + 12345) & 0x7fffffff;
+  *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu;
   return min + (max-min)*(*state)/(0x7fffffff+1.0);
   }
 
@@ -65,7 +65,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
   for (mi=0;mi<helper->nm; ++mi)
     {
     int m=helper->mval[mi];
-    int state=1234567*cnt+8912*m; // random seed
+    unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed
     for (int l=m;l<=helper->lmax; ++l)
       {
       if ((l<spin)&&(m<spin))
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ee4c5e7..5250948 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
  *  Convenience functions for vector arithmetics
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -72,6 +72,7 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
+#define vor_mask(a,b) ((a)||(b))
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)
 
@@ -138,6 +139,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vge(a,b) _mm_cmpge_pd(a,b)
 #define vne(a,b) _mm_cmpneq_pd(a,b)
 #define vand_mask(a,b) _mm_and_pd(a,b)
+#define vor_mask(a,b) _mm_or_pd(a,b)
 #define vmin(a,b) _mm_min_pd(a,b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
@@ -183,6 +185,13 @@ typedef __m256d Tm;
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
+#if (USE_FMA)
+#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
+#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
+#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
+#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
+#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
+#else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
@@ -191,6 +200,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #endif
+#endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
 #define vload_s(a) _mm256_set1_ps(a)
@@ -201,6 +211,7 @@ typedef __m256d Tm;
 #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
 #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) _mm256_and_pd(a,b)
+#define vor_mask(a,b) _mm256_or_pd(a,b)
 #define vmin(a,b) _mm256_min_pd(a,b)
 #define vmax(a,b) _mm256_max_pd(a,b)
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
@@ -242,6 +253,7 @@ typedef __mmask8 Tm;
 #define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
 #define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
 #define vand_mask(a,b) ((a)&(b))
+#define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)
 #define vmax(a,b) _mm512_max_pd(a,b)
 #define vanyTrue(a) (a!=0)
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 6e8cee5..785e063 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -25,7 +25,7 @@
 /*
  *  Helper code for efficient calculation of Y_lm(theta,phi=0)
  *
- *  Copyright (C) 2005-2014 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -59,6 +59,12 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
   for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
     gen->cf[m]=gen->cf[m-1]*sharp_fbig;
+  gen->powlimit=RALLOC(double,m_max+spin+1);
+  gen->powlimit[0]=0.;
+  const double ln2 = 0.6931471805599453094172321214581766;
+  const double expo=-400*ln2;
+  for (int m=1; m<=m_max+spin; ++m)
+    gen->powlimit[m]=exp(expo/m);
 
   gen->m = -1;
   if (spin==0)
@@ -124,6 +130,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
 void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   {
   DEALLOC(gen->cf);
+  DEALLOC(gen->powlimit);
   if (gen->s==0)
     {
     DEALLOC(gen->rf);
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 3328f76..63b23cd 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
  *  Code for efficient calculation of Y_lm(phi=0,theta)
  *
- *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -49,6 +49,7 @@ typedef struct
 /* for public use; immutable during lifetime */
   int lmax, mmax, s;
   double *cf;
+  double *powlimit;
 
 /* for public use; will typically change after call to Ylmgen_prepare() */
   int m;

From 1c5c9eb5790e6221d6b3fef315d254febd80ef08 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 26 Oct 2018 16:06:30 +0200
Subject: [PATCH 06/85] updates

---
 c_utils/walltime_c.c         | 15 ++++++++++++++-
 libsharp/sharp_cxx.h         |  6 +++---
 libsharp/sharp_geomhelpers.c |  2 +-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/c_utils/walltime_c.c b/c_utils/walltime_c.c
index c9dce3a..8f4ac0c 100644
--- a/c_utils/walltime_c.c
+++ b/c_utils/walltime_c.c
@@ -25,7 +25,7 @@
 /*
  *  Functionality for reading wall clock time
  *
- *  Copyright (C) 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2010-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -33,6 +33,8 @@
 #include <omp.h>
 #elif defined (USE_MPI)
 #include "mpi.h"
+#elif defined (_WIN32)
+#include <windows.h>
 #else
 #include <sys/time.h>
 #include <stdlib.h>
@@ -46,6 +48,17 @@ double wallTime(void)
   return omp_get_wtime();
 #elif defined (USE_MPI)
   return MPI_Wtime();
+#elif defined (_WIN32)
+  static double inv_freq = -1.;
+  if (inv_freq<0)
+    {
+    LARGE_INTEGER freq;
+    QueryPerformanceFrequency(&freq);
+    inv_freq = 1. / (double)freq.QuadPart; /* seconds per counter tick */
+    }
+  LARGE_INTEGER count;
+  QueryPerformanceCounter(&count);
+  return count.QuadPart*inv_freq;
 #else
   struct timeval t;
   gettimeofday(&t, NULL);
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index 2c37505..f0c2738 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -25,7 +25,7 @@
 /*! \file sharp_cxx.h
  *  Spherical transform library
  *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -48,8 +48,8 @@ class sharp_base
       : ainfo(0), ginfo(0) {}
     ~sharp_base()
       {
-      sharp_destroy_geom_info(ginfo);
-      sharp_destroy_alm_info(ainfo);
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      if (ainfo) sharp_destroy_alm_info(ainfo);
       }
 
     void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs,
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index 8efb8a0..0f6af39 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -25,7 +25,7 @@
 /*! \file sharp_geomhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2018 Max-Planck-Society<br>
+ *  Copyright (C) 2006-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 

From ff1ff8c25eeb63677cb8d71d25ba828a08f53c21 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 6 Nov 2018 15:27:28 +0100
Subject: [PATCH 07/85] better dispatching

---
 libsharp/sharp_core.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 8d75893..fbe83c8 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -34,6 +34,18 @@
 #undef ARCH
 
 #if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+
+static int have_avx(void)
+  {
+  static int res=-1;
+  if (res<0)
+    {
+    __builtin_cpu_init();
+    res = __builtin_cpu_supports("avx");
+    }
+  return res;
+  }
+
 void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim);
@@ -44,8 +56,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const int *mlim)
   {
 #if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
-  __builtin_cpu_init();
-  if (__builtin_cpu_supports("avx"))
+  if (have_avx())
     inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   else
 #endif
@@ -55,8 +66,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
 int sharp_veclen(void)
   {
 #if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
-  __builtin_cpu_init();
-  if (__builtin_cpu_supports("avx"))
+  if (have_avx())
     return 4;
   else
 #endif

From 4e9b37ab3ae40f10eb3881fd0475a890f2e19c46 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 29 Nov 2018 08:46:17 +0100
Subject: [PATCH 08/85] sync with pocketfft master

---
 pocketfft/pocketfft.c | 386 ++++++++++++++++++++++++++++--------------
 1 file changed, 258 insertions(+), 128 deletions(-)

diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c
index 562ebc9..de1af3e 100644
--- a/pocketfft/pocketfft.c
+++ b/pocketfft/pocketfft.c
@@ -31,15 +31,6 @@
 #define WARN_UNUSED_RESULT
 #endif
 
-#if 0
-static void fracsincos(size_t m, size_t n, double *restrict res)
-  {
-  static const long double twopi=6.283185307179586476925286766559006L;
-  long double arg = twopi*(long double)m/((long double)n);
-  res[0] = (double)cosl(arg); res[1] = (double)sinl(arg);
-  }
-#endif
-
 // adapted from https://stackoverflow.com/questions/42792939/
 // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
 static void my_sincosm1pi (double a, double *restrict res)
@@ -162,26 +153,33 @@ NOINLINE static void fill_first_half(size_t n, double * restrict res)
   {
   size_t half = n>>1;
   if ((n&3)==0)
-    { res[half] = 0.; res[half+1] = 1.; }
-  for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
-    {
-    res[j  ] = -res[i  ];
-    res[j+1] =  res[i+1];
-    }
+    for (size_t i=0; i<half; i+=2)
+      {
+      res[i+half]   = -res[i+1];
+      res[i+half+1] =  res[i  ];
+      }
+  else
+    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
+      {
+      res[j  ] = -res[i  ];
+      res[j+1] =  res[i+1];
+      }
   }
 
 NOINLINE static void fill_second_half(size_t n, double * restrict res)
   {
   if ((n&1)==0)
-    { res[n] = -1.; res[n+1] = 0.; }
-  for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
-    {
-    res[j  ] =  res[i  ];
-    res[j+1] = -res[i+1];
-    }
+    for (size_t i=0; i<n; ++i)
+      res[i+n] = -res[i];
+  else
+    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
+      {
+      res[j  ] =  res[i  ];
+      res[j+1] = -res[i+1];
+      }
   }
 
-NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
   {
   if ((n&3)==0)
     {
@@ -196,11 +194,15 @@ NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
     }
   else
     calc_first_half(n, res);
+  }
+
+NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+  {
+  sincos_2pibyn_half(n, res);
   fill_second_half(n, res);
   }
 
-
-static size_t largest_prime_factor (size_t n)
+NOINLINE static size_t largest_prime_factor (size_t n)
   {
   size_t res=1;
   size_t tmp;
@@ -220,7 +222,7 @@ static size_t largest_prime_factor (size_t n)
   return res;
   }
 
-static double cost_guess (size_t n)
+NOINLINE static double cost_guess (size_t n)
   {
   const double lfp=1.1; // penalty for non-hardcoded larger factors
   size_t ni=n;
@@ -242,8 +244,8 @@ static double cost_guess (size_t n)
   return result*ni;
   }
 
-/* returns the smallest composite of 2, 3 and 5 which is >= n */
-static size_t good_size(size_t n)
+/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
+NOINLINE static size_t good_size(size_t n)
   {
   if (n<=6) return n;
 
@@ -277,16 +279,17 @@ typedef struct cfftp_plan_i
 typedef struct cfftp_plan_i * cfftp_plan;
 
 #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-#define MPC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; }
 #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
 #define SCALEC(a,b) { a.r*=b; a.i*=b; }
-#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
 #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
 /* a = b*c */
-#define MULPMC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
-#define MULMPC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+/* a = conj(b)*c*/
+#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
 
 #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
 /* a = b*c */
@@ -294,8 +297,8 @@ typedef struct cfftp_plan_i * cfftp_plan;
 /* a *= b */
 #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
 
-NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
   {
   const size_t cdim=2;
 
@@ -310,7 +313,28 @@ NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc,
         {
         cmplx t;
         PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
-        MULPMSIGNC (CH(i,k,1),WA(0,i),t)
+        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
         }
       }
   }
@@ -329,7 +353,8 @@ NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc,
         cb.r=-(twi*t2.i); \
         PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
         }
-#define PARTSTEP3(u1,u2,twr,twi) \
+
+#define PARTSTEP3b(u1,u2,twr,twi) \
         { \
         cmplx ca,cb,da,db; \
         ca.r=t0.r+twr*t1.r; \
@@ -337,15 +362,14 @@ NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc,
         cb.i=twi*t2.r; \
         cb.r=-(twi*t2.i); \
         PMC(da,db,ca,cb) \
-        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
-        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
         }
-
-NOINLINE static void pass3 (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
   {
   const size_t cdim=3;
-  const double tw1r=-0.5, tw1i= sign * 0.86602540378443864676;
+  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
 
   if (ido==1)
     for (size_t k=0; k<l1; ++k)
@@ -363,37 +387,63 @@ NOINLINE static void pass3 (size_t ido, size_t l1, const cmplx * restrict cc,
       for (size_t i=1; i<ido; ++i)
         {
         PREP3(i)
-        PARTSTEP3(1,2,tw1r,tw1i)
+        PARTSTEP3b(1,2,tw1r,tw1i)
+        }
+      }
+  }
+#define PARTSTEP3f(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3f(1,2,tw1r,tw1i)
         }
       }
   }
 
-NOINLINE static void pass4 (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
   {
   const size_t cdim=4;
 
   if (ido==1)
-    if (sign>0)
-      for (size_t k=0; k<l1; ++k)
-        {
-        cmplx t1, t2, t3, t4;
-        PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-        PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-        CONJFLIPC(t4)
-        PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-        PMC (CH(0,k,1),CH(0,k,3),t1,t4)
-        }
-    else
-      for (size_t k=0; k<l1; ++k)
-        {
-        cmplx t1, t2, t3, t4;
-        PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-        PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-        CONJFLIPC(t4)
-        PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-        MPC (CH(0,k,1),CH(0,k,3),t1,t4)
-        }
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
   else
     for (size_t k=0; k<l1; ++k)
       {
@@ -401,40 +451,66 @@ NOINLINE static void pass4 (size_t ido, size_t l1, const cmplx * restrict cc,
       cmplx t1, t2, t3, t4;
       PMC(t2,t1,CC(0,0,k),CC(0,2,k))
       PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      CONJFLIPC(t4)
+      ROT90(t4)
       PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMSIGNC (CH(0,k,1),CH(0,k,3),t1,t4)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
       }
-      if (sign>0)
-        for (size_t i=1; i<ido; ++i)
-          {
-          cmplx c2, c3, c4, t1, t2, t3, t4;
-          cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
-          PMC(t2,t1,cc0,cc2)
-          PMC(t3,t4,cc1,cc3)
-          CONJFLIPC(t4)
-          cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
-          PMC(CH(i,k,0),c3,t2,t3)
-          PMC (c2,c4,t1,t4)
-          MULPMC (CH(i,k,1),wa0,c2)
-          MULPMC (CH(i,k,2),wa1,c3)
-          MULPMC (CH(i,k,3),wa2,c4)
-          }
-      else
-        for (size_t i=1; i<ido; ++i)
-          {
-          cmplx c2, c3, c4, t1, t2, t3, t4;
-          cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
-          PMC(t2,t1,cc0,cc2)
-          PMC(t3,t4,cc1,cc3)
-          CONJFLIPC(t4)
-          cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
-          PMC(CH(i,k,0),c3,t2,t3)
-          MPC (c2,c4,t1,t4)
-          MULMPC (CH(i,k,1),wa0,c2)
-          MULMPC (CH(i,k,2),wa1,c3)
-          MULMPC (CH(i,k,3),wa2,c4)
-          }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROT90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROTM90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
+        }
       }
   }
 
@@ -454,7 +530,8 @@ NOINLINE static void pass4 (size_t ido, size_t l1, const cmplx * restrict cc,
         cb.r=-(twai*t4.i twbi*t3.i); \
         PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
         }
-#define PARTSTEP5(u1,u2,twar,twbr,twai,twbi) \
+
+#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
         { \
         cmplx ca,cb,da,db; \
         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
@@ -462,18 +539,17 @@ NOINLINE static void pass4 (size_t ido, size_t l1, const cmplx * restrict cc,
         cb.i=twai*t4.r twbi*t3.r; \
         cb.r=-(twai*t4.i twbi*t3.i); \
         PMC(da,db,ca,cb) \
-        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
-        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
         }
-
-NOINLINE static void pass5 (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
   {
   const size_t cdim=5;
   const double tw1r= 0.3090169943749474241,
-               tw1i= sign * 0.95105651629515357212,
+               tw1i= 0.95105651629515357212,
                tw2r= -0.8090169943749474241,
-               tw2i= sign * 0.58778525229247312917;
+               tw2i= 0.58778525229247312917;
 
   if (ido==1)
     for (size_t k=0; k<l1; ++k)
@@ -493,8 +569,51 @@ NOINLINE static void pass5 (size_t ido, size_t l1, const cmplx * restrict cc,
       for (size_t i=1; i<ido; ++i)
         {
         PREP5(i)
-        PARTSTEP5(1,4,tw1r,tw2r,+tw1i,+tw2i)
-        PARTSTEP5(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= -0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= -0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
         }
       }
   }
@@ -749,7 +868,7 @@ NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
 #undef CX2
 #undef CX
 
-WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
+NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
   const int sign)
   {
   if (plan->length==1) return 0;
@@ -764,10 +883,18 @@ WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
     size_t ip=plan->fct[k1].fct;
     size_t l2=ip*l1;
     size_t ido = len/l2;
-    if     (ip==4)  pass4 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
-    else if(ip==2)  pass2 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
-    else if(ip==3)  pass3 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
-    else if(ip==5)  pass5 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    if     (ip==4)
+      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==2)
+      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==3)
+      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==5)
+      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
     else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
     else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
     else
@@ -802,29 +929,28 @@ WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
   }
 
 #undef PMSIGNC
-#undef MULPMC
-#undef MULMPC
+#undef A_EQ_B_MUL_C
+#undef A_EQ_CB_MUL_C
 #undef MULPMSIGNC
 #undef MULPMSIGNCEQ
 
 #undef WA
 #undef CC
 #undef CH
-#undef CONJFLIPC
+#undef ROT90
 #undef SCALEC
 #undef ADDC
-#undef MPC
 #undef PMC
 
-WARN_UNUSED_RESULT
+NOINLINE WARN_UNUSED_RESULT
 static int cfftp_forward(cfftp_plan plan, double c[], double fct)
   { return pass_all(plan,(cmplx *)c, fct, -1); }
 
-WARN_UNUSED_RESULT
+NOINLINE WARN_UNUSED_RESULT
 static int cfftp_backward(cfftp_plan plan, double c[], double fct)
   { return pass_all(plan,(cmplx *)c, fct, 1); }
 
-WARN_UNUSED_RESULT
+NOINLINE WARN_UNUSED_RESULT
 static int cfftp_factorize (cfftp_plan plan)
   {
   size_t length=plan->length;
@@ -856,7 +982,7 @@ static int cfftp_factorize (cfftp_plan plan)
   return 0;
   }
 
-static size_t cfftp_twsize (cfftp_plan plan)
+NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
   {
   size_t twsize=0, l1=1;
   for (size_t k=0; k<plan->nfct; ++k)
@@ -870,7 +996,7 @@ static size_t cfftp_twsize (cfftp_plan plan)
   return twsize;
   }
 
-WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
+NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
   {
   size_t length=plan->length;
   double *twid = RALLOC(double, 2*length);
@@ -1685,12 +1811,12 @@ static size_t rfftp_twsize(rfftp_plan plan)
   return 0;
   }
 
-WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan)
+WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
   {
   size_t length=plan->length;
   double *twid = RALLOC(double, 2*length);
   if (!twid) return -1;
-  sincos_2pibyn(length, twid);
+  sincos_2pibyn_half(length, twid);
   size_t l1=1;
   double *ptr=plan->mem;
   for (size_t k=0; k<plan->nfct; ++k)
@@ -1709,10 +1835,14 @@ WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan)
     if (ip>5) // special factors required by *g functions
       {
       plan->fct[k].tws=ptr; ptr+=2*ip;
-      for (size_t i=0; i<ip; ++i)
+      plan->fct[k].tws[0] = 1.;
+      plan->fct[k].tws[1] = 0.;
+      for (size_t i=1; i<=(ip>>1); ++i)
         {
         plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
         plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
+        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
         }
       }
     l1*=ip;
@@ -1721,7 +1851,7 @@ WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan)
   return 0;
   }
 
-static rfftp_plan make_rfftp_plan (size_t length)
+NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
   {
   if (length==0) return NULL;
   rfftp_plan plan = RALLOC(rfftp_plan_i,1);
@@ -1741,7 +1871,7 @@ static rfftp_plan make_rfftp_plan (size_t length)
   return plan;
   }
 
-static void destroy_rfftp_plan (rfftp_plan plan)
+NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
   {
   DEALLOC(plan->mem);
   DEALLOC(plan);
@@ -1756,7 +1886,7 @@ typedef struct fftblue_plan_i
   } fftblue_plan_i;
 typedef struct fftblue_plan_i * fftblue_plan;
 
-static fftblue_plan make_fftblue_plan (size_t length)
+NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
   {
   fftblue_plan plan = RALLOC(fftblue_plan_i,1);
   if (!plan) return NULL;
@@ -1804,14 +1934,14 @@ static fftblue_plan make_fftblue_plan (size_t length)
   return plan;
   }
 
-static void destroy_fftblue_plan (fftblue_plan plan)
+NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
   {
   DEALLOC(plan->mem);
   destroy_cfftp_plan(plan->plan);
   DEALLOC(plan);
   }
 
-WARN_UNUSED_RESULT
+NOINLINE WARN_UNUSED_RESULT
 static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
   {
   size_t n=plan->n;

From ea8d4b4ecdea14c0a059171856a0490974f02e41 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 6 Dec 2018 19:26:41 +0100
Subject: [PATCH 09/85] only use dynamic AVX for gcc >= 6.0

---
 libsharp/sharp_core.c     | 6 +++---
 libsharp/sharp_core_avx.c | 6 +-----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index fbe83c8..1d6618d 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -33,7 +33,7 @@
 #include "sharp_core_inc0.c"
 #undef ARCH
 
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
 
 static int have_avx(void)
   {
@@ -55,7 +55,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
   if (have_avx())
     inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   else
@@ -65,7 +65,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
 
 int sharp_veclen(void)
   {
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
   if (have_avx())
     return 4;
   else
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
index a250b49..79f1e79 100644
--- a/libsharp/sharp_core_avx.c
+++ b/libsharp/sharp_core_avx.c
@@ -1,14 +1,10 @@
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=5)
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
 // if we arrive here, we can benefit from an additional AVX version
 // #warning entering gcc and x86_64 specific code branch
 
 #define ARCH _avx
-//#define __AVX__
-#pragma GCC push_options
 #pragma GCC target("avx")
 #include "sharp_core_inc0.c"
-#pragma GCC pop_options
-//#undef __AVX__
 #undef ARCH
 
 #endif

From 65f47d10ccd35c7d1f728e063b9967c7046170f1 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 10 Dec 2018 14:37:34 +0100
Subject: [PATCH 10/85] clearer macro names

---
 libsharp/sharp_core_inc2.c  | 28 ++++++++++++++++++----------
 libsharp/sharp_vecsupport.h |  8 +++++++-
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
index 9a2e26b..017df07 100644
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@@ -41,13 +41,16 @@ if (njobs>1)
     Tb lam_3, lam_4;
     Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+//      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_3.v[i] = vabmc(vmul(cth.v[i],lam_2.v[i]),r0,vmul(lam_1.v[i],r1));
     r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
+//      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_4.v[i] = vabmc(vmul(cth.v[i],lam_3.v[i]),r0,vmul(lam_2.v[i],r1));
     r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
+//      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
+      lam_1.v[i] = vabmc(vmul(cth.v[i],lam_4.v[i]),r0,vmul(lam_3.v[i],r1));
     for (int j=0; j<njobs; ++j)
       {
       Tv ar2=vload(creal(alm[njobs*l+j])),
@@ -71,7 +74,8 @@ if (njobs>1)
       }
     r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
+//      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
+      lam_2.v[i] = vabmc(vmul(cth.v[i],lam_1.v[i]),r0,vmul(lam_4.v[i],r1));
     l+=4;
     }
   }
@@ -127,13 +131,15 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
   while (l<lmax)
     {
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l].f[1])*lam_1.v[i];
+      lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
+                   vmul(vload(rf[l].f[1]),lam_1.v[i]));
     for (int j=0; j<njobs; ++j)
       for (int i=0; i<nvec; ++i)
         {
-        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
-        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
+        vfmaeq(atmp[2*(l*njobs+j)],lam_2.v[i],p1[j].r.v[i]);
+        vfmaeq(atmp[2*(l*njobs+j)+1],lam_2.v[i],p1[j].i.v[i]);
+//        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
+//        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
         }
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
@@ -141,8 +147,10 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
     for (int j=0; j<njobs; ++j)
       for (int i=0; i<nvec; ++i)
         {
-        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
-        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
+        vfmaeq(atmp[2*((l+1)*njobs+j)],lam_1.v[i],p2[j].r.v[i]);
+        vfmaeq(atmp[2*((l+1)*njobs+j)+1],lam_1.v[i],p2[j].i.v[i]);
+//        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
+//        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
         }
     l+=2;
     }
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 5250948..ff3f573 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -58,6 +58,7 @@ typedef int Tm;
 #define vfmaeq(a,b,c) ((a)+=(b)*(c))
 #define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
 #define vfmseq(a,b,c) ((a)-=(b)*(c))
+#define vabmc(a,b,c) ((a)*(b)-(c))
 #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
 #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
@@ -125,6 +126,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
+#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
 #define vfmaaeq(a,b,c,d,e) \
   a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vfmaseq(a,b,c,d,e) \
@@ -182,6 +184,7 @@ typedef __m256d Tm;
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
+#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
@@ -189,12 +192,14 @@ typedef __m256d Tm;
 #define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
+#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
+#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
 #define vfmaaeq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #define vfmaseq(a,b,c,d,e) \
@@ -241,7 +246,8 @@ typedef __mmask8 Tm;
 #define vmuleq(a,b) a=_mm512_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
 #define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
-#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
+//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
+//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
 #define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))

From c56747d36e5b347cda0bfd029b3d8b04bfa96650 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 10 Dec 2018 15:05:41 +0100
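Subject: [PATCH 11/85] do not support multiple simultaneous transforms any
 more

Drop the "ntrans" argument from sharp_execute(), sharp_execute_mpi(),
sharp_execute_mpi_fortran(), sharp_execute_mpi_maybe() and
sharp_nv_oracle(), and remove the "ntrans" member from sharp_job.
Buffer sizes and loop bounds that used ntrans*nmaps or ntrans*nalm now
use nmaps or nalm, and the inner-loop dispatch selects a kernel by nv
alone.

This changes the public API: existing callers must remove the ntrans
argument. The test suite is adapted accordingly.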

---
 .gitignore                 |  14 +--
 libsharp/sharp.c           |  97 +++++++++----------
 libsharp/sharp_core_inc0.c | 177 ++++-------------------------------
 libsharp/sharp_internal.h  |   3 +-
 libsharp/sharp_lowlevel.h  |   5 +-
 libsharp/sharp_mpi.c       |  16 ++--
 libsharp/sharp_mpi.h       |   3 +-
 libsharp/sharp_testsuite.c | 185 +++++++++++++++++--------------------
 8 files changed, 167 insertions(+), 333 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4cde3de..12a6531 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,11 @@
-*.o
-*.so
-#*
-*~
-*.pyc
-*.pyo
+**.o
+**.lo
+**.la
+**.so
+**/#*
+**~
+**.pyc
+**.pyo
 
 /auto
 /autom4te.cache
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 884a644..d882689 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -487,10 +487,10 @@ NOINLINE static void init_output (sharp_job *job)
   {
   if (job->flags&SHARP_ADD) return;
   if (job->type == SHARP_MAP2ALM)
-    for (int i=0; i<job->ntrans*job->nalm; ++i)
+    for (int i=0; i<job->nalm; ++i)
       clear_alm (job->ainfo,job->alm[i],job->flags);
   else
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       clear_map (job->ginfo,job->map[i],job->flags);
   }
 
@@ -498,24 +498,24 @@ NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
   {
   if (job->type==SHARP_MAP2ALM)
     {
-    job->s_m=2*job->ntrans*job->nmaps;
+    job->s_m=2*job->nmaps;
     if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
     job->s_th=job->s_m*nm;
     }
   else
     {
-    job->s_th=2*job->ntrans*job->nmaps;
+    job->s_th=2*job->nmaps;
     if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
     job->s_m=job->s_th*ntheta;
     }
-  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
+  job->phase=RALLOC(dcmplx,2*job->nmaps*nm*ntheta);
   }
 
 static void dealloc_phase (sharp_job *job)
   { DEALLOC(job->phase); }
 
 static void alloc_almtmp (sharp_job *job, int lmax)
-  { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); }
+  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); }
 
 static void dealloc_almtmp (sharp_job *job)
   { DEALLOC(job->almtmp); }
@@ -526,13 +526,13 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
 #define COPY_LOOP(real_t, source_t, expr_of_x)              \
   {                                                         \
   for (int l=m; l<lmin; ++l)                                \
-    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
-      job->almtmp[job->ntrans*job->nalm*l+i] = 0;           \
+    for (int i=0; i<job->nalm; ++i)             \
+      job->almtmp[job->nalm*l+i] = 0;           \
   for (int l=lmin; l<=lmax; ++l)                            \
-    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
+    for (int i=0; i<job->nalm; ++i)             \
       {                                                     \
       source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
-      job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x;   \
+      job->almtmp[job->nalm*l+i] = expr_of_x;   \
       }                                                     \
   }
 
@@ -586,8 +586,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
       }
     }
   else
-    memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
-      job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
+    memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
+      job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
 
 #undef COPY_LOOP
   }
@@ -597,9 +597,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
 
 #define COPY_LOOP(real_t, target_t, expr_of_x)               \
   for (int l=lmin; l<=lmax; ++l)                             \
-    for (int i=0; i<job->ntrans*job->nalm; ++i)              \
+    for (int i=0; i<job->nalm; ++i)              \
       {                                                      \
-        dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i];   \
+        dcmplx x = job->almtmp[job->nalm*l+i];   \
         *(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x; \
       }
 
@@ -660,7 +660,7 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
   if (job->flags & SHARP_DP)
     {
     double **dmap = (double **)job->map;
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       {
       double *restrict p1=&dmap[i][ri->ofs];
       const double *restrict p2=&ringtmp[i*rstride+1];
@@ -680,7 +680,7 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
   else
     {
     float  **fmap = (float  **)job->map;
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       for (int m=0; m<ri->nph; ++m)
         fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
     }
@@ -690,7 +690,7 @@ NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
   double *ringtmp, int rstride)
   {
   if (job->flags & SHARP_DP)
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       {
       double *restrict p1=&ringtmp[i*rstride+1],
              *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
@@ -701,7 +701,7 @@ NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
           p1[m] = p2[m*ri->stride];
       }
   else
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       for (int m=0; m<ri->nph; ++m)
         ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
   }
@@ -711,7 +711,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
   {
   if (ri->nph<0)
     {
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       for (int m=0; m<=mmax; ++m)
         phase[2*i+job->s_m*m]=0.;
     }
@@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
     double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
     if (job->flags&SHARP_REAL_HARMONICS)
       wgt *= sqrt_two;
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
       for (int m=0; m<=mmax; ++m)
         phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
           ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
@@ -738,7 +738,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
   double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
   if (job->flags&SHARP_REAL_HARMONICS)
     wgt *= sqrt_one_half;
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
+  for (int i=0; i<job->nmaps; ++i)
     for (int m=0; m<=mmax; ++m)
       if (job->flags & SHARP_DP)
         dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m];
@@ -769,19 +769,19 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
     ringhelper helper;
     ringhelper_init(&helper);
     int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
+    double *ringtmp=RALLOC(double,job->nmaps*rstride);
 #pragma omp for schedule(dynamic,1)
     for (int ith=llim; ith<ulim; ++ith)
       {
       int dim2 = job->s_th*(ith-llim);
       ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
-      for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int i=0; i<job->nmaps; ++i)
         ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
           &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
       if (job->ginfo->pair[ith].r2.nph>0)
         {
         ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
-        for (int i=0; i<job->ntrans*job->nmaps; ++i)
+        for (int i=0; i<job->nmaps; ++i)
           ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
         }
@@ -814,18 +814,18 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
     ringhelper helper;
     ringhelper_init(&helper);
     int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
+    double *ringtmp=RALLOC(double,job->nmaps*rstride);
 #pragma omp for schedule(dynamic,1)
     for (int ith=llim; ith<ulim; ++ith)
       {
       int dim2 = job->s_th*(ith-llim);
-      for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int i=0; i<job->nmaps; ++i)
         ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
           &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
       ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
       if (job->ginfo->pair[ith].r2.nph>0)
         {
-        for (int i=0; i<job->ntrans*job->nmaps; ++i)
+        for (int i=0; i<job->nmaps; ++i)
           ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
             &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
         ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
@@ -918,10 +918,8 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
 
 static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
   int spin, void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags)
+  const sharp_alm_info *alm_info, int flags)
   {
-  UTIL_ASSERT((ntrans>0)&&(ntrans<=SHARP_MAXTRANS),
-    "bad number of simultaneous transforms");
   if (type==SHARP_ALM2MAP_DERIV1) spin=1;
   if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS;
   if (type==SHARP_Yt) type=SHARP_MAP2ALM;
@@ -937,23 +935,22 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
   job->ainfo = alm_info;
   job->flags = flags;
   if ((job->flags&SHARP_NVMAX)==0)
-    job->flags|=sharp_nv_oracle (type, spin, ntrans);
+    job->flags|=sharp_nv_oracle (type, spin);
   if (alm_info->flags&SHARP_REAL_HARMONICS)
     job->flags|=SHARP_REAL_HARMONICS;
   job->time = 0.;
   job->opcnt = 0;
-  job->ntrans = ntrans;
   job->alm=alm;
   job->map=map;
   }
 
 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
   int flags, double *time, unsigned long long *opcnt)
   {
   sharp_job job;
   sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
-    ntrans, flags);
+    flags);
 
   sharp_execute_job (&job);
   if (time!=NULL) *time = job.time;
@@ -968,7 +965,7 @@ void sharp_set_nchunks_max(int new_nchunks_max)
 int sharp_get_nv_max (void)
 { return 6; }
 
-static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
+static int sharp_oracle (sharp_jobtype type, int spin)
   {
   int lmax=511;
   int mmax=(lmax+1)/2;
@@ -982,7 +979,7 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
   sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
 
   ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
 
   double **map;
   ALLOC2D(map,double,ncomp,npix);
@@ -1005,7 +1002,7 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
     int ntries=0;
     do
       {
-      sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,ntrans,
+      sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,
         nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
 
       if (jtime<time) { time=jtime; nvbest=nv; }
@@ -1023,26 +1020,18 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
   return nvbest;
   }
 
-int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
+int sharp_nv_oracle (sharp_jobtype type, int spin)
   {
   static const int maxtr = 6;
-  static int nv_opt[6][2][5] = {
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}} };
+  static int nv_opt[2][5] = {{0,0,0,0,0},{0,0,0,0,0}};
 
   if (type==SHARP_ALM2MAP_DERIV1) spin=1;
   UTIL_ASSERT(type<5,"bad type");
-  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
   UTIL_ASSERT(spin>=0, "bad spin");
-  ntrans=IMIN(ntrans,maxtr);
 
-  if (nv_opt[ntrans-1][spin!=0][type]==0)
-    nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
-  return nv_opt[ntrans-1][spin!=0][type];
+  if (nv_opt[spin!=0][type]==0)
+    nv_opt[spin!=0][type]=sharp_oracle(type,spin);
+  return nv_opt[spin!=0][type];
   }
 
 #ifdef USE_MPI
@@ -1050,11 +1039,11 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
 
 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt)
   {
   MPI_Comm comm = *(MPI_Comm*)pcomm;
-  sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info, ntrans,
+  sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info,
     flags, time, opcnt);
   return 0;
   }
@@ -1063,12 +1052,12 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
 
 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt)
   {
   /* Suppress unused warning: */
   (void)pcomm; (void)type; (void)spin; (void)alm; (void)map; (void)geom_info;
-  (void)alm_info; (void)ntrans; (void)flags; (void)time; (void)opcnt;
+  (void)alm_info; (void)flags; (void)time; (void)opcnt;
   return SHARP_ERROR_NO_MPI;
   }
 
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 7a34e40..d7c3624 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -78,164 +78,27 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-  int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
-  if (njobs<=MAXJOB_SPECIAL)
+  int nv=job->flags&SHARP_NVMAX;
+  switch (nv)
     {
-    switch (njobs*16+nv)
-      {
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-      case 0x11:
-        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x12:
-        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x13:
-        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x14:
-        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x15:
-        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x16:
-        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-      case 0x21:
-        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x22:
-        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x23:
-        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x24:
-        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x25:
-        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x26:
-        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-      case 0x31:
-        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x32:
-        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x33:
-        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x34:
-        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x35:
-        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x36:
-        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-      case 0x41:
-        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x42:
-        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x43:
-        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x44:
-        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x45:
-        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x46:
-        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-      case 0x51:
-        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x52:
-        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x53:
-        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x54:
-        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x55:
-        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x56:
-        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-      case 0x61:
-        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x62:
-        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x63:
-        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x64:
-        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x65:
-        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x66:
-        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-      }
+    case 0x1:
+      CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
+    case 0x2:
+      CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
+    case 0x3:
+      CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
+    case 0x4:
+      CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
+    case 0x5:
+      CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
+    case 0x6:
+      CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      return;
     }
-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
-  else
-    {
-    switch (nv)
-      {
-      case 1:
-        CONCAT2(inner_loop,1)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 2:
-        CONCAT2(inner_loop,2)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 3:
-        CONCAT2(inner_loop,3)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 4:
-        CONCAT2(inner_loop,4)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 5:
-        CONCAT2(inner_loop,5)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 6:
-        CONCAT2(inner_loop,6)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      }
-    }
-#endif
   UTIL_FAIL("Incorrect vector parameters");
   }
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index fb56877..11f23cb 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -55,12 +55,11 @@ typedef struct
   const sharp_geom_info *ginfo;
   const sharp_alm_info *ainfo;
   double time;
-  int ntrans;
   unsigned long long opcnt;
   } sharp_job;
 
 int sharp_get_nv_max (void);
-int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
+int sharp_nv_oracle (sharp_jobtype type, int spin);
 int sharp_get_mlim (int lmax, int spin, double sth, double cth);
 
 #endif
diff --git a/libsharp/sharp_lowlevel.h b/libsharp/sharp_lowlevel.h
index d9aa01b..f36f5a8 100644
--- a/libsharp/sharp_lowlevel.h
+++ b/libsharp/sharp_lowlevel.h
@@ -223,7 +223,6 @@ typedef enum { SHARP_DP              = 1<<4,
   \param alm_info A \c sharp_alm_info object compatible with the provided
     \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
     exactly once.
-  \param ntrans the number of simultaneous SHTs
   \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
     \a alm is expected to have the type "complex double **" and \a map is
     expected to have the type "double **"; otherwise, the expected
@@ -233,7 +232,7 @@ typedef enum { SHARP_DP              = 1<<4,
   \param opcnt If not NULL, a conservative estimate of the total floating point
     operation count for this SHT will be written here. */
 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
   int flags, double *time, unsigned long long *opcnt);
 
 void sharp_set_chunksize_min(int new_chunksize_min);
@@ -258,7 +257,7 @@ typedef enum { SHARP_ERROR_NO_MPI = 1,
  */
 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt);
 
 
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
index a364ed4..b23409a 100644
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@@ -101,7 +101,7 @@ static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
   DEALLOC(theta_tmp);
   DEALLOC(ispair_tmp);
 
-  minfo->nph=2*job->nmaps*job->ntrans;
+  minfo->nph=2*job->nmaps;
 
   minfo->almcount=RALLOC(int,minfo->ntasks);
   minfo->almdisp=RALLOC(int,minfo->ntasks+1);
@@ -184,8 +184,8 @@ static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
   {
   ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
     (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
-  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
-  job->s_m=2*job->ntrans*job->nmaps;
+  job->phase=RALLOC(dcmplx,2*job->nmaps*phase_size);
+  job->s_m=2*job->nmaps;
   job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
   }
 
@@ -315,12 +315,12 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
 
 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt)
   {
   sharp_job job;
   sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
-    ntrans, flags);
+    flags);
 
   sharp_execute_job_mpi (&job, comm);
   if (time!=NULL) *time = job.time;
@@ -331,15 +331,15 @@ void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
    without declaring it in C header as it should not be available to C code */
 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt);
 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt)
   {
   sharp_execute_mpi(MPI_Comm_f2c(comm), type, spin, alm, map, geom_info,
-                    alm_info, ntrans, flags, time, opcnt);
+                    alm_info, flags, time, opcnt);
   }
 
 #endif
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index 1053a65..df07117 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -62,7 +62,6 @@ extern "C" {
     \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
     exactly once in the union of all \a alm_info objects over the participating
     MPI tasks.
-  \param ntrans the number of simultaneous SHTs
   \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
     \a alm is expected to have the type "complex double **" and \a map is
     expected to have the type "double **"; otherwise, the expected
@@ -73,7 +72,7 @@ extern "C" {
     operation count for this SHT will be written here. */
 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
   void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt);
 
 #ifdef __cplusplus
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index f02f9fd..c08fb20 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -358,97 +358,83 @@ static void check_sign_scale(void)
   sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
   ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
 
-  for (int ntrans=1; ntrans<10; ++ntrans)
-    {
-    double **map;
-    ALLOC2D(map,double,2*ntrans,npix);
+  double **map;
+  ALLOC2D(map,double,2,npix);
 
-    dcmplx **alm;
-    ALLOC2D(alm,dcmplx,2*ntrans,nalms);
-    for (int i=0; i<2*ntrans; ++i)
-      for (int j=0; j<nalms; ++j)
-        alm[i][j]=1.+_Complex_I;
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,2,nalms);
+  for (int i=0; i<2; ++i)
+    for (int j=0; j<nalms; ++j)
+      alm[i][j]=1.+_Complex_I;
 
-    sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[it][0     ], 3.588246976618616912e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
-        "error");
-      }
-    sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ], 2.750897760535633285e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2], 3.137704477368562905e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-8.405730859837063917e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-2.398026536095463346e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 3.588246976618616912e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
+    "error");
 
-    sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-1.398186224727334448e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.456676000884031197e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.516249174408820863e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-3.173406200299964119e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-2.398026536095463346e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),
+    "error");
 
-    sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,ntrans,
-      SHARP_DP,NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-6.859393905369091105e-01,1e-11),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.103947835973212364e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.092463246472086439e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-1.411433220713928165e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ],-1.398186224727334448e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-3.173406200299964119e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
+    "error");
 
-    DEALLOC2D(map);
-    DEALLOC2D(alm);
-    }
+  sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
+    SHARP_DP,NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ],-6.859393905369091105e-01,1e-11),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.103947835973212364e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.092463246472086439e+03,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-1.411433220713928165e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-1.146122859381925082e+03,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
+    "error");
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
 
   sharp_destroy_alm_info(alms);
   sharp_destroy_geom_info(tinfo);
   }
 
 static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int ntrans, int nv, double **err_abs, double **err_rel,
+  int spin, int nv, double **err_abs, double **err_rel,
   double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
   unsigned long long *op_m2a)
   {
   ptrdiff_t nalms = get_nalms(ainfo);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
 
   size_t npix = get_npix(ginfo);
   double **map;
@@ -463,9 +449,9 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
 
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
-    ainfo,ntrans, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
+    ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
 #else
-  sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
+  sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
     SHARP_DP|nv,t_a2m,op_a2m);
 #endif
   if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
@@ -473,9 +459,9 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
   double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
-    ainfo,ntrans,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
 #else
-  sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
+  sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
     SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
 #endif
   if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
@@ -488,11 +474,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
   }
 
 static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int ntrans, int nv)
+  int spin, int nv)
   {
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
   double *err_abs, *err_rel;
-  do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel, NULL, NULL,
+  do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
     NULL, NULL);
   for (int i=0; i<ncomp; ++i)
     UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@@ -515,14 +501,13 @@ static void sharp_acctest(void)
   int lmax=127, mmax=127, nlat=128, nlon=256;
   get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
   for (int nv=1; nv<=6; ++nv)
-    for (int ntrans=1; ntrans<=6; ++ntrans)
-      {
-      check_accuracy(ginfo,ainfo,0,ntrans,nv);
-      check_accuracy(ginfo,ainfo,1,ntrans,nv);
-      check_accuracy(ginfo,ainfo,2,ntrans,nv);
-      check_accuracy(ginfo,ainfo,3,ntrans,nv);
-      check_accuracy(ginfo,ainfo,30,ntrans,nv);
-      }
+    {
+    check_accuracy(ginfo,ainfo,0,nv);
+    check_accuracy(ginfo,ainfo,1,nv);
+    check_accuracy(ginfo,ainfo,2,nv);
+    check_accuracy(ginfo,ainfo,3,nv);
+    check_accuracy(ginfo,ainfo,30,nv);
+    }
   sharp_destroy_alm_info(ainfo);
   sharp_destroy_geom_info(ginfo);
   if (mytask==0) printf("Passed.\n\n");
@@ -531,22 +516,21 @@ static void sharp_acctest(void)
 static void sharp_test (int argc, const char **argv)
   {
   if (mytask==0) sharp_announce("sharp_test");
-  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
+  UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
   int lmax=atoi(argv[3]);
   int mmax=atoi(argv[4]);
   int gpar1=atoi(argv[5]);
   int gpar2=atoi(argv[6]);
   int spin=atoi(argv[7]);
-  int ntrans=atoi(argv[8]);
 
   if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
+  if (mytask==0) printf("spin=%d\n", spin);
 
   sharp_geom_info *ginfo;
   sharp_alm_info *ainfo;
   get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
 
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
   double t_a2m=1e30, t_m2a=1e30;
   unsigned long long op_a2m, op_m2a;
   double *err_abs,*err_rel;
@@ -557,7 +541,7 @@ static void sharp_test (int argc, const char **argv)
     {
     ++nrpt;
     double ta2m2, tm2a2;
-    do_sht (ginfo, ainfo, spin, ntrans, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
+    do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
       &op_a2m, &op_m2a);
     if (ta2m2<t_a2m) t_a2m=ta2m2;
     if (tm2a2<t_m2a) t_m2a=tm2a2;
@@ -610,7 +594,7 @@ static void sharp_test (int argc, const char **argv)
     printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
            " %9.2f %6.2f %.2e %.2e\n",
       getenv("HOST"),argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
-      ntrans,t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
+      t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
       100.*(1.-iosize/tmem),maxerel,maxeabs);
 
   DEALLOC(err_abs);
@@ -620,16 +604,15 @@ static void sharp_test (int argc, const char **argv)
 static void sharp_bench (int argc, const char **argv)
   {
   if (mytask==0) sharp_announce("sharp_bench");
-  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
+  UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
   int lmax=atoi(argv[3]);
   int mmax=atoi(argv[4]);
   int gpar1=atoi(argv[5]);
   int gpar2=atoi(argv[6]);
   int spin=atoi(argv[7]);
-  int ntrans=atoi(argv[8]);
 
   if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
+  if (mytask==0) printf("spin=%d\n", spin);
 
   sharp_geom_info *ginfo;
   sharp_alm_info *ainfo;
@@ -647,7 +630,7 @@ static void sharp_bench (int argc, const char **argv)
       double t_a2m, t_m2a;
       unsigned long long op_a2m, op_m2a;
       double *err_abs,*err_rel;
-      do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel,
+      do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
         &t_a2m, &t_m2a, &op_a2m, &op_m2a);
 
       DEALLOC(err_abs);

From 10c5b5f7a98cfc0fcdec8a2097e1aa827bc9ab37 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 10 Dec 2018 16:32:12 +0100
Subject: [PATCH 12/85] cleanup

---
 libsharp/sharp_core_inc0.c      |  12 +-
 libsharp/sharp_core_inc2.c      | 517 +++++++++++++-------------------
 libsharp/sharp_core_inchelper.c |  60 ----
 3 files changed, 220 insertions(+), 369 deletions(-)

diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index d7c3624..06b9285 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -82,22 +82,22 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
   switch (nv)
     {
     case 0x1:
-      CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     case 0x2:
-      CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     case 0x3:
-      CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     case 0x4:
-      CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     case 0x5:
-      CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     case 0x6:
-      CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
       return;
     }
   UTIL_FAIL("Incorrect vector parameters");
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
index 017df07..de2924f 100644
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@@ -32,149 +32,89 @@
 NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
   Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax NJ1)
+  int l, int lmax) /* adds lam*alm for l..lmax alternately to p1 and p2 */
   {
-if (njobs>1)
-  {
-  while (l<lmax-2)
-    {
-    Tb lam_3, lam_4;
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-      lam_3.v[i] = vabmc(vmul(cth.v[i],lam_2.v[i]),r0,vmul(lam_1.v[i],r1));
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
-      lam_4.v[i] = vabmc(vmul(cth.v[i],lam_3.v[i]),r0,vmul(lam_2.v[i],r1));
-    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
-      lam_1.v[i] = vabmc(vmul(cth.v[i],lam_4.v[i]),r0,vmul(lam_3.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar2=vload(creal(alm[njobs*l+j])),
-         ai2=vload(cimag(alm[njobs*l+j])),
-         ar4=vload(creal(alm[njobs*(l+2)+j])),
-         ai4=vload(cimag(alm[njobs*(l+2)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
-        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
-        }
-      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
-         ai3=vload(cimag(alm[njobs*(l+1)+j])),
-         ar1=vload(creal(alm[njobs*(l+3)+j])),
-         ai1=vload(cimag(alm[njobs*(l+3)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
-        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
-        }
-      }
-    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
-      lam_2.v[i] = vabmc(vmul(cth.v[i],lam_1.v[i]),r0,vmul(lam_4.v[i],r1));
-    l+=4;
-    }
-  }
   while (l<lmax)
     {
     for (int i=0; i<nvec; ++i)
       lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                  - vload(rf[l].f[1])*lam_1.v[i];
-    for (int j=0; j<njobs; ++j)
+    { /* p1 += lam_2*alm[l] (real and imaginary parts) */
+    Tv ar=vload(creal(alm[l])),
+       ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
       {
-      Tv ar=vload(creal(alm[njobs*l+j])),
-         ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p1[j].r.v[i] += lam_2.v[i]*ar;
-        p1[j].i.v[i] += lam_2.v[i]*ai;
-        }
+      p1->r.v[i] += lam_2.v[i]*ar;
+      p1->i.v[i] += lam_2.v[i]*ai;
       }
+    }
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                  - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int j=0; j<njobs; ++j)
+    { /* p2 += lam_1*alm[l+1] (real and imaginary parts) */
+    Tv ar=vload(creal(alm[l+1])),
+       ai=vload(cimag(alm[l+1]));
+    for (int i=0; i<nvec; ++i)
       {
-      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
-         ai=vload(cimag(alm[njobs*(l+1)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p2[j].r.v[i] += lam_1.v[i]*ar;
-        p2[j].i.v[i] += lam_1.v[i]*ai;
-        }
+      p2->r.v[i] += lam_1.v[i]*ar;
+      p2->i.v[i] += lam_1.v[i]*ai;
+      }
       }
     l+=2;
     }
   if (l==lmax)
     {
-    for (int j=0; j<njobs; ++j)
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); /* last term, to p1 */
+    for (int i=0; i<nvec; ++i)
       {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p1[j].r.v[i] += lam_2.v[i]*ar;
-        p1[j].i.v[i] += lam_2.v[i]*ai;
-        }
+      p1->r.v[i] += lam_2.v[i]*ar;
+      p1->i.v[i] += lam_2.v[i]*ai;
       }
     }
   }
 
 NOINLINE static void Z(map2alm_kernel) (const Tb cth,
   const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
-  NJ1)
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
   {
   while (l<lmax)
     {
     for (int i=0; i<nvec; ++i)
       lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
                    vmul(vload(rf[l].f[1]),lam_1.v[i]));
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(atmp[2*(l*njobs+j)],lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(atmp[2*(l*njobs+j)+1],lam_2.v[i],p1[j].i.v[i]);
-//        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
-//        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i) /* atmp[2l], atmp[2l+1] += lam_2*p1 (re, im) */
+      {
+      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
+      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
+      }
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                  - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(atmp[2*((l+1)*njobs+j)],lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(atmp[2*((l+1)*njobs+j)+1],lam_1.v[i],p2[j].i.v[i]);
-//        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
-//        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i) /* same for l+1, using lam_1 and p2 */
+      {
+      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
+      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
+      }
     l+=2;
     }
   if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
-        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
-        }
-    }
+    for (int i=0; i<nvec; ++i) /* last term: lam_2 and p1 only */
+      {
+      atmp[2*l  ] += lam_2.v[i]*p1->r.v[i];
+      atmp[2*l+1] += lam_2.v[i]*p1->i.v[i];
+      }
   }
 
 NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2 NJ1)
+  Y(Tbri) * restrict p2)
   {
   int l,lmax=gen->lmax;
   Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
   Tb corfac;
   Y(getCorfac)(scale,&corfac,gen->cf);
@@ -183,30 +123,28 @@ NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   int full_ieee = Y(TballGe)(scale,sharp_minscale);
   while (!full_ieee)
     {
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
       {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(p1[j].r.v[i],tmp,ar);
-        vfmaeq(p1[j].i.v[i],tmp,ai);
-        }
+      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+      vfmaeq(p1->r.v[i],tmp,ar);
+      vfmaeq(p1->i.v[i],tmp,ai);
       }
+    }
     if (++l>lmax) break;
     Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
       lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
       {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(p2[j].r.v[i],tmp,ar);
-        vfmaeq(p2[j].i.v[i],tmp,ai);
-        }
+      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+      vfmaeq(p2->r.v[i],tmp,ar);
+      vfmaeq(p2->i.v[i],tmp,ai);
       }
+    }
     if (++l>lmax) break;
     r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
@@ -220,12 +158,12 @@ NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   if (l>lmax) return;
 
   Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
+  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
   }
 
 NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
+  const Y(Tbri) * restrict p2, Tv *restrict atmp)
   {
   int lmax=gen->lmax;
   Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
@@ -233,7 +171,7 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   Tb corfac;
@@ -241,24 +179,22 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   int full_ieee = Y(TballGe)(scale,sharp_minscale);
   while (!full_ieee)
     {
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=lam_2.v[i]*corfac.v[i];
-        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
-        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_2.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p1->r.v[i];
+      atmp[2*l+1]+=tmp*p1->i.v[i];
+      }
     if (++l>lmax) return;
     for (int i=0; i<nvec; ++i)
       lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                  - vload(rf[l-1].f[1])*lam_1.v[i];
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=lam_1.v[i]*corfac.v[i];
-        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
-        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_1.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p2->r.v[i];
+      atmp[2*l+1]+=tmp*p2->i.v[i];
+      }
     if (++l>lmax) return;
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
@@ -271,97 +207,88 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
     }
 
   Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
   }
 
 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm) /* reads alm[0..1] */
   {
-  for (int j=0; j<njobs; ++j)
+  Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
+     acr=vload(creal(alm[1])), aci=vload(cimag(alm[1]));
+  for (int i=0; i<nvec; ++i) /* px gets the (rxp+rxm) terms */
     {
-    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
-       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],agr,lw);
-      vfmaeq(px[j].qi.v[i],agi,lw);
-      vfmaeq(px[j].ur.v[i],acr,lw);
-      vfmaeq(px[j].ui.v[i],aci,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmseq(py[j].qr.v[i],aci,lx);
-      vfmaeq(py[j].qi.v[i],acr,lx);
-      vfmaeq(py[j].ur.v[i],agi,lx);
-      vfmseq(py[j].ui.v[i],agr,lx);
-      }
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],agr,lw);
+    vfmaeq(px->qi.v[i],agi,lw);
+    vfmaeq(px->ur.v[i],acr,lw);
+    vfmaeq(px->ui.v[i],aci,lw);
+    }
+  for (int i=0; i<nvec; ++i) /* py gets the (rxm-rxp) terms */
+    {
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmseq(py->qr.v[i],aci,lx);
+    vfmaeq(py->qi.v[i],acr,lx);
+    vfmaeq(py->ur.v[i],agi,lx);
+    vfmseq(py->ui.v[i],agr,lx);
     }
   }
 
 static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
   const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
   {
-  for (int j=0; j<njobs; ++j)
+  Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])),
+     acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1]));
+  Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])),
+     acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1]));
+  for (int i=0; i<nvec; ++i) /* p1: alm1 with (r2p+r2m), alm2 with (r1m-r1p) */
     {
-    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
-       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
-    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
-       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw1=r2p.v[i]+r2m.v[i];
-      Tv lx2=r1m.v[i]-r1p.v[i];
-      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
-      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
-      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
-      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx1=r2m.v[i]-r2p.v[i];
-      Tv lw2=r1p.v[i]+r1m.v[i];
-      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
-      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
-      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
-      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
-      }
+    Tv lw1=r2p.v[i]+r2m.v[i];
+    Tv lx2=r1m.v[i]-r1p.v[i];
+    vfmaseq(p1->qr.v[i],agr1,lw1,aci2,lx2);
+    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
+    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
+    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
+    }
+  for (int i=0; i<nvec; ++i) /* p2: alm2 with (r1p+r1m), alm1 with (r2m-r2p) */
+    {
+    Tv lx1=r2m.v[i]-r2p.v[i];
+    Tv lw2=r1p.v[i]+r1m.v[i];
+    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
+    vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1);
+    vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1);
+    vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1);
     }
   }
 
 static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
   const Y(Tbqu) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
+  const Tb * restrict rxm, dcmplx * restrict alm)
   {
-  for (int j=0; j<njobs; ++j)
+  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero; /* accumulators over nvec */
+  for (int i=0; i<nvec; ++i) /* px contributions, weighted by rxp+rxm */
     {
-    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp->v[i],rxm->v[i]);
-      vfmaeq(agr,px[j].qr.v[i],lw);
-      vfmaeq(agi,px[j].qi.v[i],lw);
-      vfmaeq(acr,px[j].ur.v[i],lw);
-      vfmaeq(aci,px[j].ui.v[i],lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm->v[i],rxp->v[i]);
-      vfmseq(agr,py[j].ui.v[i],lx);
-      vfmaeq(agi,py[j].ur.v[i],lx);
-      vfmaeq(acr,py[j].qi.v[i],lx);
-      vfmseq(aci,py[j].qr.v[i],lx);
-      }
-    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
+    Tv lw=vadd(rxp->v[i],rxm->v[i]);
+    vfmaeq(agr,px->qr.v[i],lw);
+    vfmaeq(agi,px->qi.v[i],lw);
+    vfmaeq(acr,px->ur.v[i],lw);
+    vfmaeq(aci,px->ui.v[i],lw);
     }
+  for (int i=0; i<nvec; ++i) /* py contributions, weighted by rxm-rxp */
+    {
+    Tv lx=vsub(rxm->v[i],rxp->v[i]);
+    vfmseq(agr,py->ui.v[i],lx);
+    vfmaeq(agi,py->ur.v[i],lx);
+    vfmaeq(acr,py->qi.v[i],lx);
+    vfmseq(aci,py->qr.v[i],lx);
+    }
+  vhsum_cmplx_special(agr,agi,acr,aci,alm); /* NOTE(review): presumably reduces into alm[0..1] - confirm */
   }
 
 NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
+  int lmax)
   {
   while (l<lmax)
     {
@@ -372,8 +299,8 @@ NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
       rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
       rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
       }
-    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)] NJ2);
+    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
+      &alm[2*(l+1)]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
@@ -384,13 +311,12 @@ NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     l+=2;
     }
   if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
   }
 
 NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
   const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
-  NJ1)
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
   {
   while (l<lmax)
     {
@@ -403,8 +329,8 @@ NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
       rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                         vmul(fx2,rec1m.v[i]));
       }
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
-    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
@@ -417,12 +343,12 @@ NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
     l+=2;
     }
   if (l==lmax)
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
   }
 
 NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
+  Y(Tbqu) * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@@ -430,7 +356,7 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
@@ -442,11 +368,11 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
   while (!full_ieee)
     {
     Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[2*njobs*l] NJ2);
+      &alm[2*l]);
     if (++l>lmax) break;
     Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
     Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[2*njobs*l] NJ2);
+      &alm[2*l]);
     if (++l>lmax) break;
     Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
     if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@@ -463,12 +389,12 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
   Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
   Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
   Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
+    lmax);
   }
 
 NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
   const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
+  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@@ -476,7 +402,7 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
@@ -488,11 +414,11 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
   while (!full_ieee)
     {
     Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
     if (++l>lmax) return;
     Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
     t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
     if (++l>lmax) return;
     Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
     if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@@ -506,34 +432,31 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
 
   Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
   Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
+  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
   }
 
 static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm) /* reads alm[0] only */
   {
-  for (int j=0; j<njobs; ++j)
+  Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
+  for (int i=0; i<nvec; ++i) /* px->qr, px->qi get the (rxp+rxm) terms */
     {
-    Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],ar,lw);
-      vfmaeq(px[j].qi.v[i],ai,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmaeq(py[j].ur.v[i],ai,lx);
-      vfmseq(py[j].ui.v[i],ar,lx);
-      }
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],ar,lw);
+    vfmaeq(px->qi.v[i],ai,lw);
+    }
+  for (int i=0; i<nvec; ++i) /* py->ur, py->ui get the (rxm-rxp) terms */
+    {
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmaeq(py->ur.v[i],ai,lx);
+    vfmseq(py->ui.v[i],ar,lx);
     }
   }
 
 NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
+  int lmax)
   {
   while (l<lmax)
     {
@@ -546,8 +469,8 @@ NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
       rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                         vmul(fx2,rec1m.v[i]));
       }
-    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
-    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
+    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
+    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
@@ -560,12 +483,12 @@ NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     l+=2;
     }
   if (l==lmax)
-    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
+    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
   }
 
 NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
+  Y(Tbqu) * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@@ -573,7 +496,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 20*VLEN*nvec;
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
@@ -585,11 +508,11 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
   while (!full_ieee)
     {
     Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[njobs*l] NJ2);
+      &alm[l]);
     if (++l>lmax) break;
     Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
     Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[njobs*l] NJ2);
+      &alm[l]);
     if (++l>lmax) break;
     Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
     if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@@ -606,7 +529,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
   Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
   Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
   Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
+    lmax);
   }
 
 
@@ -614,7 +537,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
 
 NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
   const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
@@ -629,7 +552,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
           Y(Tbu) cth, sth;
 
           int skip=1;
@@ -641,22 +564,19 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
             cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
             }
           if (!skip)
-            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
 
           for (int i=0; i<nval; ++i)
             {
             int itot=i+ith;
             if (itot<ulim-llim)
               {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
-                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
-                job->phase[phas_idx] = r1+r2;
-                if (ispair[itot])
-                  job->phase[phas_idx+1] = r1-r2;
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
+                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              job->phase[phas_idx] = r1+r2;
+              if (ispair[itot])
+                job->phase[phas_idx+1] = r1-r2;
               }
             }
           }
@@ -665,7 +585,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
           Y(Tbu) cth, sth;
           int skip=1;
 
@@ -679,33 +599,30 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
           if (!skip)
             (job->type==SHARP_ALM2MAP) ?
               Z(calc_alm2map_spin  )
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
               Z(calc_alm2map_deriv1)
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b);
 
           for (int i=0; i<nval; ++i)
             {
             int itot=i+ith;
             if (itot<ulim-llim)
               {
-              for (int j=0; j<njobs; ++j)
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I,
+                             q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I,
+                             u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I,
+                             u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I;
+              job->phase[phas_idx] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[itot])
                 {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
-                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
-                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
-                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
-                job->phase[phas_idx] = q1+q2;
-                job->phase[phas_idx+2] = u1+u2;
-                if (ispair[itot])
-                  {
-                  dcmplx *phQ = &(job->phase[phas_idx+1]),
-                         *phU = &(job->phase[phas_idx+3]);
-                  *phQ = q1-q2;
-                  *phU = u1-u2;
-                  if ((gen->mhi-gen->m+gen->s)&1)
-                    { *phQ=-(*phQ); *phU=-(*phU); }
-                  }
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1)
+                  { *phQ=-(*phQ); *phU=-(*phU); }
                 }
               }
             }
@@ -723,7 +640,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
 
 NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
   const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
@@ -735,11 +652,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-        Tv atmp[2*njobs*(gen->lmax+1)];
-        memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
+        Tv atmp[2*(gen->lmax+1)];
+        memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
           Y(Tbu) cth, sth;
           int skip=1;
 
@@ -751,21 +668,18 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
             cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
             if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
               {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                dcmplx ph1=job->phase[phas_idx];
-                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
-                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx ph1=job->phase[phas_idx];
+              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
+              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
               }
             }
           if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
           }
         {
-        int istart=m*njobs, istop=(gen->lmax+1)*njobs;
+        int istart=m, istop=gen->lmax+1;
         for(; istart<istop-2; istart+=2)
           vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
         for(; istart<istop; istart++)
@@ -776,7 +690,7 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
           Y(Tbu) cth, sth;
           int skip=1;
 
@@ -788,24 +702,21 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
             cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
             if (i+ith<ulim-llim)
               {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                dcmplx p1Q=job->phase[phas_idx],
-                       p1U=job->phase[phas_idx+2],
-                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { p2Q=-p2Q; p2U=-p2U; }
-                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
-                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
-                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
-                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
+                     p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1)
+                { p2Q=-p2Q; p2U=-p2U; }
+              p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q);
+              p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U);
+              p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q);
+              p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U);
               }
             }
           if (!skip)
-            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
           }
         }
       break;
@@ -820,11 +731,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
 
 static void Z(inner_loop) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
   (job->type==SHARP_MAP2ALM) ?
-    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
-    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
+    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
+    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
   }
 
 #undef VZERO
diff --git a/libsharp/sharp_core_inchelper.c b/libsharp/sharp_core_inchelper.c
index 89d79cd..c58cecc 100644
--- a/libsharp/sharp_core_inchelper.c
+++ b/libsharp/sharp_core_inchelper.c
@@ -2,69 +2,9 @@
 #define Y(arg) CONCAT2(arg,nvec)
 #include "sharp_core_inc.c"
 
-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
-#define NJ1 , int njobs
-#define NJ2 , njobs
 #define Z(arg) CONCAT2(arg,nvec)
 #include "sharp_core_inc2.c"
 #undef Z
-#undef NJ1
-#undef NJ2
-#endif
-
-#define NJ1
-#define NJ2
-
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-#define njobs 1
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-#define njobs 2
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-#define njobs 3
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-#define njobs 4
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-#define njobs 5
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-#define njobs 6
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#undef NJ1
-#undef NJ2
 
 #undef Y
 #undef Tb

From aee1a51ac2ebfaa40a4e5d37351688916da559fd Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 11 Dec 2018 09:33:44 +0100
Subject: [PATCH 13/85] more simplifications

---
 libsharp/sharp.c           | 78 ++----------------------------------
 libsharp/sharp_core_inc.c  |  3 +-
 libsharp/sharp_core_inc0.c | 49 +----------------------
 libsharp/sharp_core_inc2.c | 82 +++++++++-----------------------------
 libsharp/sharp_lowlevel.h  |  1 -
 libsharp/sharp_ylmgen_c.c  |  4 +-
 6 files changed, 28 insertions(+), 189 deletions(-)

diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index d882689..943bd79 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -515,7 +515,7 @@ static void dealloc_phase (sharp_job *job)
   { DEALLOC(job->phase); }
 
 static void alloc_almtmp (sharp_job *job, int lmax)
-  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); }
+  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); }
 
 static void dealloc_almtmp (sharp_job *job)
   { DEALLOC(job->almtmp); }
@@ -534,6 +534,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
       source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
       job->almtmp[job->nalm*l+i] = expr_of_x;   \
       }                                                     \
+  for (int i=0; i<job->nalm; ++i)             \
+    job->almtmp[job->nalm*(lmax+1)+i] = 0;           \
   }
 
   if (job->type!=SHARP_MAP2ALM)
@@ -852,8 +854,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
   init_output (job);
 
   int nchunks, chunksize;
-  get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
-    &chunksize);
+  get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize);
 //FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
 
@@ -934,8 +935,6 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
   job->ginfo = geom_info;
   job->ainfo = alm_info;
   job->flags = flags;
-  if ((job->flags&SHARP_NVMAX)==0)
-    job->flags|=sharp_nv_oracle (type, spin);
   if (alm_info->flags&SHARP_REAL_HARMONICS)
     job->flags|=SHARP_REAL_HARMONICS;
   job->time = 0.;
@@ -965,75 +964,6 @@ void sharp_set_nchunks_max(int new_nchunks_max)
 int sharp_get_nv_max (void)
 { return 6; }
 
-static int sharp_oracle (sharp_jobtype type, int spin)
-  {
-  int lmax=511;
-  int mmax=(lmax+1)/2;
-  int nrings=(lmax+1)/4;
-  int ppring=1;
-
-  spin = (spin!=0) ? 2 : 0;
-
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_geom_info *tinfo;
-  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = (spin==0) ? 1 : 2;
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-  SET_ARRAY(map[0],0,npix*ncomp,0.);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
-
-  double time=1e30;
-  int nvbest=-1;
-
-  for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
-    {
-    double time_acc=0.;
-    double jtime;
-    int ntries=0;
-    do
-      {
-      sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,
-        nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
-
-      if (jtime<time) { time=jtime; nvbest=nv; }
-      time_acc+=jtime;
-      ++ntries;
-      }
-    while ((time_acc<0.02)&&(ntries<2));
-    }
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-  return nvbest;
-  }
-
-int sharp_nv_oracle (sharp_jobtype type, int spin)
-  {
-  static const int maxtr = 6;
-  static int nv_opt[2][5] = {{0,0,0,0,0},{0,0,0,0,0}};
-
-  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
-  UTIL_ASSERT(type<5,"bad type");
-  UTIL_ASSERT(spin>=0, "bad spin");
-
-  if (nv_opt[spin!=0][type]==0)
-    nv_opt[spin!=0][type]=sharp_oracle(type,spin);
-  return nv_opt[spin!=0][type];
-  }
-
 #ifdef USE_MPI
 #include "sharp_mpi.c"
 
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 8a36bfe..a15e35a 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -217,11 +217,12 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     for (int i=0; i<nvec; ++i)
+      {
       lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                  - vload(gen->rf[l].f[1])*lam_1.v[i];
-    for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                  - vload(gen->rf[l+1].f[1])*lam_2.v[i];
+      }
     if (Y(rescale)(&lam_1,&lam_2,&scale))
       below_limit = Y(TballLt)(scale,sharp_limscale);
     l+=2;
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 06b9285..15190d7 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -40,35 +40,10 @@
 
 typedef complex double dcmplx;
 
-// must be in the range [0;6]
-#define MAXJOB_SPECIAL 2
-
 #define XCONCATX(a,b) a##b
 #define CONCATX(a,b) XCONCATX(a,b)
 #define XCONCAT2(a,b) a##_##b
 #define CONCAT2(a,b) XCONCAT2(a,b)
-#define XCONCAT3(a,b,c) a##_##b##_##c
-#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
-
-#define nvec 1
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 2
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 3
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 4
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 5
-#include "sharp_core_inchelper.c"
-#undef nvec
 
 #define nvec 6
 #include "sharp_core_inchelper.c"
@@ -78,27 +53,5 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-  int nv=job->flags&SHARP_NVMAX;
-  switch (nv)
-    {
-    case 0x1:
-      CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x2:
-      CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x3:
-      CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x4:
-      CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x5:
-      CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x6:
-      CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    }
-  UTIL_FAIL("Incorrect vector parameters");
+  CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
   }
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
index de2924f..364eef5 100644
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@@ -34,75 +34,44 @@ NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax)
   {
-  while (l<lmax)
+  while (l<=lmax)
     {
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l].f[1])*lam_1.v[i];
-    {
-    Tv ar=vload(creal(alm[l])),
-       ai=vload(cimag(alm[l]));
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
-      p1->r.v[i] += lam_2.v[i]*ar;
-      p1->i.v[i] += lam_2.v[i]*ai;
-      }
-    }
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    {
-    Tv ar=vload(creal(alm[l+1])),
-       ai=vload(cimag(alm[l+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      p2->r.v[i] += lam_1.v[i]*ar;
-      p2->i.v[i] += lam_1.v[i]*ai;
-      }
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      p1->r.v[i] += lam_2.v[i]*ar1;
+      p1->i.v[i] += lam_2.v[i]*ai1;
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      p2->r.v[i] += lam_1.v[i]*ar2;
+      p2->i.v[i] += lam_1.v[i]*ai2;
       }
     l+=2;
     }
-  if (l==lmax)
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      p1->r.v[i] += lam_2.v[i]*ar;
-      p1->i.v[i] += lam_2.v[i]*ai;
-      }
-    }
   }
 
 NOINLINE static void Z(map2alm_kernel) (const Tb cth,
   const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
   {
-  while (l<lmax)
+  while (l<=lmax)
     {
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
-                   vmul(vload(rf[l].f[1]),lam_1.v[i]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
       vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
       vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
-      }
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int i=0; i<nvec; ++i)
-      {
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
       vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
       vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
       }
     l+=2;
     }
-  if (l==lmax)
-    for (int i=0; i<nvec; ++i)
-      {
-      atmp[2*l  ] += lam_2.v[i]*p1->r.v[i];
-      atmp[2*l+1] += lam_2.v[i]*p1->i.v[i];
-      }
   }
 
 NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
@@ -187,10 +156,9 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
       }
     if (++l>lmax) return;
     for (int i=0; i<nvec; ++i)
+      {
       lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                  - vload(rf[l-1].f[1])*lam_1.v[i];
-    for (int i=0; i<nvec; ++i)
-      {
       Tv tmp=lam_1.v[i]*corfac.v[i];
       atmp[2*l  ]+=tmp*p2->r.v[i];
       atmp[2*l+1]+=tmp*p2->i.v[i];
@@ -222,9 +190,6 @@ static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     vfmaeq(px->qi.v[i],agi,lw);
     vfmaeq(px->ur.v[i],acr,lw);
     vfmaeq(px->ui.v[i],aci,lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
     Tv lx=vsub(rxm.v[i],rxp.v[i]);
     vfmseq(py->qr.v[i],aci,lx);
     vfmaeq(py->qi.v[i],acr,lx);
@@ -249,9 +214,6 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
     vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
     vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
     vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
     Tv lx1=r2m.v[i]-r2p.v[i];
     Tv lw2=r1p.v[i]+r1m.v[i];
     vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
@@ -273,9 +235,6 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
     vfmaeq(agi,px->qi.v[i],lw);
     vfmaeq(acr,px->ur.v[i],lw);
     vfmaeq(aci,px->ui.v[i],lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
     Tv lx=vsub(rxm->v[i],rxp->v[i]);
     vfmseq(agr,py->ui.v[i],lx);
     vfmaeq(agi,py->ur.v[i],lx);
@@ -444,9 +403,6 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     Tv lw=vadd(rxp.v[i],rxm.v[i]);
     vfmaeq(px->qr.v[i],ar,lw);
     vfmaeq(px->qi.v[i],ai,lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
     Tv lx=vsub(rxm.v[i],rxp.v[i]);
     vfmaeq(py->ur.v[i],ai,lx);
     vfmseq(py->ui.v[i],ar,lx);
@@ -652,8 +608,8 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-        Tv atmp[2*(gen->lmax+1)];
-        memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv));
+        Tv atmp[2*(gen->lmax+2)];
+        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
           Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
diff --git a/libsharp/sharp_lowlevel.h b/libsharp/sharp_lowlevel.h
index f36f5a8..2e7ab24 100644
--- a/libsharp/sharp_lowlevel.h
+++ b/libsharp/sharp_lowlevel.h
@@ -200,7 +200,6 @@ typedef enum { SHARP_DP              = 1<<4,
 
                SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
                SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
-               SHARP_NVMAX           = (1<<4)-1 /* internal use only */
              } sharp_jobflags;
 
 /*! Performs a libsharp SHT job. The interface deliberately does not use
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 785e063..e967773 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -69,7 +69,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   gen->m = -1;
   if (spin==0)
     {
-    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
+    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+2);
     gen->mfac = RALLOC(double,gen->mmax+1);
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
@@ -159,7 +159,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
     {
     gen->rf[m].f[0] = gen->root[2*m+3];
     gen->rf[m].f[1] = 0.;
-    for (int l=m+1; l<=gen->lmax; ++l)
+    for (int l=m+1; l<=gen->lmax+1; ++l)
       {
       double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
       gen->rf[l].f[0] = tmp*gen->root[2*l+1];

From cdc09826a116756d63e6160a6c04fc2b352c94bb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 11 Dec 2018 10:56:55 +0100
Subject: [PATCH 14/85] cleanup

---
 Makefile.am                     |   4 +-
 libsharp/sharp_core_inc.c       | 668 ++++++++++++++++++++++++++++++
 libsharp/sharp_core_inc0.c      |   9 +-
 libsharp/sharp_core_inc2.c      | 697 --------------------------------
 libsharp/sharp_core_inchelper.c |  10 -
 5 files changed, 676 insertions(+), 712 deletions(-)
 delete mode 100644 libsharp/sharp_core_inc2.c
 delete mode 100644 libsharp/sharp_core_inchelper.c

diff --git a/Makefile.am b/Makefile.am
index 5cd60d4..7a035ba 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -34,9 +34,7 @@ include_HEADERS = \
 
 EXTRA_DIST = \
   libsharp/sharp_core_inc0.c \
-  libsharp/sharp_core_inc.c \
-  libsharp/sharp_core_inc2.c \
-  libsharp/sharp_core_inchelper.c
+  libsharp/sharp_core_inc.c
 
 libsharp_la_SOURCES = $(src_sharp)
 
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index a15e35a..60fbc6f 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -315,3 +315,671 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
   *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
   *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
   }
+
+
+NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  int l, int lmax)
+  {
+  while (l<=lmax)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      p1->r.v[i] += lam_2.v[i]*ar1;
+      p1->i.v[i] += lam_2.v[i]*ai1;
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      p2->r.v[i] += lam_1.v[i]*ar2;
+      p2->i.v[i] += lam_1.v[i]*ai2;
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void Y(map2alm_kernel) (const Tb cth,
+  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
+  {
+  while (l<=lmax)
+    {
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
+      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
+      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2)
+  {
+  int l,lmax=gen->lmax;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGe)(scale,sharp_minscale);
+  while (!full_ieee)
+    {
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+      vfmaeq(p1->r.v[i],tmp,ar);
+      vfmaeq(p1->i.v[i],tmp,ai);
+      }
+    }
+    if (++l>lmax) break;
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+      vfmaeq(p2->r.v[i],tmp,ar);
+      vfmaeq(p2->i.v[i],tmp,ai);
+      }
+    }
+    if (++l>lmax) break;
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale))
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGe)(scale,sharp_minscale);
+      }
+    }
+  if (l>lmax) return;
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
+  Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  }
+
+NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
+  const Y(Tbri) * restrict p2, Tv *restrict atmp)
+  {
+  int lmax=gen->lmax;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
+  int l=gen->m;
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  int full_ieee = Y(TballGe)(scale,sharp_minscale);
+  while (!full_ieee)
+    {
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_2.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p1->r.v[i];
+      atmp[2*l+1]+=tmp*p1->i.v[i];
+      }
+    if (++l>lmax) return;
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l-1].f[1])*lam_1.v[i];
+      Tv tmp=lam_1.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p2->r.v[i];
+      atmp[2*l+1]+=tmp*p2->i.v[i];
+      }
+    if (++l>lmax) return;
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l-1].f[1])*lam_2.v[i];
+    if (Y(rescale)(&lam_1,&lam_2,&scale))
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGe)(scale,sharp_minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
+  Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
+  }
+
+static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
+  {
+  Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
+     acr=vload(creal(alm[1])), aci=vload(cimag(alm[1]));
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],agr,lw);
+    vfmaeq(px->qi.v[i],agi,lw);
+    vfmaeq(px->ur.v[i],acr,lw);
+    vfmaeq(px->ui.v[i],aci,lw);
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmseq(py->qr.v[i],aci,lx);
+    vfmaeq(py->qi.v[i],acr,lx);
+    vfmaeq(py->ur.v[i],agi,lx);
+    vfmseq(py->ui.v[i],agr,lx);
+    }
+  }
+
+static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
+  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
+  {
+  Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])),
+     acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1]));
+  Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])),
+     acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1]));
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lw1=r2p.v[i]+r2m.v[i];
+    Tv lx2=r1m.v[i]-r1p.v[i];
+    vfmaseq(p1->qr.v[i],agr1,lw1,aci2,lx2);
+    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
+    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
+    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
+    Tv lx1=r2m.v[i]-r2p.v[i];
+    Tv lw2=r1p.v[i]+r1m.v[i];
+    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
+    vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1);
+    vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1);
+    vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1);
+    }
+  }
+
+static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
+  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
+  const Tb * restrict rxm, dcmplx * restrict alm)
+  {
+  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lw=vadd(rxp->v[i],rxm->v[i]);
+    vfmaeq(agr,px->qr.v[i],lw);
+    vfmaeq(agi,px->qi.v[i],lw);
+    vfmaeq(acr,px->ur.v[i],lw);
+    vfmaeq(aci,px->ui.v[i],lw);
+    Tv lx=vsub(rxm->v[i],rxp->v[i]);
+    vfmseq(agr,py->ui.v[i],lx);
+    vfmaeq(agi,py->ur.v[i],lx);
+    vfmaeq(acr,py->qi.v[i],lx);
+    vfmseq(aci,py->qr.v[i],lx);
+    }
+  vhsum_cmplx_special(agr,agi,acr,aci,alm);
+  }
+
+NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax)
+  {
+  while (l<lmax)
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
+      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
+      }
+    Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
+      &alm[2*(l+1)]);
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
+      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
+      }
+    l+=2;
+    }
+  if (l==lmax)
+    Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
+  }
+
+NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
+  {
+  while (l<lmax)
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+    Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax)
+    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+  }
+
+NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2)
+  {
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+  while (!full_ieee)
+    {
+    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[2*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[2*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+      }
+    }
+
+  if (l>lmax) return;
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax);
+  }
+
+NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth,
+  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
+  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
+  {
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+  while (!full_ieee)
+    {
+    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
+    Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
+    if (++l>lmax) return;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
+    Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
+    if (++l>lmax) return;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
+  }
+
+static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
+  {
+  Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],ar,lw);
+    vfmaeq(px->qi.v[i],ai,lw);
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmaeq(py->ur.v[i],ai,lx);
+    vfmseq(py->ui.v[i],ar,lx);
+    }
+  }
+
+NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax)
+  {
+  while (l<lmax)
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+    Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
+    Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax)
+    Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
+  }
+
+NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2)
+  {
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 20*VLEN*nvec;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+  while (!full_ieee)
+    {
+    Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGe)(scalep,sharp_minscale)
+               && Y(TballGe)(scalem,sharp_minscale);
+      }
+    }
+
+  if (l>lmax) return;
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax);
+  }
+
+
+#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
+
+NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_ALM2MAP:
+    case SHARP_ALM2MAP_DERIV1:
+      {
+      if (job->spin==0)
+        {
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+
+          int skip=1;
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            }
+          if (!skip)
+            Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim)
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
+                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              job->phase[phas_idx] = r1+r2;
+              if (ispair[itot])
+                job->phase[phas_idx+1] = r1-r2;
+              }
+            }
+          }
+        }
+      else
+        {
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+          int skip=1;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            }
+          if (!skip)
+            (job->type==SHARP_ALM2MAP) ?
+              Y(calc_alm2map_spin  )
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
+              Y(calc_alm2map_deriv1)
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim)
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I,
+                             q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I,
+                             u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I,
+                             u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I;
+              job->phase[phas_idx] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[itot])
+                {
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1)
+                  { *phQ=-(*phQ); *phU=-(*phU); }
+                }
+              }
+            }
+          }
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_MAP2ALM:
+      {
+      if (job->spin==0)
+        {
+        Tv atmp[2*(gen->lmax+2)];
+        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+          int skip=1;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx ph1=job->phase[phas_idx];
+              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
+              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
+              }
+            }
+          if (!skip)
+            Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
+          }
+        {
+        int istart=m, istop=gen->lmax+1;
+        for(; istart<istop-2; istart+=2)
+          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
+        for(; istart<istop; istart++)
+          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
+        }
+        }
+      else
+        {
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+          int skip=1;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if (i+ith<ulim-llim)
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
+                     p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1)
+                { p2Q=-p2Q; p2U=-p2U; }
+              p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q);
+              p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U);
+              p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q);
+              p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U);
+              }
+            }
+          if (!skip)
+            Y(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+          }
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+static void Y(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  (job->type==SHARP_MAP2ALM) ?
+    Y(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
+    Y(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
+  }
+
+#undef VZERO
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 15190d7..1139ef8 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -46,12 +46,17 @@ typedef complex double dcmplx;
 #define CONCAT2(a,b) XCONCAT2(a,b)
 
 #define nvec 6
-#include "sharp_core_inchelper.c"
+#define Tb CONCAT2(Tb,nvec)
+#define Y(arg) CONCAT2(arg,nvec)
+#include "sharp_core_inc.c"
+
+#undef Y
+#undef Tb
 #undef nvec
 
 void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-  CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+  inner_loop_6(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
   }
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
deleted file mode 100644
index 364eef5..0000000
--- a/libsharp/sharp_core_inc2.c
+++ /dev/null
@@ -1,697 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core_inc2.c
- *  Type-dependent code for the computational core
- *
- *  Copyright (C) 2012-2017 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax)
-  {
-  while (l<=lmax)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
-      p1->r.v[i] += lam_2.v[i]*ar1;
-      p1->i.v[i] += lam_2.v[i]*ai1;
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
-      p2->r.v[i] += lam_1.v[i]*ar2;
-      p2->i.v[i] += lam_1.v[i]*ai2;
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void Z(map2alm_kernel) (const Tb cth,
-  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
-  {
-  while (l<=lmax)
-    {
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
-      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
-      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
-      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
-      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2)
-  {
-  int l,lmax=gen->lmax;
-  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
-
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-      vfmaeq(p1->r.v[i],tmp,ar);
-      vfmaeq(p1->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-      vfmaeq(p2->r.v[i],tmp,ar);
-      vfmaeq(p2->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
-  }
-
-NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tv *restrict atmp)
-  {
-  int lmax=gen->lmax;
-  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
-  int l=gen->m;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=lam_2.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p1->r.v[i];
-      atmp[2*l+1]+=tmp*p1->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l-1].f[1])*lam_1.v[i];
-      Tv tmp=lam_1.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p2->r.v[i];
-      atmp[2*l+1]+=tmp*p2->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l-1].f[1])*lam_2.v[i];
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
-  }
-
-static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
-  {
-  Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
-     acr=vload(creal(alm[1])), aci=vload(cimag(alm[1]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp.v[i],rxm.v[i]);
-    vfmaeq(px->qr.v[i],agr,lw);
-    vfmaeq(px->qi.v[i],agi,lw);
-    vfmaeq(px->ur.v[i],acr,lw);
-    vfmaeq(px->ui.v[i],aci,lw);
-    Tv lx=vsub(rxm.v[i],rxp.v[i]);
-    vfmseq(py->qr.v[i],aci,lx);
-    vfmaeq(py->qi.v[i],acr,lx);
-    vfmaeq(py->ur.v[i],agi,lx);
-    vfmseq(py->ui.v[i],agr,lx);
-    }
-  }
-
-static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
-  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
-  {
-  Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])),
-     acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1]));
-  Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])),
-     acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw1=r2p.v[i]+r2m.v[i];
-    Tv lx2=r1m.v[i]-r1p.v[i];
-    vfmaseq(p1->qr.v[i],agr1,lw1,aci2,lx2);
-    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
-    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
-    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
-    Tv lx1=r2m.v[i]-r2p.v[i];
-    Tv lw2=r1p.v[i]+r1m.v[i];
-    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
-    vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1);
-    vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1);
-    vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1);
-    }
-  }
-
-static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
-  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm)
-  {
-  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp->v[i],rxm->v[i]);
-    vfmaeq(agr,px->qr.v[i],lw);
-    vfmaeq(agi,px->qi.v[i],lw);
-    vfmaeq(acr,px->ur.v[i],lw);
-    vfmaeq(aci,px->ui.v[i],lw);
-    Tv lx=vsub(rxm->v[i],rxp->v[i]);
-    vfmseq(agr,py->ui.v[i],lx);
-    vfmaeq(agi,py->ur.v[i],lx);
-    vfmaeq(acr,py->qi.v[i],lx);
-    vfmseq(aci,py->qr.v[i],lx);
-    }
-  vhsum_cmplx_special(agr,agi,acr,aci,alm);
-  }
-
-NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
-      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
-      }
-    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
-      &alm[2*(l+1)]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
-      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
-  }
-
-NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
-  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
-    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
-  }
-
-NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[2*l]);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[2*l]);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax);
-  }
-
-NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
-  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
-    if (++l>lmax) return;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
-    if (++l>lmax) return;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
-  }
-
-static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
-  {
-  Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp.v[i],rxm.v[i]);
-    vfmaeq(px->qr.v[i],ar,lw);
-    vfmaeq(px->qi.v[i],ai,lw);
-    Tv lx=vsub(rxm.v[i],rxp.v[i]);
-    vfmaeq(py->ur.v[i],ai,lx);
-    vfmseq(py->ui.v[i],ar,lx);
-    }
-  }
-
-NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
-    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
-  }
-
-NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 20*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[l]);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[l]);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax);
-  }
-
-
-#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-
-NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_ALM2MAP:
-    case SHARP_ALM2MAP_DERIV1:
-      {
-      if (job->spin==0)
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-
-          int skip=1;
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
-                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
-              job->phase[phas_idx] = r1+r2;
-              if (ispair[itot])
-                job->phase[phas_idx+1] = r1-r2;
-              }
-            }
-          }
-        }
-      else
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            (job->type==SHARP_ALM2MAP) ?
-              Z(calc_alm2map_spin  )
-                (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
-              Z(calc_alm2map_deriv1)
-                (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I,
-                             q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I,
-                             u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I,
-                             u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I;
-              job->phase[phas_idx] = q1+q2;
-              job->phase[phas_idx+2] = u1+u2;
-              if (ispair[itot])
-                {
-                dcmplx *phQ = &(job->phase[phas_idx+1]),
-                       *phU = &(job->phase[phas_idx+3]);
-                *phQ = q1-q2;
-                *phU = u1-u2;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { *phQ=-(*phQ); *phU=-(*phU); }
-                }
-              }
-            }
-          }
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
-  }
-
-NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_MAP2ALM:
-      {
-      if (job->spin==0)
-        {
-        Tv atmp[2*(gen->lmax+2)];
-        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              dcmplx ph1=job->phase[phas_idx];
-              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
-              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
-              }
-            }
-          if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
-          }
-        {
-        int istart=m, istop=gen->lmax+1;
-        for(; istart<istop-2; istart+=2)
-          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
-        for(; istart<istop; istart++)
-          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
-        }
-        }
-      else
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              dcmplx p1Q=job->phase[phas_idx],
-                     p1U=job->phase[phas_idx+2],
-                     p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                     p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-              if ((gen->mhi-gen->m+gen->s)&1)
-                { p2Q=-p2Q; p2U=-p2U; }
-              p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q);
-              p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U);
-              p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q);
-              p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U);
-              }
-            }
-          if (!skip)
-            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-          }
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
-  }
-
-static void Z(inner_loop) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  (job->type==SHARP_MAP2ALM) ?
-    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
-    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
-  }
-
-#undef VZERO
diff --git a/libsharp/sharp_core_inchelper.c b/libsharp/sharp_core_inchelper.c
deleted file mode 100644
index c58cecc..0000000
--- a/libsharp/sharp_core_inchelper.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#define Tb CONCAT2(Tb,nvec)
-#define Y(arg) CONCAT2(arg,nvec)
-#include "sharp_core_inc.c"
-
-#define Z(arg) CONCAT2(arg,nvec)
-#include "sharp_core_inc2.c"
-#undef Z
-
-#undef Y
-#undef Tb

From 1e39d436c624cb74421ef71957d9969c5a007202 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 11 Dec 2018 11:12:34 +0100
Subject: [PATCH 15/85] de-macroizing

---
 libsharp/sharp_core_inc.c  | 354 ++++++++++++++++++-------------------
 libsharp/sharp_core_inc0.c |   8 +-
 2 files changed, 179 insertions(+), 183 deletions(-)

diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 60fbc6f..fa1f429 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -33,27 +33,27 @@ typedef struct
   { Tv v[nvec]; } Tb;
 
 typedef union
-  { Tb b; double s[VLEN*nvec]; } Y(Tbu);
+  { Tb b; double s[VLEN*nvec]; } Tbu;
 
 typedef struct
-  { Tb r, i; } Y(Tbri);
+  { Tb r, i; } Tbri;
 
 typedef struct
-  { Tb qr, qi, ur, ui; } Y(Tbqu);
+  { Tb qr, qi, ur, ui; } Tbqu;
 
 typedef struct
-  { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
+  { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
 
 typedef struct
-  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
+  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu;
 
 typedef union
-  { Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
+  { Tbri b; Tsri s; } Tburi;
 
 typedef union
-  { Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
+  { Tbqu b; Tsqu s; } Tbuqu;
 
-static inline Tb Y(Tbconst)(double val)
+static inline Tb Tbconst(double val)
   {
   Tv v=vload(val);
   Tb res;
@@ -61,16 +61,16 @@ static inline Tb Y(Tbconst)(double val)
   return res;
   }
 
-static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
+static inline void Tbmuleq1(Tb * restrict a, double b)
   { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
 
-static inline Tb Y(Tbprod)(Tb a, Tb b)
+static inline Tb Tbprod(Tb a, Tb b)
   { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
 
-static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
+static inline void Tbmuleq(Tb * restrict a, Tb b)
   { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
 
-static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
+static void Tbnormalize (Tb * restrict val, Tb * restrict scale,
   double maxval)
   {
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
@@ -94,7 +94,7 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
     }
   }
 
-NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
+NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
   Tb * restrict resd, Tb * restrict ress)
   {
   Tv vminv=vload(powlimit[npow]);
@@ -103,7 +103,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
     mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
   if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
     {
-    Tb res=Y(Tbconst)(1.);
+    Tb res=Tbconst(1.);
     do
       {
       if (npow&1)
@@ -118,12 +118,12 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
       }
     while(npow>>=1);
     *resd=res;
-    *ress=Y(Tbconst)(0.);
+    *ress=Tbconst(0.);
     }
   else
     {
-    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
-    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+    Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.);
+    Tbnormalize(&val,&scaleint,sharp_fbighalf);
     do
       {
       if (npow&1)
@@ -133,14 +133,14 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
           vmuleq(res.v[i],val.v[i]);
           vaddeq(scale.v[i],scaleint.v[i]);
           }
-        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
+        Tbnormalize(&res,&scale,sharp_fbighalf);
         }
       for (int i=0; i<nvec; ++i)
         {
         vmuleq(val.v[i],val.v[i]);
         vaddeq(scaleint.v[i],scaleint.v[i]);
         }
-      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+      Tbnormalize(&val,&scaleint,sharp_fbighalf);
       }
     while(npow>>=1);
     *resd=res;
@@ -148,7 +148,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
     }
   }
 
-static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
+static inline int rescale(Tb * restrict lam1, Tb * restrict lam2,
   Tb * restrict scale)
   {
   int did_scale=0;
@@ -166,7 +166,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
   return did_scale;
   }
 
-static inline int Y(TballLt)(Tb a,double b)
+static inline int TballLt(Tb a,double b)
   {
   Tv vb=vload(b);
   Tm res=vlt(a.v[0],vb);
@@ -174,7 +174,7 @@ static inline int Y(TballLt)(Tb a,double b)
     res=vand_mask(res,vlt(a.v[i],vb));
   return vallTrue(res);
   }
-static inline int Y(TballGt)(Tb a,double b)
+static inline int TballGt(Tb a,double b)
   {
   Tv vb=vload(b);
   Tm res=vgt(a.v[0],vb);
@@ -182,7 +182,7 @@ static inline int Y(TballGt)(Tb a,double b)
     res=vand_mask(res,vgt(a.v[i],vb));
   return vallTrue(res);
   }
-static inline int Y(TballGe)(Tb a,double b)
+static inline int TballGe(Tb a,double b)
   {
   Tv vb=vload(b);
   Tm res=vge(a.v[0],vb);
@@ -191,10 +191,10 @@ static inline int Y(TballGe)(Tb a,double b)
   return vallTrue(res);
   }
 
-static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
+static void getCorfac(Tb scale, Tb * restrict corfac,
   const double * restrict cf)
   {
-  Y(Tbu) sc, corf;
+  Tbu sc, corf;
   sc.b=scale;
   for (int i=0; i<VLEN*nvec; ++i)
     corf.s[i] = (sc.s[i]<sharp_minscale) ?
@@ -202,17 +202,17 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
   *corfac=corf.b;
   }
 
-NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
+NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
   const sharp_Ylmgen_C * restrict gen)
   {
   int l=gen->m;
-  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
-  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
-  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
+  Tb lam_1=Tbconst(0.), lam_2, scale;
+  mypow(sth,l,gen->powlimit,&lam_2,&scale);
+  Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
+  Tbnormalize(&lam_2,&scale,sharp_ftol);
 
-  int below_limit = Y(TballLt)(scale,sharp_limscale);
+  int below_limit = TballLt(scale,sharp_limscale);
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@@ -223,14 +223,14 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
       lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                  - vload(gen->rf[l+1].f[1])*lam_2.v[i];
       }
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      below_limit = Y(TballLt)(scale,sharp_limscale);
+    if (rescale(&lam_1,&lam_2,&scale))
+      below_limit = TballLt(scale,sharp_limscale);
     l+=2;
     }
   *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
   }
 
-static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
+static inline void rec_step(Tb * restrict rxp, Tb * restrict rxm,
   Tb * restrict ryp, Tb * restrict rym, const Tb cth,
   const sharp_ylmgen_dbl3 fx)
   {
@@ -242,7 +242,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
     }
   }
 
-static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
+static void iter_to_ieee_spin(const Tb cth, const Tb sth, int *l_,
   Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
   Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
   {
@@ -262,13 +262,13 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
     }
 
   Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
-  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
-  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
-  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
+  mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+  mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+  mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+  mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
 
   Tb rec2p, rec2m, scalep, scalem;
-  Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
+  Tb rec1p=Tbconst(0.), rec1m=Tbconst(0.);
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   for (int i=0; i<nvec; ++i)
@@ -278,8 +278,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
     rec2m.v[i]=vmul(prefac,csp.v[i]);
     scalem.v[i]=vadd(prescale,csps.v[i]);
     }
-  Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
-  Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
+  Tbnormalize(&rec2m,&scalem,sharp_fbighalf);
+  Tbnormalize(&rec2p,&scalep,sharp_fbighalf);
   for (int i=0; i<nvec; ++i)
     {
     rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
@@ -293,21 +293,21 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
     if (gen->s&1)
       rec2p.v[i]=vneg(rec2p.v[i]);
     }
-  Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
-  Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);
+  Tbnormalize(&rec2m,&scalem,sharp_ftol);
+  Tbnormalize(&rec2p,&scalep,sharp_ftol);
 
   int l=gen->mhi;
 
-  int below_limit = Y(TballLt)(scalep,sharp_limscale)
-                 && Y(TballLt)(scalem,sharp_limscale);
+  int below_limit = TballLt(scalep,sharp_limscale)
+                 && TballLt(scalem,sharp_limscale);
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      below_limit = Y(TballLt)(scalep,sharp_limscale)
-                 && Y(TballLt)(scalem,sharp_limscale);
+    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
+    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
+    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
+      below_limit = TballLt(scalep,sharp_limscale)
+                 && TballLt(scalem,sharp_limscale);
     l+=2;
     }
 
@@ -317,8 +317,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
   }
 
 
-NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
+  Tbri * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax)
   {
@@ -341,8 +341,8 @@ NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
     }
   }
 
-NOINLINE static void Y(map2alm_kernel) (const Tb cth,
-  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+NOINLINE static void map2alm_kernel (const Tb cth,
+  const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
   {
   while (l<=lmax)
@@ -362,22 +362,22 @@ NOINLINE static void Y(map2alm_kernel) (const Tb cth,
     }
   }
 
-NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2)
+NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
+  Tbri * restrict p2)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
   Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
+  getCorfac(scale,&corfac,gen->cf);
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
+  int full_ieee = TballGe(scale,sharp_minscale);
   while (!full_ieee)
     {
     {
@@ -406,34 +406,34 @@ NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth,
     r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
+    if (rescale(&lam_1,&lam_2,&scale))
       {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
+      getCorfac(scale,&corfac,gen->cf);
+      full_ieee = TballGe(scale,sharp_minscale);
       }
     }
   if (l>lmax) return;
 
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
+  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
   }
 
-NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tv *restrict atmp)
+NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
+  const Tbri * restrict p2, Tv *restrict atmp)
   {
   int lmax=gen->lmax;
-  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
+  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
   int l=gen->m;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
+  getCorfac(scale,&corfac,gen->cf);
+  int full_ieee = TballGe(scale,sharp_minscale);
   while (!full_ieee)
     {
     for (int i=0; i<nvec; ++i)
@@ -455,18 +455,18 @@ NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth,
     for (int i=0; i<nvec; ++i)
       lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
                  - vload(rf[l-1].f[1])*lam_2.v[i];
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
+    if (rescale(&lam_1,&lam_2,&scale))
       {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
+      getCorfac(scale,&corfac,gen->cf);
+      full_ieee = TballGe(scale,sharp_minscale);
       }
     }
 
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
+  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
+  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
   }
 
-static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+static inline void saddstep(Tbqu * restrict px, Tbqu * restrict py,
   const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
   {
   Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
@@ -486,7 +486,7 @@ static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     }
   }
 
-static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
+static inline void saddstepb(Tbqu * restrict p1, Tbqu * restrict p2,
   const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
   const dcmplx * restrict alm1, const dcmplx * restrict alm2)
   {
@@ -511,8 +511,8 @@ static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
     }
   }
 
-static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
-  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
+static inline void saddstep2(const Tbqu * restrict px,
+  const Tbqu * restrict py, const Tb * restrict rxp,
   const Tb * restrict rxm, dcmplx * restrict alm)
   {
   Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
@@ -532,8 +532,8 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
   vhsum_cmplx_special(agr,agi,acr,aci,alm);
   }
 
-NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
+  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax)
   {
@@ -546,7 +546,7 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
       rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
       rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
       }
-    Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
+    saddstepb(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
       &alm[2*(l+1)]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
@@ -558,11 +558,11 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     l+=2;
     }
   if (l==lmax)
-    Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
+    saddstep(p1, p2, rec2p, rec2m, &alm[2*l]);
   }
 
-NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
-  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+NOINLINE static void map2alm_spin_kernel(Tb cth, const Tbqu * restrict p1,
+  const Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
   {
   while (l<lmax)
@@ -576,8 +576,8 @@ NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
       rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                         vmul(fx2,rec1m.v[i]));
       }
-    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
-    Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
+    saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+    saddstep2(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
@@ -590,16 +590,16 @@ NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
     l+=2;
     }
   if (l==lmax)
-    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+    saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
   }
 
-NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2)
+NOINLINE static void calc_alm2map_spin(const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
+  Tbqu * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
+  iter_to_ieee_spin
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
@@ -607,45 +607,45 @@ NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth,
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  getCorfac(scalep,&corfacp,gen->cf);
+  getCorfac(scalem,&corfacm,gen->cf);
   const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+  int full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
   while (!full_ieee)
     {
-    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+    saddstep(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
       &alm[2*l]);
     if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    saddstep(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
       &alm[2*l]);
     if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
       {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+      getCorfac(scalep,&corfacp,gen->cf);
+      getCorfac(scalem,&corfacm,gen->cf);
+      full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
       }
     }
 
   if (l>lmax) return;
 
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
+  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
+  alm2map_spin_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
     lmax);
   }
 
-NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth,
+NOINLINE static void calc_map2alm_spin (Tb cth, Tb sth,
   const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
+  const Tbqu * restrict p1, const Tbqu * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
+  iter_to_ieee_spin
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
@@ -653,36 +653,36 @@ NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth,
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  getCorfac(scalep,&corfacp,gen->cf);
+  getCorfac(scalem,&corfacm,gen->cf);
   dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+  int full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
   while (!full_ieee)
     {
-    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
+    Tb t1=Tbprod(rec2p,corfacp), t2=Tbprod(rec2m,corfacm);
+    saddstep2(p1, p2, &t1, &t2, &alm[2*l]);
     if (++l>lmax) return;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
+    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    t1=Tbprod(rec1p,corfacp); t2=Tbprod(rec1m,corfacm);
+    saddstep2(p2, p1, &t1, &t2, &alm[2*l]);
     if (++l>lmax) return;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
       {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+      getCorfac(scalep,&corfacp,gen->cf);
+      getCorfac(scalem,&corfacm,gen->cf);
+      full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
       }
     }
 
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
+  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
+  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
+  map2alm_spin_kernel(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
   }
 
-static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+static inline void saddstep_d(Tbqu * restrict px, Tbqu * restrict py,
   const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
   {
   Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
@@ -697,8 +697,8 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     }
   }
 
-NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+NOINLINE static void alm2map_deriv1_kernel(Tb cth, Tbqu * restrict p1,
+  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax)
   {
@@ -713,8 +713,8 @@ NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
       rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                         vmul(fx2,rec1m.v[i]));
       }
-    Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
-    Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
+    saddstep_d(p1,p2,rec2p,rec2m,&alm[l]);
+    saddstep_d(p2,p1,rec1p,rec1m,&alm[l+1]);
     fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
@@ -727,16 +727,16 @@ NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     l+=2;
     }
   if (l==lmax)
-    Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
+    saddstep_d(p1, p2, rec2p, rec2m, &alm[l]);
   }
 
-NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2)
+NOINLINE static void calc_alm2map_deriv1(const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
+  Tbqu * restrict p2)
   {
   int l, lmax=gen->lmax;
   Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
+  iter_to_ieee_spin
     (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
   job->opcnt += (l-gen->m) * 10*VLEN*nvec;
   if (l>lmax) return;
@@ -744,42 +744,42 @@ NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
 
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
   Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  getCorfac(scalep,&corfacp,gen->cf);
+  getCorfac(scalem,&corfacm,gen->cf);
   const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+  int full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
   while (!full_ieee)
     {
-    Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+    saddstep_d(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
       &alm[l]);
     if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    saddstep_d(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
       &alm[l]);
     if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
       {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
+      getCorfac(scalep,&corfacp,gen->cf);
+      getCorfac(scalem,&corfacm,gen->cf);
+      full_ieee = TballGe(scalep,sharp_minscale)
+               && TballGe(scalem,sharp_minscale);
       }
     }
 
   if (l>lmax) return;
 
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
+  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
+  alm2map_deriv1_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
     lmax);
   }
 
 
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
-NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
+NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
@@ -796,8 +796,8 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
+          Tburi p1,p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
 
           int skip=1;
           for (int i=0; i<nval; ++i)
@@ -808,7 +808,7 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
             cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
             }
           if (!skip)
-            Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b);
 
           for (int i=0; i<nval; ++i)
             {
@@ -829,8 +829,8 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
+          Tbuqu p1,p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
           int skip=1;
 
           for (int i=0; i<nval; ++i)
@@ -842,9 +842,9 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
             }
           if (!skip)
             (job->type==SHARP_ALM2MAP) ?
-              Y(calc_alm2map_spin  )
+              calc_alm2map_spin
                 (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
-              Y(calc_alm2map_deriv1)
+              calc_alm2map_deriv1
                 (cth.b,sth.b,gen,job,&p1.b,&p2.b);
 
           for (int i=0; i<nval; ++i)
@@ -882,7 +882,7 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
     }
   }
 
-NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
+NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
@@ -900,8 +900,8 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
         memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
+          Tburi p1, p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
           int skip=1;
 
           for (int i=0; i<nval; ++i)
@@ -920,7 +920,7 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
               }
             }
           if (!skip)
-            Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
+            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
           }
         {
         int istart=m, istop=gen->lmax+1;
@@ -934,8 +934,8 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
         {
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
-          Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
+          Tbuqu p1, p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
           int skip=1;
 
           for (int i=0; i<nval; ++i)
@@ -960,7 +960,7 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
               }
             }
           if (!skip)
-            Y(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+            calc_map2alm_spin (cth.b,sth.b,gen,job,&p1.b,&p2.b);
           }
         }
       break;
@@ -973,13 +973,13 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
     }
   }
 
-static void Y(inner_loop) (sharp_job *job, const int *ispair,
+static void inner_loop_ (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
   (job->type==SHARP_MAP2ALM) ?
-    Y(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
-    Y(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
+    inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
+    inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
   }
 
 #undef VZERO
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 1139ef8..b209cae 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -42,21 +42,17 @@ typedef complex double dcmplx;
 
 #define XCONCATX(a,b) a##b
 #define CONCATX(a,b) XCONCATX(a,b)
-#define XCONCAT2(a,b) a##_##b
-#define CONCAT2(a,b) XCONCAT2(a,b)
 
 #define nvec 6
-#define Tb CONCAT2(Tb,nvec)
-#define Y(arg) CONCAT2(arg,nvec)
+#define Y(arg) arg
 #include "sharp_core_inc.c"
 
 #undef Y
-#undef Tb
 #undef nvec
 
 void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim)
   {
-  inner_loop_6(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+  inner_loop_(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
   }

From 0976aabbad9d9651c90b7c4adfb7139ad7d9c664 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 11 Dec 2018 14:27:23 +0100
Subject: [PATCH 16/85] disable spin>0 for the moment

---
 libsharp/sharp_core_inc.c  | 472 +------------------------------------
 libsharp/sharp_testsuite.c |   4 +
 2 files changed, 6 insertions(+), 470 deletions(-)

diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index fa1f429..dd3ac4b 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -230,92 +230,6 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
   }
 
-static inline void rec_step(Tb * restrict rxp, Tb * restrict rxm,
-  Tb * restrict ryp, Tb * restrict rym, const Tb cth,
-  const sharp_ylmgen_dbl3 fx)
-  {
-  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
-  for (int i=0; i<nvec; ++i)
-    {
-    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
-    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
-    }
-  }
-
-static void iter_to_ieee_spin(const Tb cth, const Tb sth, int *l_,
-  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
-  Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
-  {
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb cth2, sth2;
-  for (int i=0; i<nvec; ++i)
-    {
-    cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
-    cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
-    sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
-    sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
-    Tm mask=vlt(sth.v[i],vzero);
-    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
-    vmuleq_mask(cmask,cth2.v[i],vload(-1.));
-    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
-    vmuleq_mask(smask,sth2.v[i],vload(-1.));
-    }
-
-  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-  mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
-  mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
-  mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
-  mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
-
-  Tb rec2p, rec2m, scalep, scalem;
-  Tb rec1p=Tbconst(0.), rec1m=Tbconst(0.);
-  Tv prefac=vload(gen->prefac[gen->m]),
-     prescale=vload(gen->fscale[gen->m]);
-  for (int i=0; i<nvec; ++i)
-    {
-    rec2p.v[i]=vmul(prefac,ccp.v[i]);
-    scalep.v[i]=vadd(prescale,ccps.v[i]);
-    rec2m.v[i]=vmul(prefac,csp.v[i]);
-    scalem.v[i]=vadd(prescale,csps.v[i]);
-    }
-  Tbnormalize(&rec2m,&scalem,sharp_fbighalf);
-  Tbnormalize(&rec2p,&scalep,sharp_fbighalf);
-  for (int i=0; i<nvec; ++i)
-    {
-    rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
-    scalep.v[i]=vadd(scalep.v[i],ssps.v[i]);
-    rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
-    scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
-    if (gen->preMinus_p)
-      rec2p.v[i]=vneg(rec2p.v[i]);
-    if (gen->preMinus_m)
-      rec2m.v[i]=vneg(rec2m.v[i]);
-    if (gen->s&1)
-      rec2p.v[i]=vneg(rec2p.v[i]);
-    }
-  Tbnormalize(&rec2m,&scalem,sharp_ftol);
-  Tbnormalize(&rec2p,&scalep,sharp_ftol);
-
-  int l=gen->mhi;
-
-  int below_limit = TballLt(scalep,sharp_limscale)
-                 && TballLt(scalem,sharp_limscale);
-  while (below_limit)
-    {
-    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
-    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
-    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
-      below_limit = TballLt(scalep,sharp_limscale)
-                 && TballLt(scalem,sharp_limscale);
-    l+=2;
-    }
-
-  *l_=l;
-  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
-  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
-  }
-
 
 NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
   Tbri * restrict p2, Tb lam_1, Tb lam_2,
@@ -466,316 +380,6 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
   }
 
-static inline void saddstep(Tbqu * restrict px, Tbqu * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
-  {
-  Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
-     acr=vload(creal(alm[1])), aci=vload(cimag(alm[1]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp.v[i],rxm.v[i]);
-    vfmaeq(px->qr.v[i],agr,lw);
-    vfmaeq(px->qi.v[i],agi,lw);
-    vfmaeq(px->ur.v[i],acr,lw);
-    vfmaeq(px->ui.v[i],aci,lw);
-    Tv lx=vsub(rxm.v[i],rxp.v[i]);
-    vfmseq(py->qr.v[i],aci,lx);
-    vfmaeq(py->qi.v[i],acr,lx);
-    vfmaeq(py->ur.v[i],agi,lx);
-    vfmseq(py->ui.v[i],agr,lx);
-    }
-  }
-
-static inline void saddstepb(Tbqu * restrict p1, Tbqu * restrict p2,
-  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
-  {
-  Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])),
-     acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1]));
-  Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])),
-     acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw1=r2p.v[i]+r2m.v[i];
-    Tv lx2=r1m.v[i]-r1p.v[i];
-    vfmaseq(p1->qr.v[i],agr1,lw1,aci2,lx2);
-    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
-    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
-    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
-    Tv lx1=r2m.v[i]-r2p.v[i];
-    Tv lw2=r1p.v[i]+r1m.v[i];
-    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
-    vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1);
-    vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1);
-    vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1);
-    }
-  }
-
-static inline void saddstep2(const Tbqu * restrict px,
-  const Tbqu * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm)
-  {
-  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp->v[i],rxm->v[i]);
-    vfmaeq(agr,px->qr.v[i],lw);
-    vfmaeq(agi,px->qi.v[i],lw);
-    vfmaeq(acr,px->ur.v[i],lw);
-    vfmaeq(aci,px->ui.v[i],lw);
-    Tv lx=vsub(rxm->v[i],rxp->v[i]);
-    vfmseq(agr,py->ui.v[i],lx);
-    vfmaeq(agi,py->ur.v[i],lx);
-    vfmaeq(acr,py->qi.v[i],lx);
-    vfmseq(aci,py->qr.v[i],lx);
-    }
-  vhsum_cmplx_special(agr,agi,acr,aci,alm);
-  }
-
-NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
-  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
-      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
-      }
-    saddstepb(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
-      &alm[2*(l+1)]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
-      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    saddstep(p1, p2, rec2p, rec2m, &alm[2*l]);
-  }
-
-NOINLINE static void map2alm_spin_kernel(Tb cth, const Tbqu * restrict p1,
-  const Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
-    saddstep2(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
-  }
-
-NOINLINE static void calc_alm2map_spin(const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
-  Tbqu * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  iter_to_ieee_spin
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  getCorfac(scalep,&corfacp,gen->cf);
-  getCorfac(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    saddstep(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
-      &alm[2*l]);
-    if (++l>lmax) break;
-    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    saddstep(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
-      &alm[2*l]);
-    if (++l>lmax) break;
-    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
-      {
-      getCorfac(scalep,&corfacp,gen->cf);
-      getCorfac(scalem,&corfacm,gen->cf);
-      full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
-  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
-  alm2map_spin_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax);
-  }
-
-NOINLINE static void calc_map2alm_spin (Tb cth, Tb sth,
-  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Tbqu * restrict p1, const Tbqu * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  iter_to_ieee_spin
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  getCorfac(scalep,&corfacp,gen->cf);
-  getCorfac(scalem,&corfacm,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Tb t1=Tbprod(rec2p,corfacp), t2=Tbprod(rec2m,corfacm);
-    saddstep2(p1, p2, &t1, &t2, &alm[2*l]);
-    if (++l>lmax) return;
-    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    t1=Tbprod(rec1p,corfacp); t2=Tbprod(rec1m,corfacm);
-    saddstep2(p2, p1, &t1, &t2, &alm[2*l]);
-    if (++l>lmax) return;
-    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
-      {
-      getCorfac(scalep,&corfacp,gen->cf);
-      getCorfac(scalem,&corfacm,gen->cf);
-      full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-      }
-    }
-
-  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
-  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
-  map2alm_spin_kernel(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
-  }
-
-static inline void saddstep_d(Tbqu * restrict px, Tbqu * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
-  {
-  Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
-  for (int i=0; i<nvec; ++i)
-    {
-    Tv lw=vadd(rxp.v[i],rxm.v[i]);
-    vfmaeq(px->qr.v[i],ar,lw);
-    vfmaeq(px->qi.v[i],ai,lw);
-    Tv lx=vsub(rxm.v[i],rxp.v[i]);
-    vfmaeq(py->ur.v[i],ai,lx);
-    vfmseq(py->ui.v[i],ar,lx);
-    }
-  }
-
-NOINLINE static void alm2map_deriv1_kernel(Tb cth, Tbqu * restrict p1,
-  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    saddstep_d(p1,p2,rec2p,rec2m,&alm[l]);
-    saddstep_d(p2,p1,rec1p,rec1m,&alm[l+1]);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    saddstep_d(p1, p2, rec2p, rec2m, &alm[l]);
-  }
-
-NOINLINE static void calc_alm2map_deriv1(const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
-  Tbqu * restrict p2)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  iter_to_ieee_spin
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 20*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  getCorfac(scalep,&corfacp,gen->cf);
-  getCorfac(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    saddstep_d(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
-      &alm[l]);
-    if (++l>lmax) break;
-    rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    saddstep_d(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
-      &alm[l]);
-    if (++l>lmax) break;
-    rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
-      {
-      getCorfac(scalep,&corfacp,gen->cf);
-      getCorfac(scalem,&corfacm,gen->cf);
-      full_ieee = TballGe(scalep,sharp_minscale)
-               && TballGe(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
-  Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
-  alm2map_deriv1_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax);
-  }
-
 
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
@@ -827,50 +431,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         }
       else
         {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Tbuqu p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            (job->type==SHARP_ALM2MAP) ?
-              calc_alm2map_spin
-                (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
-              calc_alm2map_deriv1
-                (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I,
-                             q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I,
-                             u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I,
-                             u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I;
-              job->phase[phas_idx] = q1+q2;
-              job->phase[phas_idx+2] = u1+u2;
-              if (ispair[itot])
-                {
-                dcmplx *phQ = &(job->phase[phas_idx+1]),
-                       *phU = &(job->phase[phas_idx+3]);
-                *phQ = q1-q2;
-                *phU = u1-u2;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { *phQ=-(*phQ); *phU=-(*phU); }
-                }
-              }
-            }
-          }
+        UTIL_FAIL("only spin==0 allowed at the moment");
         }
       break;
       }
@@ -932,36 +493,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         }
       else
         {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Tbuqu p1, p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              dcmplx p1Q=job->phase[phas_idx],
-                     p1U=job->phase[phas_idx+2],
-                     p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                     p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-              if ((gen->mhi-gen->m+gen->s)&1)
-                { p2Q=-p2Q; p2U=-p2U; }
-              p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q);
-              p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U);
-              p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q);
-              p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U);
-              }
-            }
-          if (!skip)
-            calc_map2alm_spin (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-          }
+        UTIL_FAIL("only spin==0 allowed at the moment");
         }
       break;
       }
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index c08fb20..26b9f92 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -376,6 +376,7 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
     "error");
 
+#if 0
   sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
     NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),
@@ -420,6 +421,7 @@ static void check_sign_scale(void)
     "error");
   UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
     "error");
+#endif
 
   DEALLOC2D(map);
   DEALLOC2D(alm);
@@ -503,10 +505,12 @@ static void sharp_acctest(void)
   for (int nv=1; nv<=6; ++nv)
     {
     check_accuracy(ginfo,ainfo,0,nv);
+#if 0
     check_accuracy(ginfo,ainfo,1,nv);
     check_accuracy(ginfo,ainfo,2,nv);
     check_accuracy(ginfo,ainfo,3,nv);
     check_accuracy(ginfo,ainfo,30,nv);
+#endif
     }
   sharp_destroy_alm_info(ainfo);
   sharp_destroy_geom_info(ginfo);

From cd7163d48518e924a735dfa23bf8427b27a206ef Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 11 Dec 2018 15:08:37 +0100
Subject: [PATCH 17/85] simplifications

---
 Makefile.am                |   5 -
 libsharp/sharp.c           |   3 +-
 libsharp/sharp_core.c      | 526 ++++++++++++++++++++++++++++++++++---
 libsharp/sharp_core.h      |   1 +
 libsharp/sharp_core_avx.c  |  10 -
 libsharp/sharp_core_inc.c  | 517 ------------------------------------
 libsharp/sharp_core_inc0.c |  58 ----
 libsharp/sharp_testsuite.c |   3 +-
 8 files changed, 499 insertions(+), 624 deletions(-)
 delete mode 100644 libsharp/sharp_core_avx.c
 delete mode 100644 libsharp/sharp_core_inc.c
 delete mode 100644 libsharp/sharp_core_inc0.c

diff --git a/Makefile.am b/Makefile.am
index 7a035ba..c738f29 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -11,7 +11,6 @@ src_sharp = \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_announce.c \
   libsharp/sharp_core.c \
-  libsharp/sharp_core_avx.c \
   libsharp/sharp_geomhelpers.c \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
@@ -32,10 +31,6 @@ include_HEADERS = \
   libsharp/sharp_almhelpers.h \
   libsharp/sharp_cxx.h
 
-EXTRA_DIST = \
-  libsharp/sharp_core_inc0.c \
-  libsharp/sharp_core_inc.c
-
 libsharp_la_SOURCES = $(src_sharp)
 
 check_PROGRAMS = sharp_testsuite
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 943bd79..f312fc3 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -36,7 +36,6 @@
 #include "sharp_internal.h"
 #include "c_utils.h"
 #include "sharp_core.h"
-#include "sharp_vecutil.h"
 #include "walltime_c.h"
 #include "sharp_almhelpers.h"
 #include "sharp_geomhelpers.h"
@@ -854,7 +853,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
   init_output (job);
 
   int nchunks, chunksize;
-  get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize);
+  get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(),&nchunks,&chunksize);
 //FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
 
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 1d6618d..7038b38 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -29,46 +29,512 @@
  *  \author Martin Reinecke
  */
 
-#define ARCH _default
-#include "sharp_core_inc0.c"
-#undef ARCH
+#include <complex.h>
+#include <math.h>
+#include <string.h>
+#include "sharp_vecsupport.h"
+#include "sharp_complex_hacks.h"
+#include "sharp.h"
+#include "sharp_core.h"
+#include "c_utils.h"
 
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+typedef complex double dcmplx;
 
-static int have_avx(void)
+#define nvec (128/VLEN)
+typedef struct
+  { Tv v[nvec]; } Tb;
+
+typedef union
+  { Tb b; double s[VLEN*nvec]; } Tbu;
+
+typedef struct
+  { Tb r, i; } Tbri;
+
+typedef struct
+  { Tb qr, qi, ur, ui; } Tbqu;
+
+typedef struct
+  { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
+
+typedef struct
+  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu;
+
+typedef union
+  { Tbri b; Tsri s; } Tburi;
+
+typedef union
+  { Tbqu b; Tsqu s; } Tbuqu;
+
+static inline Tb Tbconst(double val)
   {
-  static int res=-1;
-  if (res<0)
-    {
-    __builtin_cpu_init();
-    res = __builtin_cpu_supports("avx");
-    }
+  Tv v=vload(val);
+  Tb res;
+  for (int i=0; i<nvec; ++i) res.v[i]=v;
   return res;
   }
 
-void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim);
-#endif
+static inline void Tbmuleq1(Tb * restrict a, double b)
+  { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
 
-void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim)
+static inline Tb Tbprod(Tb a, Tb b)
+  { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
+
+static inline void Tbmuleq(Tb * restrict a, Tb b)
+  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
+
+static void Tbnormalize (Tb * restrict val, Tb * restrict scale,
+  double maxval)
   {
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-  if (have_avx())
-    inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
-  else
-#endif
-    inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
+  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
+  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
+  for (int i=0;i<nvec; ++i)
+    {
+    Tm mask = vgt(vabs(val->v[i]),vfmax);
+    while (vanyTrue(mask))
+      {
+      vmuleq_mask(mask,val->v[i],vfsmall);
+      vaddeq_mask(mask,scale->v[i],vone);
+      mask = vgt(vabs(val->v[i]),vfmax);
+      }
+    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
+    while (vanyTrue(mask))
+      {
+      vmuleq_mask(mask,val->v[i],vfbig);
+      vsubeq_mask(mask,scale->v[i],vone);
+      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
+      }
+    }
   }
 
+NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
+  Tb * restrict resd, Tb * restrict ress)
+  {
+  Tv vminv=vload(powlimit[npow]);
+  Tm mask = vlt(vabs(val.v[0]),vminv);
+  for (int i=1;i<nvec; ++i)
+    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
+  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
+    {
+    Tb res=Tbconst(1.);
+    do
+      {
+      if (npow&1)
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vmuleq(val.v[i],val.v[i]);
+          }
+      else
+        for (int i=0; i<nvec; ++i)
+          vmuleq(val.v[i],val.v[i]);
+      }
+    while(npow>>=1);
+    *resd=res;
+    *ress=Tbconst(0.);
+    }
+  else
+    {
+    Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.);
+    Tbnormalize(&val,&scaleint,sharp_fbighalf);
+    do
+      {
+      if (npow&1)
+        {
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vaddeq(scale.v[i],scaleint.v[i]);
+          }
+        Tbnormalize(&res,&scale,sharp_fbighalf);
+        }
+      for (int i=0; i<nvec; ++i)
+        {
+        vmuleq(val.v[i],val.v[i]);
+        vaddeq(scaleint.v[i],scaleint.v[i]);
+        }
+      Tbnormalize(&val,&scaleint,sharp_fbighalf);
+      }
+    while(npow>>=1);
+    *resd=res;
+    *ress=scale;
+    }
+  }
+
+static inline int rescale(Tb * restrict lam1, Tb * restrict lam2,
+  Tb * restrict scale)
+  {
+  int did_scale=0;
+  for (int i=0;i<nvec; ++i)
+    {
+    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
+    if (vanyTrue(mask))
+      {
+      did_scale=1;
+      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
+      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
+      vaddeq_mask(mask,scale->v[i],vone);
+      }
+    }
+  return did_scale;
+  }
+
+static inline int TballLt(Tb a,double b)
+  {
+  Tv vb=vload(b);
+  Tm res=vlt(a.v[0],vb);
+  for (int i=1; i<nvec; ++i)
+    res=vand_mask(res,vlt(a.v[i],vb));
+  return vallTrue(res);
+  }
+static inline int TballGt(Tb a,double b)
+  {
+  Tv vb=vload(b);
+  Tm res=vgt(a.v[0],vb);
+  for (int i=1; i<nvec; ++i)
+    res=vand_mask(res,vgt(a.v[i],vb));
+  return vallTrue(res);
+  }
+static inline int TballGe(Tb a,double b)
+  {
+  Tv vb=vload(b);
+  Tm res=vge(a.v[0],vb);
+  for (int i=1; i<nvec; ++i)
+    res=vand_mask(res,vge(a.v[i],vb));
+  return vallTrue(res);
+  }
+
+static void getCorfac(Tb scale, Tb * restrict corfac,
+  const double * restrict cf)
+  {
+  Tbu sc, corf;
+  sc.b=scale;
+  for (int i=0; i<VLEN*nvec; ++i)
+    corf.s[i] = (sc.s[i]<sharp_minscale) ?
+      0. : cf[(int)(sc.s[i])-sharp_minscale];
+  *corfac=corf.b;
+  }
+
+NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
+  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
+  const sharp_Ylmgen_C * restrict gen)
+  {
+  int l=gen->m;
+  Tb lam_1=Tbconst(0.), lam_2, scale;
+  mypow(sth,l,gen->powlimit,&lam_2,&scale);
+  Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
+  Tbnormalize(&lam_2,&scale,sharp_ftol);
+
+  int below_limit = TballLt(scale,sharp_limscale);
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(gen->rf[l].f[1])*lam_1.v[i];
+      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
+      }
+    if (rescale(&lam_1,&lam_2,&scale))
+      below_limit = TballLt(scale,sharp_limscale);
+    l+=2;
+    }
+  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
+  }
+
+
+NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
+  Tbri * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  int l, int lmax)
+  {
+  while (l<=lmax)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      p1->r.v[i] += lam_2.v[i]*ar1;
+      p1->i.v[i] += lam_2.v[i]*ai1;
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      p2->r.v[i] += lam_1.v[i]*ar2;
+      p2->i.v[i] += lam_1.v[i]*ai2;
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void map2alm_kernel (const Tb cth,
+  const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
+  {
+  while (l<=lmax)
+    {
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
+      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
+      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
+  Tbri * restrict p2)
+  {
+  int l,lmax=gen->lmax;
+  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+
+  Tb corfac;
+  getCorfac(scale,&corfac,gen->cf);
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = TballGe(scale,sharp_minscale);
+  while (!full_ieee)
+    {
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+      vfmaeq(p1->r.v[i],tmp,ar);
+      vfmaeq(p1->i.v[i],tmp,ai);
+      }
+    }
+    if (++l>lmax) break;
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+      vfmaeq(p2->r.v[i],tmp,ar);
+      vfmaeq(p2->i.v[i],tmp,ai);
+      }
+    }
+    if (++l>lmax) break;
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (rescale(&lam_1,&lam_2,&scale))
+      {
+      getCorfac(scale,&corfac,gen->cf);
+      full_ieee = TballGe(scale,sharp_minscale);
+      }
+    }
+  if (l>lmax) return;
+
+  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
+  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  }
+
+NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
+  const Tbri * restrict p2, Tv *restrict atmp)
+  {
+  int lmax=gen->lmax;
+  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
+  int l=gen->m;
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  Tb corfac;
+  getCorfac(scale,&corfac,gen->cf);
+  int full_ieee = TballGe(scale,sharp_minscale);
+  while (!full_ieee)
+    {
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_2.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p1->r.v[i];
+      atmp[2*l+1]+=tmp*p1->i.v[i];
+      }
+    if (++l>lmax) return;
+    for (int i=0; i<nvec; ++i)
+      {
+      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l-1].f[1])*lam_1.v[i];
+      Tv tmp=lam_1.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p2->r.v[i];
+      atmp[2*l+1]+=tmp*p2->i.v[i];
+      }
+    if (++l>lmax) return;
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l-1].f[1])*lam_2.v[i];
+    if (rescale(&lam_1,&lam_2,&scale))
+      {
+      getCorfac(scale,&corfac,gen->cf);
+      full_ieee = TballGe(scale,sharp_minscale);
+      }
+    }
+
+  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
+  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
+  }
+
+
+#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
+
+NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_ALM2MAP:
+    case SHARP_ALM2MAP_DERIV1:
+      {
+      if (job->spin==0)
+        {
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Tburi p1,p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
+
+          int skip=1;
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            }
+          if (!skip)
+            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b);
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim)
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
+                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              job->phase[phas_idx] = r1+r2;
+              if (ispair[itot])
+                job->phase[phas_idx+1] = r1-r2;
+              }
+            }
+          }
+        }
+      else
+        {
+        UTIL_FAIL("only spin==0 allowed at the moment");
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_MAP2ALM:
+      {
+      if (job->spin==0)
+        {
+        Tv atmp[2*(gen->lmax+2)];
+        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Tburi p1, p2; VZERO(p1); VZERO(p2);
+          Tbu cth, sth;
+          int skip=1;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
+              {
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx ph1=job->phase[phas_idx];
+              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
+              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
+              }
+            }
+          if (!skip)
+            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
+          }
+        {
+        int istart=m, istop=gen->lmax+1;
+        for(; istart<istop-2; istart+=2)
+          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
+        for(; istart<istop; istart++)
+          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
+        }
+        }
+      else
+        {
+        UTIL_FAIL("only spin==0 allowed at the moment");
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+void inner_loop (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  {
+  (job->type==SHARP_MAP2ALM) ?
+    inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
+    inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
+  }
+
+#undef VZERO
+#undef nvec
+
 int sharp_veclen(void)
   {
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-  if (have_avx())
-    return 4;
-  else
-#endif
-    return VLEN;
+  return VLEN;
+  }
+
+int sharp_max_nvec(void)
+  {
+  return 128/VLEN;
   }
diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h
index f641125..a9e509b 100644
--- a/libsharp/sharp_core.h
+++ b/libsharp/sharp_core.h
@@ -44,6 +44,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const int *mlim);
 
 int sharp_veclen(void);
+int sharp_max_nvec(void);
 
 #ifdef __cplusplus
 }
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
deleted file mode 100644
index 79f1e79..0000000
--- a/libsharp/sharp_core_avx.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-// if we arrive here, we can benefit from an additional AVX version
-// #warning entering gcc and x86_64 specific code branch
-
-#define ARCH _avx
-#pragma GCC target("avx")
-#include "sharp_core_inc0.c"
-#undef ARCH
-
-#endif
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
deleted file mode 100644
index dd3ac4b..0000000
--- a/libsharp/sharp_core_inc.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core_inc.c
- *  Type-dependent code for the computational core
- *
- *  Copyright (C) 2012-2017 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-typedef struct
-  { Tv v[nvec]; } Tb;
-
-typedef union
-  { Tb b; double s[VLEN*nvec]; } Tbu;
-
-typedef struct
-  { Tb r, i; } Tbri;
-
-typedef struct
-  { Tb qr, qi, ur, ui; } Tbqu;
-
-typedef struct
-  { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
-
-typedef struct
-  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu;
-
-typedef union
-  { Tbri b; Tsri s; } Tburi;
-
-typedef union
-  { Tbqu b; Tsqu s; } Tbuqu;
-
-static inline Tb Tbconst(double val)
-  {
-  Tv v=vload(val);
-  Tb res;
-  for (int i=0; i<nvec; ++i) res.v[i]=v;
-  return res;
-  }
-
-static inline void Tbmuleq1(Tb * restrict a, double b)
-  { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
-
-static inline Tb Tbprod(Tb a, Tb b)
-  { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
-
-static inline void Tbmuleq(Tb * restrict a, Tb b)
-  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
-
-static void Tbnormalize (Tb * restrict val, Tb * restrict scale,
-  double maxval)
-  {
-  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
-  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
-  for (int i=0;i<nvec; ++i)
-    {
-    Tm mask = vgt(vabs(val->v[i]),vfmax);
-    while (vanyTrue(mask))
-      {
-      vmuleq_mask(mask,val->v[i],vfsmall);
-      vaddeq_mask(mask,scale->v[i],vone);
-      mask = vgt(vabs(val->v[i]),vfmax);
-      }
-    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
-    while (vanyTrue(mask))
-      {
-      vmuleq_mask(mask,val->v[i],vfbig);
-      vsubeq_mask(mask,scale->v[i],vone);
-      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
-      }
-    }
-  }
-
-NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
-  Tb * restrict resd, Tb * restrict ress)
-  {
-  Tv vminv=vload(powlimit[npow]);
-  Tm mask = vlt(vabs(val.v[0]),vminv);
-  for (int i=1;i<nvec; ++i)
-    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
-  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
-    {
-    Tb res=Tbconst(1.);
-    do
-      {
-      if (npow&1)
-        for (int i=0; i<nvec; ++i)
-          {
-          vmuleq(res.v[i],val.v[i]);
-          vmuleq(val.v[i],val.v[i]);
-          }
-      else
-        for (int i=0; i<nvec; ++i)
-          vmuleq(val.v[i],val.v[i]);
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=Tbconst(0.);
-    }
-  else
-    {
-    Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.);
-    Tbnormalize(&val,&scaleint,sharp_fbighalf);
-    do
-      {
-      if (npow&1)
-        {
-        for (int i=0; i<nvec; ++i)
-          {
-          vmuleq(res.v[i],val.v[i]);
-          vaddeq(scale.v[i],scaleint.v[i]);
-          }
-        Tbnormalize(&res,&scale,sharp_fbighalf);
-        }
-      for (int i=0; i<nvec; ++i)
-        {
-        vmuleq(val.v[i],val.v[i]);
-        vaddeq(scaleint.v[i],scaleint.v[i]);
-        }
-      Tbnormalize(&val,&scaleint,sharp_fbighalf);
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=scale;
-    }
-  }
-
-static inline int rescale(Tb * restrict lam1, Tb * restrict lam2,
-  Tb * restrict scale)
-  {
-  int did_scale=0;
-  for (int i=0;i<nvec; ++i)
-    {
-    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
-    if (vanyTrue(mask))
-      {
-      did_scale=1;
-      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
-      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
-      vaddeq_mask(mask,scale->v[i],vone);
-      }
-    }
-  return did_scale;
-  }
-
-static inline int TballLt(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vlt(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vlt(a.v[i],vb));
-  return vallTrue(res);
-  }
-static inline int TballGt(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vgt(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vgt(a.v[i],vb));
-  return vallTrue(res);
-  }
-static inline int TballGe(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vge(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vge(a.v[i],vb));
-  return vallTrue(res);
-  }
-
-static void getCorfac(Tb scale, Tb * restrict corfac,
-  const double * restrict cf)
-  {
-  Tbu sc, corf;
-  sc.b=scale;
-  for (int i=0; i<VLEN*nvec; ++i)
-    corf.s[i] = (sc.s[i]<sharp_minscale) ?
-      0. : cf[(int)(sc.s[i])-sharp_minscale];
-  *corfac=corf.b;
-  }
-
-NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
-  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
-  const sharp_Ylmgen_C * restrict gen)
-  {
-  int l=gen->m;
-  Tb lam_1=Tbconst(0.), lam_2, scale;
-  mypow(sth,l,gen->powlimit,&lam_2,&scale);
-  Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tbnormalize(&lam_2,&scale,sharp_ftol);
-
-  int below_limit = TballLt(scale,sharp_limscale);
-  while (below_limit)
-    {
-    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(gen->rf[l].f[1])*lam_1.v[i];
-      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
-      }
-    if (rescale(&lam_1,&lam_2,&scale))
-      below_limit = TballLt(scale,sharp_limscale);
-    l+=2;
-    }
-  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
-  }
-
-
-NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
-  Tbri * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax)
-  {
-  while (l<=lmax)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
-      p1->r.v[i] += lam_2.v[i]*ar1;
-      p1->i.v[i] += lam_2.v[i]*ai1;
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
-      p2->r.v[i] += lam_1.v[i]*ar2;
-      p2->i.v[i] += lam_1.v[i]*ai2;
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void map2alm_kernel (const Tb cth,
-  const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
-  {
-  while (l<=lmax)
-    {
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
-      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
-      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
-      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
-      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
-  Tbri * restrict p2)
-  {
-  int l,lmax=gen->lmax;
-  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
-
-  Tb corfac;
-  getCorfac(scale,&corfac,gen->cf);
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = TballGe(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-      vfmaeq(p1->r.v[i],tmp,ar);
-      vfmaeq(p1->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-      vfmaeq(p2->r.v[i],tmp,ar);
-      vfmaeq(p2->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (rescale(&lam_1,&lam_2,&scale))
-      {
-      getCorfac(scale,&corfac,gen->cf);
-      full_ieee = TballGe(scale,sharp_minscale);
-      }
-    }
-  if (l>lmax) return;
-
-  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
-  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
-  }
-
-NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
-  const Tbri * restrict p2, Tv *restrict atmp)
-  {
-  int lmax=gen->lmax;
-  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
-  int l=gen->m;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  Tb corfac;
-  getCorfac(scale,&corfac,gen->cf);
-  int full_ieee = TballGe(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=lam_2.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p1->r.v[i];
-      atmp[2*l+1]+=tmp*p1->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l-1].f[1])*lam_1.v[i];
-      Tv tmp=lam_1.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p2->r.v[i];
-      atmp[2*l+1]+=tmp*p2->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l-1].f[1])*lam_2.v[i];
-    if (rescale(&lam_1,&lam_2,&scale))
-      {
-      getCorfac(scale,&corfac,gen->cf);
-      full_ieee = TballGe(scale,sharp_minscale);
-      }
-    }
-
-  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
-  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
-  }
-
-
-#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-
-NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_ALM2MAP:
-    case SHARP_ALM2MAP_DERIV1:
-      {
-      if (job->spin==0)
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Tburi p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
-
-          int skip=1;
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
-                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
-              job->phase[phas_idx] = r1+r2;
-              if (ispair[itot])
-                job->phase[phas_idx+1] = r1-r2;
-              }
-            }
-          }
-        }
-      else
-        {
-        UTIL_FAIL("only spin==0 allowed at the moment");
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
-  }
-
-NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_MAP2ALM:
-      {
-      if (job->spin==0)
-        {
-        Tv atmp[2*(gen->lmax+2)];
-        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Tburi p1, p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
-              {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
-              dcmplx ph1=job->phase[phas_idx];
-              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
-              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
-              }
-            }
-          if (!skip)
-            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
-          }
-        {
-        int istart=m, istop=gen->lmax+1;
-        for(; istart<istop-2; istart+=2)
-          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
-        for(; istart<istop; istart++)
-          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
-        }
-        }
-      else
-        {
-        UTIL_FAIL("only spin==0 allowed at the moment");
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
-  }
-
-static void inner_loop_ (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  (job->type==SHARP_MAP2ALM) ?
-    inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
-    inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
-  }
-
-#undef VZERO
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
deleted file mode 100644
index b209cae..0000000
--- a/libsharp/sharp_core_inc0.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core_inc0.c
- *  Computational core
- *
- *  Copyright (C) 2012-2018 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <complex.h>
-#include <math.h>
-#include <string.h>
-#include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
-#include "sharp.h"
-#include "sharp_core.h"
-#include "c_utils.h"
-
-typedef complex double dcmplx;
-
-#define XCONCATX(a,b) a##b
-#define CONCATX(a,b) XCONCATX(a,b)
-
-#define nvec 6
-#define Y(arg) arg
-#include "sharp_core_inc.c"
-
-#undef Y
-#undef nvec
-
-void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim)
-  {
-  inner_loop_(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-  }
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 26b9f92..4b124af 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -44,7 +44,6 @@
 #include "c_utils.h"
 #include "sharp_announce.h"
 #include "memusage.h"
-#include "sharp_vecsupport.h"
 
 typedef complex double dcmplx;
 
@@ -597,7 +596,7 @@ static void sharp_test (int argc, const char **argv)
   if (mytask==0)
     printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
            " %9.2f %6.2f %.2e %.2e\n",
-      getenv("HOST"),argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
+      getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2,
       t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
       100.*(1.-iosize/tmem),maxerel,maxeabs);
 

From cec84e5853bcc37eb2729735c1efa2fc2b713d97 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 12 Dec 2018 19:29:30 +0100
Subject: [PATCH 18/85] tweaks

---
 libsharp/sharp_core.c | 288 ++++++++++++++++++++----------------------
 1 file changed, 136 insertions(+), 152 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7038b38..eded1ef 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -40,7 +40,11 @@
 
 typedef complex double dcmplx;
 
-#define nvec (128/VLEN)
+#define nvec (256/VLEN)
+
+typedef union
+  { Tv v; double s[VLEN]; } Tvu;
+
 typedef struct
   { Tv v[nvec]; } Tb;
 
@@ -82,27 +86,24 @@ static inline Tb Tbprod(Tb a, Tb b)
 static inline void Tbmuleq(Tb * restrict a, Tb b)
   { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
 
-static void Tbnormalize (Tb * restrict val, Tb * restrict scale,
+static void Tbnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
   const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
-  for (int i=0;i<nvec; ++i)
+  Tm mask = vgt(vabs(*val),vfmax);
+  while (vanyTrue(mask))
     {
-    Tm mask = vgt(vabs(val->v[i]),vfmax);
-    while (vanyTrue(mask))
-      {
-      vmuleq_mask(mask,val->v[i],vfsmall);
-      vaddeq_mask(mask,scale->v[i],vone);
-      mask = vgt(vabs(val->v[i]),vfmax);
-      }
-    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
-    while (vanyTrue(mask))
-      {
-      vmuleq_mask(mask,val->v[i],vfbig);
-      vsubeq_mask(mask,scale->v[i],vone);
-      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
-      }
+    vmuleq_mask(mask,*val,vfsmall);
+    vaddeq_mask(mask,*scale,vone);
+    mask = vgt(vabs(*val),vfmax);
+    }
+  mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
+  while (vanyTrue(mask))
+    {
+    vmuleq_mask(mask,*val,vfbig);
+    vsubeq_mask(mask,*scale,vone);
+    mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
     }
   }
 
@@ -110,72 +111,46 @@ NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
   Tb * restrict resd, Tb * restrict ress)
   {
   Tv vminv=vload(powlimit[npow]);
-  Tm mask = vlt(vabs(val.v[0]),vminv);
-  for (int i=1;i<nvec; ++i)
-    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
-  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
-    {
-    Tb res=Tbconst(1.);
-    do
-      {
-      if (npow&1)
-        for (int i=0; i<nvec; ++i)
-          {
-          vmuleq(res.v[i],val.v[i]);
-          vmuleq(val.v[i],val.v[i]);
-          }
-      else
-        for (int i=0; i<nvec; ++i)
-          vmuleq(val.v[i],val.v[i]);
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=Tbconst(0.);
-    }
-  else
-    {
-    Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.);
-    Tbnormalize(&val,&scaleint,sharp_fbighalf);
-    do
-      {
-      if (npow&1)
-        {
-        for (int i=0; i<nvec; ++i)
-          {
-          vmuleq(res.v[i],val.v[i]);
-          vaddeq(scale.v[i],scaleint.v[i]);
-          }
-        Tbnormalize(&res,&scale,sharp_fbighalf);
-        }
-      for (int i=0; i<nvec; ++i)
-        {
-        vmuleq(val.v[i],val.v[i]);
-        vaddeq(scaleint.v[i],scaleint.v[i]);
-        }
-      Tbnormalize(&val,&scaleint,sharp_fbighalf);
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=scale;
-    }
-  }
-
-static inline int rescale(Tb * restrict lam1, Tb * restrict lam2,
-  Tb * restrict scale)
-  {
-  int did_scale=0;
+  int npsave=npow;
   for (int i=0;i<nvec; ++i)
     {
-    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
-    if (vanyTrue(mask))
+    npow=npsave;
+    Tv res=vone;
+    Tm mask = vlt(vabs(val.v[i]),vminv);
+    if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
       {
-      did_scale=1;
-      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
-      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
-      vaddeq_mask(mask,scale->v[i],vone);
+      Tv res=vone;
+      do
+        {
+        if (npow&1)
+          vmuleq(res,val.v[i]);
+        vmuleq(val.v[i],val.v[i]);
+        }
+      while(npow>>=1);
+      resd->v[i]=res;
+      ress->v[i]=vzero;
+      }
+    else
+      {
+      Tv scale=vzero, scaleint=vzero, res=vone;
+      Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf);
+      do
+        {
+        if (npow&1)
+          {
+          vmuleq(res,val.v[i]);
+          vaddeq(scale,scaleint);
+          Tbnormalize(&res,&scale,sharp_fbighalf);
+          }
+        vmuleq(val.v[i],val.v[i]);
+        vaddeq(scaleint,scaleint);
+        Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf);
+        }
+      while(npow>>=1);
+      resd->v[i]=res;
+      ress->v[i]=scale;
       }
     }
-  return did_scale;
   }
 
 static inline int TballLt(Tb a,double b)
@@ -215,31 +190,40 @@ static void getCorfac(Tb scale, Tb * restrict corfac,
   }
 
 NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
-  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
+  Tb * restrict lam_1, Tb * restrict lam_2, Tb * restrict scale,
   const sharp_Ylmgen_C * restrict gen)
   {
   int l=gen->m;
-  Tb lam_1=Tbconst(0.), lam_2, scale;
-  mypow(sth,l,gen->powlimit,&lam_2,&scale);
-  Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tbnormalize(&lam_2,&scale,sharp_ftol);
+  for (int i=0; i<nvec; ++i) lam_1->v[i]=vzero;
+  mypow(sth,l,gen->powlimit,lam_2,scale);
+  Tbmuleq1(lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
+  for (int i=0; i<nvec; ++i)
+    Tbnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
+  Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale);
 
-  int below_limit = TballLt(scale,sharp_limscale);
+  int below_limit = TballLt(*scale,sharp_limscale);
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    below_limit=1;
+    Tv r10=vload(gen->rf[l  ].f[0]), r11=vload(gen->rf[l  ].f[1]),
+       r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
-      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(gen->rf[l].f[1])*lam_1.v[i];
-      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
+      lam_1->v[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i];
+      lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i];
+      Tm mask = vgt(vabs(lam_2->v[i]),vload(sharp_ftol));
+      if (vanyTrue(mask))
+        {
+        vmuleq_mask(mask,lam_1->v[i],fsmall);
+        vmuleq_mask(mask,lam_2->v[i],fsmall);
+        vaddeq_mask(mask,scale->v[i],vone);
+        below_limit &= vallTrue(vlt(scale->v[i],limscale));
+        }
       }
-    if (rescale(&lam_1,&lam_2,&scale))
-      below_limit = TballLt(scale,sharp_limscale);
     l+=2;
     }
-  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
+  *l_=l;
   }
 
 
@@ -256,12 +240,12 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
-      p1->r.v[i] += lam_2.v[i]*ar1;
-      p1->i.v[i] += lam_2.v[i]*ai1;
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
-      p2->r.v[i] += lam_1.v[i]*ar2;
-      p2->i.v[i] += lam_1.v[i]*ai2;
+      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
+      vfmaeq(p1->r.v[i],lam_2.v[i],ar1);
+      vfmaeq(p1->i.v[i],lam_2.v[i],ai1);
+      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
+      vfmaeq(p2->r.v[i],lam_1.v[i],ar2);
+      vfmaeq(p2->i.v[i],lam_1.v[i],ai2);
       }
     l+=2;
     }
@@ -277,10 +261,10 @@ NOINLINE static void map2alm_kernel (const Tb cth,
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
-      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
       vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
       vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
       vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
       vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
       }
@@ -304,39 +288,37 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee = TballGe(scale,sharp_minscale);
-  while (!full_ieee)
+  while((!full_ieee) && (l<=lmax))
     {
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    full_ieee=1;
     for (int i=0; i<nvec; ++i)
       {
-      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-      vfmaeq(p1->r.v[i],tmp,ar);
-      vfmaeq(p1->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-      vfmaeq(p2->r.v[i],tmp,ar);
-      vfmaeq(p2->i.v[i],tmp,ai);
-      }
-    }
-    if (++l>lmax) break;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (rescale(&lam_1,&lam_2,&scale))
-      {
-      getCorfac(scale,&corfac,gen->cf);
-      full_ieee = TballGe(scale,sharp_minscale);
+      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
+      vfmaeq(p1->r.v[i],lam_2.v[i]*corfac.v[i],ar1);
+      vfmaeq(p1->i.v[i],lam_2.v[i]*corfac.v[i],ai1);
+      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
+      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
+      if (vanyTrue(mask))
+        {
+        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
+        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
+        vaddeq_mask(mask,scale.v[i],vone);
+        Tvu sc, corf;
+        sc.v=scale.v[i];
+        for (int j=0; j<VLEN; ++j)
+          corf.s[j] = (sc.s[j]<sharp_minscale) ?
+            0. : gen->cf[(int)(sc.s[j])-sharp_minscale];
+        corfac.v[i]=corf.v;
+        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        }
+      vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2);
+      vfmaeq(p2->i.v[i],lam_1.v[i]*corfac.v[i],ai2);
       }
+    l+=2;
     }
   if (l>lmax) return;
 
@@ -360,32 +342,35 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   Tb corfac;
   getCorfac(scale,&corfac,gen->cf);
   int full_ieee = TballGe(scale,sharp_minscale);
-  while (!full_ieee)
+  while ((!full_ieee) && (l<=lmax))
     {
+    full_ieee=1;
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nvec; ++i)
       {
-      Tv tmp=lam_2.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p1->r.v[i];
-      atmp[2*l+1]+=tmp*p1->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      {
-      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l-1].f[1])*lam_1.v[i];
-      Tv tmp=lam_1.v[i]*corfac.v[i];
-      atmp[2*l  ]+=tmp*p2->r.v[i];
-      atmp[2*l+1]+=tmp*p2->i.v[i];
-      }
-    if (++l>lmax) return;
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l-1].f[1])*lam_2.v[i];
-    if (rescale(&lam_1,&lam_2,&scale))
-      {
-      getCorfac(scale,&corfac,gen->cf);
-      full_ieee = TballGe(scale,sharp_minscale);
+      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
+      vfmaeq(atmp[2*l  ],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
+      vfmaeq(atmp[2*l+1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
+      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
+      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
+      if (vanyTrue(mask))
+        {
+        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
+        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
+        vaddeq_mask(mask,scale.v[i],vone);
+        Tvu sc, corf;
+        sc.v=scale.v[i];
+        for (int j=0; j<VLEN; ++j)
+          corf.s[j] = (sc.s[j]<sharp_minscale) ?
+            0. : gen->cf[(int)(sc.s[j])-sharp_minscale];
+        corfac.v[i]=corf.v;
+        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        }
+      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
+      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
       }
+    l+=2;
     }
 
   Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
@@ -527,7 +512,6 @@ void inner_loop (sharp_job *job, const int *ispair,
   }
 
 #undef VZERO
-#undef nvec
 
 int sharp_veclen(void)
   {
@@ -536,5 +520,5 @@ int sharp_veclen(void)
 
 int sharp_max_nvec(void)
   {
-  return 128/VLEN;
+  return nvec;
   }

From 716cd9e558a9c258b753b1bd66e4024a1a7e0816 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 12 Dec 2018 20:16:03 +0100
Subject: [PATCH 19/85] cleanup

---
 libsharp/sharp_core.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index eded1ef..eda81f4 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -253,21 +253,23 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
 
 NOINLINE static void map2alm_kernel (const Tb cth,
   const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
   {
   while (l<=lmax)
     {
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nvec; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
-      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
+      vfmaeq(atmp[0],lam_2.v[i],p1->r.v[i]);
+      vfmaeq(atmp[1],lam_2.v[i],p1->i.v[i]);
       lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
-      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
+      vfmaeq(atmp[2],lam_1.v[i],p2->r.v[i]);
+      vfmaeq(atmp[3],lam_1.v[i],p2->i.v[i]);
       }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
     }
   }
@@ -328,7 +330,7 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
 
 NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
-  const Tbri * restrict p2, Tv *restrict atmp)
+  const Tbri * restrict p2)
   {
   int lmax=gen->lmax;
   Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
@@ -339,6 +341,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  dcmplx * restrict alm=job->almtmp;
   Tb corfac;
   getCorfac(scale,&corfac,gen->cf);
   int full_ieee = TballGe(scale,sharp_minscale);
@@ -347,11 +350,12 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
     full_ieee=1;
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nvec; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[2*l  ],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
-      vfmaeq(atmp[2*l+1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
+      vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
+      vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
       lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
       Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
       if (vanyTrue(mask))
@@ -367,14 +371,15 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
         corfac.v[i]=corf.v;
         full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
         }
-      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
-      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
+      vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
+      vfmaeq(atmp[3],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
       }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
     }
 
   Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
-  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
+  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
   }
 
 
@@ -454,8 +459,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-        Tv atmp[2*(gen->lmax+2)];
-        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
           Tburi p1, p2; VZERO(p1); VZERO(p2);
@@ -478,15 +481,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
               }
             }
           if (!skip)
-            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
+            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b);
           }
-        {
-        int istart=m, istop=gen->lmax+1;
-        for(; istart<istop-2; istart+=2)
-          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
-        for(; istart<istop; istart++)
-          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
-        }
         }
       else
         {

From 7b02cb33abc2fc58bfee72efd1adb9fc71b1e27e Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 12 Dec 2018 20:45:48 +0100
Subject: [PATCH 20/85] tweaks

---
 libsharp/sharp_core.c | 162 ++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 100 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index eda81f4..f13d83d 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -54,21 +54,12 @@ typedef union
 typedef struct
   { Tb r, i; } Tbri;
 
-typedef struct
-  { Tb qr, qi, ur, ui; } Tbqu;
-
 typedef struct
   { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
 
-typedef struct
-  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu;
-
 typedef union
   { Tbri b; Tsri s; } Tburi;
 
-typedef union
-  { Tbqu b; Tsqu s; } Tbuqu;
-
 static inline Tb Tbconst(double val)
   {
   Tv v=vload(val);
@@ -80,9 +71,6 @@ static inline Tb Tbconst(double val)
 static inline void Tbmuleq1(Tb * restrict a, double b)
   { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
 
-static inline Tb Tbprod(Tb a, Tb b)
-  { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
-
 static inline void Tbmuleq(Tb * restrict a, Tb b)
   { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
 
@@ -107,86 +95,56 @@ static void Tbnormalize (Tv * restrict val, Tv * restrict scale,
     }
   }
 
-NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
-  Tb * restrict resd, Tb * restrict ress)
+static void mypow(Tv val, int npow, const double * restrict powlimit,
+  Tv * restrict resd, Tv * restrict ress)
   {
   Tv vminv=vload(powlimit[npow]);
-  int npsave=npow;
-  for (int i=0;i<nvec; ++i)
+  Tv res=vone;
+  Tm mask = vlt(vabs(val),vminv);
+  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
     {
-    npow=npsave;
     Tv res=vone;
-    Tm mask = vlt(vabs(val.v[i]),vminv);
-    if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
+    do
       {
-      Tv res=vone;
-      do
-        {
-        if (npow&1)
-          vmuleq(res,val.v[i]);
-        vmuleq(val.v[i],val.v[i]);
-        }
-      while(npow>>=1);
-      resd->v[i]=res;
-      ress->v[i]=vzero;
+      if (npow&1)
+        vmuleq(res,val);
+      vmuleq(val,val);
       }
-    else
+    while(npow>>=1);
+    *resd=res;
+    *ress=vzero;
+    }
+  else
+    {
+    Tv scale=vzero, scaleint=vzero, res=vone;
+    Tbnormalize(&val,&scaleint,sharp_fbighalf);
+    do
       {
-      Tv scale=vzero, scaleint=vzero, res=vone;
-      Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf);
-      do
+      if (npow&1)
         {
-        if (npow&1)
-          {
-          vmuleq(res,val.v[i]);
-          vaddeq(scale,scaleint);
-          Tbnormalize(&res,&scale,sharp_fbighalf);
-          }
-        vmuleq(val.v[i],val.v[i]);
-        vaddeq(scaleint,scaleint);
-        Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf);
+        vmuleq(res,val);
+        vaddeq(scale,scaleint);
+        Tbnormalize(&res,&scale,sharp_fbighalf);
         }
-      while(npow>>=1);
-      resd->v[i]=res;
-      ress->v[i]=scale;
+      vmuleq(val,val);
+      vaddeq(scaleint,scaleint);
+      Tbnormalize(&val,&scaleint,sharp_fbighalf);
       }
+    while(npow>>=1);
+    *resd=res;
+    *ress=scale;
     }
   }
 
-static inline int TballLt(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vlt(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vlt(a.v[i],vb));
-  return vallTrue(res);
-  }
-static inline int TballGt(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vgt(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vgt(a.v[i],vb));
-  return vallTrue(res);
-  }
-static inline int TballGe(Tb a,double b)
-  {
-  Tv vb=vload(b);
-  Tm res=vge(a.v[0],vb);
-  for (int i=1; i<nvec; ++i)
-    res=vand_mask(res,vge(a.v[i],vb));
-  return vallTrue(res);
-  }
-
-static void getCorfac(Tb scale, Tb * restrict corfac,
+static void getCorfac(Tv scale, Tv * restrict corfac,
   const double * restrict cf)
   {
-  Tbu sc, corf;
-  sc.b=scale;
-  for (int i=0; i<VLEN*nvec; ++i)
+  Tvu sc, corf;
+  sc.v=scale;
+  for (int i=0; i<VLEN; ++i)
     corf.s[i] = (sc.s[i]<sharp_minscale) ?
       0. : cf[(int)(sc.s[i])-sharp_minscale];
-  *corfac=corf.b;
+  *corfac=corf.v;
   }
 
 NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
@@ -194,14 +152,18 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   const sharp_Ylmgen_C * restrict gen)
   {
   int l=gen->m;
-  for (int i=0; i<nvec; ++i) lam_1->v[i]=vzero;
-  mypow(sth,l,gen->powlimit,lam_2,scale);
-  Tbmuleq1(lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  for (int i=0; i<nvec; ++i)
-    Tbnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
+  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
   Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale);
+  int below_limit = 1;
+  for (int i=0; i<nvec; ++i)
+    {
+    lam_1->v[i]=vzero;
+    mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]);
+    lam_2->v[i] *= mfac;
+    Tbnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(scale->v[i],limscale));
+    }
 
-  int below_limit = TballLt(*scale,sharp_limscale);
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@@ -279,17 +241,22 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
   Tbri * restrict p2)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
+  Tb lam_1,lam_2,scale;
   iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
 
-  Tb corfac;
-  getCorfac(scale,&corfac,gen->cf);
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = TballGe(scale,sharp_minscale);
+  int full_ieee=1;
+  Tb corfac;
+  for (int i=0; i<nvec; ++i)
+    {
+    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
+    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    }
+
   while((!full_ieee) && (l<=lmax))
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
@@ -309,12 +276,7 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
         vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
         vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
         vaddeq_mask(mask,scale.v[i],vone);
-        Tvu sc, corf;
-        sc.v=scale.v[i];
-        for (int j=0; j<VLEN; ++j)
-          corf.s[j] = (sc.s[j]<sharp_minscale) ?
-            0. : gen->cf[(int)(sc.s[j])-sharp_minscale];
-        corfac.v[i]=corf.v;
+        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
         full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
         }
       vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2);
@@ -333,7 +295,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   const Tbri * restrict p2)
   {
   int lmax=gen->lmax;
-  Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
+  Tb lam_1,lam_2,scale;
   int l=gen->m;
   iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
@@ -342,9 +304,14 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
   Tb corfac;
-  getCorfac(scale,&corfac,gen->cf);
-  int full_ieee = TballGe(scale,sharp_minscale);
+  for (int i=0; i<nvec; ++i)
+    {
+    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
+    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    }
+
   while ((!full_ieee) && (l<=lmax))
     {
     full_ieee=1;
@@ -363,12 +330,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
         vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
         vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
         vaddeq_mask(mask,scale.v[i],vone);
-        Tvu sc, corf;
-        sc.v=scale.v[i];
-        for (int j=0; j<VLEN; ++j)
-          corf.s[j] = (sc.s[j]<sharp_minscale) ?
-            0. : gen->cf[(int)(sc.s[j])-sharp_minscale];
-        corfac.v[i]=corf.v;
+        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
         full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
         }
       vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);

From 88a78b2fcb2b7d144cef0cf43ca0306e20e4606b Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Dec 2018 10:30:02 +0100
Subject: [PATCH 21/85] tweaks

---
 libsharp/sharp_core.c | 156 ++++++++++++++++++++++--------------------
 1 file changed, 80 insertions(+), 76 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f13d83d..cfdaab5 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -60,21 +60,7 @@ typedef struct
 typedef union
   { Tbri b; Tsri s; } Tburi;
 
-static inline Tb Tbconst(double val)
-  {
-  Tv v=vload(val);
-  Tb res;
-  for (int i=0; i<nvec; ++i) res.v[i]=v;
-  return res;
-  }
-
-static inline void Tbmuleq1(Tb * restrict a, double b)
-  { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
-
-static inline void Tbmuleq(Tb * restrict a, Tb b)
-  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
-
-static void Tbnormalize (Tv * restrict val, Tv * restrict scale,
+static void Tvnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
@@ -117,18 +103,18 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
   else
     {
     Tv scale=vzero, scaleint=vzero, res=vone;
-    Tbnormalize(&val,&scaleint,sharp_fbighalf);
+    Tvnormalize(&val,&scaleint,sharp_fbighalf);
     do
       {
       if (npow&1)
         {
         vmuleq(res,val);
         vaddeq(scale,scaleint);
-        Tbnormalize(&res,&scale,sharp_fbighalf);
+        Tvnormalize(&res,&scale,sharp_fbighalf);
         }
       vmuleq(val,val);
       vaddeq(scaleint,scaleint);
-      Tbnormalize(&val,&scaleint,sharp_fbighalf);
+      Tvnormalize(&val,&scaleint,sharp_fbighalf);
       }
     while(npow>>=1);
     *resd=res;
@@ -149,18 +135,18 @@ static void getCorfac(Tv scale, Tv * restrict corfac,
 
 NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   Tb * restrict lam_1, Tb * restrict lam_2, Tb * restrict scale,
-  const sharp_Ylmgen_C * restrict gen)
+  const sharp_Ylmgen_C * restrict gen, int nv2)
   {
   int l=gen->m;
   Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
   Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale);
   int below_limit = 1;
-  for (int i=0; i<nvec; ++i)
+  for (int i=0; i<nv2; ++i)
     {
     lam_1->v[i]=vzero;
     mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]);
     lam_2->v[i] *= mfac;
-    Tbnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
+    Tvnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
     below_limit &= vallTrue(vlt(scale->v[i],limscale));
     }
 
@@ -170,7 +156,7 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
     below_limit=1;
     Tv r10=vload(gen->rf[l  ].f[0]), r11=vload(gen->rf[l  ].f[1]),
        r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       lam_1->v[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i];
       lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i];
@@ -192,7 +178,7 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
 NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
   Tbri * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax)
+  int l, int lmax, int nv2)
   {
   while (l<=lmax)
     {
@@ -200,7 +186,7 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
       vfmaeq(p1->r.v[i],lam_2.v[i],ar1);
@@ -215,14 +201,15 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
 
 NOINLINE static void map2alm_kernel (const Tb cth,
   const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
+  int lmax, int nv2)
   {
   while (l<=lmax)
     {
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
       vfmaeq(atmp[0],lam_2.v[i],p1->r.v[i]);
@@ -238,20 +225,21 @@ NOINLINE static void map2alm_kernel (const Tb cth,
 
 NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
-  Tbri * restrict p2)
+  Tbri * restrict p2, int nth)
   {
   int l,lmax=gen->lmax;
   Tb lam_1,lam_2,scale;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*nth;
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   Tb corfac;
-  for (int i=0; i<nvec; ++i)
+  for (int i=0; i<nv2; ++i)
     {
     getCorfac(scale.v[i], &corfac.v[i], gen->cf);
     full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
@@ -264,7 +252,7 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     full_ieee=1;
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
       vfmaeq(p1->r.v[i],lam_2.v[i]*corfac.v[i],ar1);
@@ -286,27 +274,32 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
     }
   if (l>lmax) return;
 
-  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
-  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  for (int i=0; i<nv2; ++i)
+    {
+    lam_1.v[i] *= corfac.v[i];
+    lam_2.v[i] *= corfac.v[i];
+    }
+  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
   }
 
 NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
-  const Tbri * restrict p2)
+  const Tbri * restrict p2, int nth)
   {
   int lmax=gen->lmax;
   Tb lam_1,lam_2,scale;
   int l=gen->m;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*nth;
 
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   Tb corfac;
-  for (int i=0; i<nvec; ++i)
+  for (int i=0; i<nv2; ++i)
     {
     getCorfac(scale.v[i], &corfac.v[i], gen->cf);
     full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
@@ -318,7 +311,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
     Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
       vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
@@ -340,8 +333,12 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
     l+=2;
     }
 
-  Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
-  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  for (int i=0; i<nv2; ++i)
+    {
+    lam_1.v[i] *= corfac.v[i];
+    lam_2.v[i] *= corfac.v[i];
+    }
+  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
   }
 
 
@@ -362,32 +359,40 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
+        int ith=0;
+        int itgt[nvec*VLEN];
+        while (ith<ulim-llim)
           {
           Tburi p1,p2; VZERO(p1); VZERO(p2);
           Tbu cth, sth;
-
-          int skip=1;
-          for (int i=0; i<nval; ++i)
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
             {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
+            if (mlim[ith]>=m)
               {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
+              itgt[nth] = ith;
+              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              ++nth;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              cth.s[i]=cth.s[nth-1];
+              sth.s[i]=sth.s[nth-1];
+              }
+            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b,nth);
+            for (int i=0; i<nth; ++i)
+              {
+              int tgt=itgt[i];
+              int phas_idx = tgt*job->s_th + mi*job->s_m;
               complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
                              r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
               job->phase[phas_idx] = r1+r2;
-              if (ispair[itot])
+              if (ispair[tgt])
                 job->phase[phas_idx+1] = r1-r2;
               }
             }
@@ -421,29 +426,28 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
+        int ith=0;
+        while (ith<ulim-llim)
           {
-          Tburi p1, p2; VZERO(p1); VZERO(p2);
+          Tburi p1,p2; VZERO(p1); VZERO(p2);
           Tbu cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
             {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
+            if (mlim[ith]>=m)
               {
-              int phas_idx = itot*job->s_th + mi*job->s_m;
+              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              int phas_idx = ith*job->s_th + mi*job->s_m;
               dcmplx ph1=job->phase[phas_idx];
-              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
-              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
+              dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
+              p1.s.r[nth]=creal(ph1+ph2); p1.s.i[nth]=cimag(ph1+ph2);
+              p2.s.r[nth]=creal(ph1-ph2); p2.s.i[nth]=cimag(ph1-ph2);
+              ++nth;
               }
+            ++ith;
             }
-          if (!skip)
-            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b);
+          if (nth>0)
+            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, nth);
           }
         }
       else

From 5057843daca9e61a7ca4a0ab90cec3e3fef8ef54 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Dec 2018 10:42:07 +0100
Subject: [PATCH 22/85] cleanup

---
 libsharp/sharp_core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index cfdaab5..4ecc132 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -447,7 +447,16 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
             ++ith;
             }
           if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              cth.s[i]=cth.s[nth-1];
+              sth.s[i]=sth.s[nth-1];
+              p1.s.r[i]=p1.s.i[i]=p2.s.r[i]=p2.s.i[i]=0.;
+              }
             calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, nth);
+            }
           }
         }
       else

From e88160e8d9a08581e24b100697165b850bd65a8a Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Dec 2018 10:54:27 +0100
Subject: [PATCH 23/85] fixes

---
 libsharp/sharp_core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 4ecc132..34802de 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -374,6 +374,11 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
               cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
               ++nth;
               }
+            else
+              {
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              job->phase[phas_idx] = job->phase[phas_idx+1] = 0;
+              }
             ++ith;
             }
           if (nth>0)

From b5c6cff430352108ec3f92539068c90306f8fbc8 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Dec 2018 10:57:04 +0100
Subject: [PATCH 24/85] fixes

---
 libsharp/sharp_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 34802de..57a3ecd 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core.c
  *  Computational core
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 

From 12d8d9b9da0cbabfa738e3514226d2a9424f360c Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 14 Dec 2018 12:48:57 +0100
Subject: [PATCH 25/85] beginning of spin>0; still disabled

---
 libsharp/sharp_core.c | 155 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 21 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 57a3ecd..41e1fd8 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -60,7 +60,7 @@ typedef struct
 typedef union
   { Tbri b; Tsri s; } Tburi;
 
-static void Tvnormalize (Tv * restrict val, Tv * restrict scale,
+static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
@@ -85,7 +85,6 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
   Tv * restrict resd, Tv * restrict ress)
   {
   Tv vminv=vload(powlimit[npow]);
-  Tv res=vone;
   Tm mask = vlt(vabs(val),vminv);
   if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
     {
@@ -122,7 +121,7 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
     }
   }
 
-static void getCorfac(Tv scale, Tv * restrict corfac,
+static inline void getCorfac(Tv scale, Tv * restrict corfac,
   const double * restrict cf)
   {
   Tvu sc, corf;
@@ -133,13 +132,26 @@ static void getCorfac(Tv scale, Tv * restrict corfac,
   *corfac=corf.v;
   }
 
+static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
+  {
+  Tm mask = vgt(vabs(*v2),eps);
+  if (vanyTrue(mask))
+    {
+    vmuleq_mask(mask,*v1,vload(sharp_fsmall));
+    vmuleq_mask(mask,*v2,vload(sharp_fsmall));
+    vaddeq_mask(mask,*s,vone);
+    return 1;
+    }
+  return 0;
+  }
+
 NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   Tb * restrict lam_1, Tb * restrict lam_2, Tb * restrict scale,
   const sharp_Ylmgen_C * restrict gen, int nv2)
   {
   int l=gen->m;
   Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale);
+  Tv limscale=vload(sharp_limscale);
   int below_limit = 1;
   for (int i=0; i<nv2; ++i)
     {
@@ -160,20 +172,129 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
       {
       lam_1->v[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i];
       lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i];
-      Tm mask = vgt(vabs(lam_2->v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
-        {
-        vmuleq_mask(mask,lam_1->v[i],fsmall);
-        vmuleq_mask(mask,lam_2->v[i],fsmall);
-        vaddeq_mask(mask,scale->v[i],vone);
+      if (rescale(&lam_1->v[i], &lam_2->v[i], &scale->v[i], vload(sharp_ftol)))
         below_limit &= vallTrue(vlt(scale->v[i],limscale));
-        }
       }
     l+=2;
     }
   *l_=l;
   }
 
+#if 1
+static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
+  Tv * restrict ryp, Tv * restrict rym, const Tv cth,
+  const sharp_ylmgen_dbl3 fx)
+  {
+  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
+  *rxp = (cth-fx1)*fx0* *ryp - fx2* *rxp;
+  *rxm = (cth+fx1)*fx0* *rym - fx2* *rxm;
+  }
+
+NOINLINE static void iter_to_ieee_spin (const Tb cth, const Tb sth, int *l_,
+  Tb * rec1p, Tb * rec1m, Tb * rec2p, Tb * rec2m,
+  Tb * scalep, Tb * scalem, const sharp_Ylmgen_C * restrict gen, int nv2)
+  {
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+  Tv prefac=vload(gen->prefac[gen->m]),
+     prescale=vload(gen->fscale[gen->m]);
+  Tv limscale=vload(sharp_limscale);
+  int below_limit=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv cth2, sth2;
+    cth2=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
+    cth2=vmax(cth2,vload(1e-15));
+    sth2=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
+    sth2=vmax(sth2,vload(1e-15));
+    Tm mask=vlt(sth.v[i],vzero);
+    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
+    vmuleq_mask(cmask,cth2,vload(-1.));
+    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
+    vmuleq_mask(smask,sth2,vload(-1.));
+
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp.v[i],&ccps.v[i]);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp.v[i],&ssps.v[i]);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp.v[i],&csps.v[i]);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp.v[i],&scps.v[i]);
+
+    rec1p->v[i] = vzero;
+    rec1m->v[i] = vzero;
+    rec2p->v[i]=vmul(prefac,ccp.v[i]);
+    scalep->v[i]=vadd(prescale,ccps.v[i]);
+    rec2m->v[i]=vmul(prefac,csp.v[i]);
+    scalem->v[i]=vadd(prescale,csps.v[i]);
+    Tvnormalize(&rec2m->v[i],&scalem->v[i],sharp_fbighalf);
+    Tvnormalize(&rec2p->v[i],&scalep->v[i],sharp_fbighalf);
+
+    rec2p->v[i]=vmul(rec2p->v[i],ssp.v[i]);
+    scalep->v[i]=vadd(scalep->v[i],ssps.v[i]);
+    rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
+    scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
+    if (gen->preMinus_p)
+      rec2p.v[i]=vneg(rec2p.v[i]);
+    if (gen->preMinus_m)
+      rec2m.v[i]=vneg(rec2m.v[i]);
+    if (gen->s&1)
+      rec2p.v[i]=vneg(rec2p.v[i]);
+
+    Tvnormalize(&rec2m.v[i],&scalem.v[i],sharp_ftol);
+    Tvnormalize(&rec2p.v[i],&scalep.v[i],sharp_ftol);
+
+    below_limit &= vallTrue(vand_mask(vlt(scalem.v[i],limscale),vlt(scalep.v[i],limscale)));
+    }
+
+  int l=gen->mhi;
+
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    for (int i=0; i<nv2; ++i)
+      {
+      rec_step(&rec1p.v[i],&rec1m.v[i],&rec2p.v[i],&rec2m.v[i],cth.v[i],fx[l+1]);
+      rec_step(&rec2p.v[i],&rec2m.v[i],&rec1p.v[i],&rec1m.v[i],cth.v[i],fx[l+2]);
+      if (rescale(&rec1p.v[i],&rec2p.v[i],&scalep.v[i],vload(sharp_ftol)) ||
+          rescale(&rec1m.v[i],&rec2m.v[i],&scalem.v[i],vload(sharp_ftol)))
+      below_limit &= vallTrue(vlt(scalep.v[i],limscale)) &&
+                     vallTrue(vlt(scalem.v[i],limscale));
+      }
+    l+=2;
+    }
+
+  *l_=l;
+  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
+  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
+  }
+
+NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
+  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx1=v1load(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
+      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
+      }
+    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
+      &alm[2*njobs*(l+1)] NJ2);
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
+      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
+      }
+    l+=2;
+    }
+  if (l==lmax)
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
+  }
+#endif
 
 NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
   Tbri * restrict p2, Tb lam_1, Tb lam_2,
@@ -258,12 +379,8 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
       vfmaeq(p1->r.v[i],lam_2.v[i]*corfac.v[i],ar1);
       vfmaeq(p1->i.v[i],lam_2.v[i]*corfac.v[i],ai1);
       lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
+      if (rescale(&lam_1.v[i], &lam_2.v[i], &scale.v[i], vload(sharp_ftol)))
         {
-        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
-        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
-        vaddeq_mask(mask,scale.v[i],vone);
         getCorfac(scale.v[i], &corfac.v[i], gen->cf);
         full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
         }
@@ -317,12 +434,8 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
       vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
       vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
       lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
+      if (rescale(&lam_1.v[i], &lam_2.v[i], &scale.v[i], vload(sharp_ftol)))
         {
-        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
-        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
-        vaddeq_mask(mask,scale.v[i],vone);
         getCorfac(scale.v[i], &corfac.v[i], gen->cf);
         full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
         }

From 382badecb3314860fbeb70ec91102067da0cd2ed Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 14 Dec 2018 13:52:46 +0100
Subject: [PATCH 26/85] tweaks

---
 libsharp/sharp_core.c | 182 ++++++++++++++++++++++--------------------
 1 file changed, 96 insertions(+), 86 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 41e1fd8..f235976 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -60,6 +60,25 @@ typedef struct
 typedef union
   { Tbri b; Tsri s; } Tburi;
 
+typedef Tv Tbv[nvec];
+typedef double Tbs[nvec*VLEN];
+
+typedef struct
+  {
+  Tbv sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  } s0data_v;
+
+typedef struct
+  {
+  Tbs sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  } s0data_s;
+
+typedef union
+  {
+  s0data_v v;
+  s0data_s s;
+  } s0data_u;
+
 static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
@@ -145,9 +164,8 @@ static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, T
   return 0;
   }
 
-NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
-  Tb * restrict lam_1, Tb * restrict lam_2, Tb * restrict scale,
-  const sharp_Ylmgen_C * restrict gen, int nv2)
+NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
+  s0data_v * restrict d, int * restrict l_, int nv2)
   {
   int l=gen->m;
   Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
@@ -155,11 +173,11 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
   int below_limit = 1;
   for (int i=0; i<nv2; ++i)
     {
-    lam_1->v[i]=vzero;
-    mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]);
-    lam_2->v[i] *= mfac;
-    Tvnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(scale->v[i],limscale));
+    d->lam1[i]=vzero;
+    mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]);
+    d->lam2[i] *= mfac;
+    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(d->scale[i],limscale));
     }
 
   while (below_limit)
@@ -170,17 +188,17 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
        r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      lam_1->v[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i];
-      lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i];
-      if (rescale(&lam_1->v[i], &lam_2->v[i], &scale->v[i], vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(scale->v[i],limscale));
+      d->lam1[i] = r10*d->cth[i]*d->lam2[i] - r11*d->lam1[i];
+      d->lam2[i] = r20*d->cth[i]*d->lam1[i] - r21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scale[i],limscale));
       }
     l+=2;
     }
   *l_=l;
   }
 
-#if 1
+#if 0
 static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
   Tv * restrict ryp, Tv * restrict rym, const Tv cth,
   const sharp_ylmgen_dbl3 fx)
@@ -296,8 +314,7 @@ NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
   }
 #endif
 
-NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
-  Tbri * restrict p2, Tb lam_1, Tb lam_2,
+NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
@@ -309,19 +326,18 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(p1->r.v[i],lam_2.v[i],ar1);
-      vfmaeq(p1->i.v[i],lam_2.v[i],ai1);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      vfmaeq(p2->r.v[i],lam_1.v[i],ar2);
-      vfmaeq(p2->i.v[i],lam_1.v[i],ai2);
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      d->p2r[i] += d->lam1[i]*ar2;
+      d->p2i[i] += d->lam1[i]*ai2;
       }
     l+=2;
     }
   }
 
-NOINLINE static void map2alm_kernel (const Tb cth,
-  const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
+NOINLINE static void map2alm_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
   int lmax, int nv2)
   {
@@ -332,26 +348,24 @@ NOINLINE static void map2alm_kernel (const Tb cth,
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[0],lam_2.v[i],p1->r.v[i]);
-      vfmaeq(atmp[1],lam_2.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      vfmaeq(atmp[2],lam_1.v[i],p2->r.v[i]);
-      vfmaeq(atmp[3],lam_1.v[i],p2->i.v[i]);
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      atmp[2] += d->lam1[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->p2i[i];
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
     }
   }
 
-NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
-  Tbri * restrict p2, int nth)
+NOINLINE static void calc_alm2map (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
   int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  iter_to_ieee(gen, d, &l, nv2);
   job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*nth;
@@ -359,11 +373,10 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
-  Tb corfac;
   for (int i=0; i<nv2; ++i)
     {
-    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
     }
 
   while((!full_ieee) && (l<=lmax))
@@ -375,17 +388,17 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(p1->r.v[i],lam_2.v[i]*corfac.v[i],ar1);
-      vfmaeq(p1->i.v[i],lam_2.v[i]*corfac.v[i],ai1);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      if (rescale(&lam_1.v[i], &lam_2.v[i], &scale.v[i], vload(sharp_ftol)))
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
+      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
         {
-        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
         }
-      vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2);
-      vfmaeq(p2->i.v[i],lam_1.v[i]*corfac.v[i],ai2);
+      d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
       }
     l+=2;
     }
@@ -393,21 +406,19 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
 
   for (int i=0; i<nv2; ++i)
     {
-    lam_1.v[i] *= corfac.v[i];
-    lam_2.v[i] *= corfac.v[i];
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
     }
-  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
+  alm2map_kernel(d, rf, alm, l, lmax, nv2);
   }
 
-NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
-  const Tbri * restrict p2, int nth)
+NOINLINE static void calc_map2alm(sharp_job * restrict job,
+  const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth)
   {
   int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
   int l=gen->m;
   int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  iter_to_ieee(gen, d, &l, nv2);
   job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*nth;
@@ -415,11 +426,10 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
-  Tb corfac;
   for (int i=0; i<nv2; ++i)
     {
-    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
     }
 
   while ((!full_ieee) && (l<=lmax))
@@ -430,17 +440,17 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
-      vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      if (rescale(&lam_1.v[i], &lam_2.v[i], &scale.v[i], vload(sharp_ftol)))
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
         {
-        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
         }
-      vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
-      vfmaeq(atmp[3],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
+      atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
@@ -448,10 +458,10 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
 
   for (int i=0; i<nv2; ++i)
     {
-    lam_1.v[i] *= corfac.v[i];
-    lam_2.v[i] *= corfac.v[i];
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
     }
-  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
+  map2alm_kernel(d, rf, alm, l, lmax, nv2);
   }
 
 
@@ -476,15 +486,15 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         int itgt[nvec*VLEN];
         while (ith<ulim-llim)
           {
-          Tburi p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
+          s0data_u d;
+          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {
             if (mlim[ith]>=m)
               {
               itgt[nth] = ith;
-              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
               ++nth;
               }
             else
@@ -499,16 +509,16 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              cth.s[i]=cth.s[nth-1];
-              sth.s[i]=sth.s[nth-1];
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
               }
-            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b,nth);
+            calc_alm2map (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
               int tgt=itgt[i];
               int phas_idx = tgt*job->s_th + mi*job->s_m;
-              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
-                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
+                             r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
               job->phase[phas_idx] = r1+r2;
               if (ispair[tgt])
                 job->phase[phas_idx+1] = r1-r2;
@@ -547,19 +557,19 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         int ith=0;
         while (ith<ulim-llim)
           {
-          Tburi p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
+          s0data_u d;
+          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {
             if (mlim[ith]>=m)
               {
-              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
               int phas_idx = ith*job->s_th + mi*job->s_m;
               dcmplx ph1=job->phase[phas_idx];
               dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
-              p1.s.r[nth]=creal(ph1+ph2); p1.s.i[nth]=cimag(ph1+ph2);
-              p2.s.r[nth]=creal(ph1-ph2); p2.s.i[nth]=cimag(ph1-ph2);
+              d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
+              d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
               ++nth;
               }
             ++ith;
@@ -569,11 +579,11 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              cth.s[i]=cth.s[nth-1];
-              sth.s[i]=sth.s[nth-1];
-              p1.s.r[i]=p1.s.i[i]=p2.s.r[i]=p2.s.i[i]=0.;
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, nth);
+            calc_map2alm (job, gen, &d.v, nth);
             }
           }
         }

From c750162e2b9b87703f19b09e75c74d171ea83497 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 14 Dec 2018 14:58:34 +0100
Subject: [PATCH 27/85] tweaking

---
 libsharp/sharp_core.c | 154 ++++++++++++++++++++++--------------------
 1 file changed, 82 insertions(+), 72 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f235976..2f88b90 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -45,21 +45,6 @@ typedef complex double dcmplx;
 typedef union
   { Tv v; double s[VLEN]; } Tvu;
 
-typedef struct
-  { Tv v[nvec]; } Tb;
-
-typedef union
-  { Tb b; double s[VLEN*nvec]; } Tbu;
-
-typedef struct
-  { Tb r, i; } Tbri;
-
-typedef struct
-  { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
-
-typedef union
-  { Tbri b; Tsri s; } Tburi;
-
 typedef Tv Tbv[nvec];
 typedef double Tbs[nvec*VLEN];
 
@@ -79,6 +64,24 @@ typedef union
   s0data_s s;
   } s0data_u;
 
+typedef struct
+  {
+  Tbv sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+      p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_v;
+
+typedef struct
+  {
+  Tbs sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+      p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_s;
+
+typedef union
+  {
+  sxdata_v v;
+  sxdata_s s;
+  } sxdata_u;
+
 static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
@@ -208,12 +211,11 @@ static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
   *rxm = (cth+fx1)*fx0* *rym - fx2* *rxm;
   }
 
-NOINLINE static void iter_to_ieee_spin (const Tb cth, const Tb sth, int *l_,
-  Tb * rec1p, Tb * rec1m, Tb * rec2p, Tb * rec2m,
-  Tb * scalep, Tb * scalem, const sharp_Ylmgen_C * restrict gen, int nv2)
+NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
+  sxdata_v * restrict d, int * restrict l_, int nv2)
   {
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+  Tbv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
@@ -221,45 +223,45 @@ NOINLINE static void iter_to_ieee_spin (const Tb cth, const Tb sth, int *l_,
   for (int i=0; i<nv2; ++i)
     {
     Tv cth2, sth2;
-    cth2=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
+    cth2=vsqrt((vone+d->cth[i])*vload(0.5));
     cth2=vmax(cth2,vload(1e-15));
-    sth2=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
+    sth2=vsqrt((vone-d->cth[i])*vload(0.5));
     sth2=vmax(sth2,vload(1e-15));
-    Tm mask=vlt(sth.v[i],vzero);
-    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
+    Tm mask=vlt(d->sth[i],vzero);
+    Tm cmask=vand_mask(mask,vlt(d->cth[i],vzero));
     vmuleq_mask(cmask,cth2,vload(-1.));
-    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
+    Tm smask=vand_mask(mask,vgt(d->cth[i],vzero));
     vmuleq_mask(smask,sth2,vload(-1.));
 
-    mypow(cth2,gen->cosPow,gen->powlimit,&ccp.v[i],&ccps.v[i]);
-    mypow(sth2,gen->sinPow,gen->powlimit,&ssp.v[i],&ssps.v[i]);
-    mypow(cth2,gen->sinPow,gen->powlimit,&csp.v[i],&csps.v[i]);
-    mypow(sth2,gen->cosPow,gen->powlimit,&scp.v[i],&scps.v[i]);
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp[i],&ccps[i]);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp[i],&ssps[i]);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp[i],&csps[i]);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp[i],&scps[i]);
 
-    rec1p->v[i] = vzero;
-    rec1m->v[i] = vzero;
-    rec2p->v[i]=vmul(prefac,ccp.v[i]);
-    scalep->v[i]=vadd(prescale,ccps.v[i]);
-    rec2m->v[i]=vmul(prefac,csp.v[i]);
-    scalem->v[i]=vadd(prescale,csps.v[i]);
-    Tvnormalize(&rec2m->v[i],&scalem->v[i],sharp_fbighalf);
-    Tvnormalize(&rec2p->v[i],&scalep->v[i],sharp_fbighalf);
+    d->l1p[i] = vzero;
+    d->l1m[i] = vzero;
+    d->l2p[i] = prefac*ccp[i];
+    d->scp[i] = prescale*ccps[i];
+    d->l2m[i] = prefac*csp[i];
+    d->scm[i] = prescale*csps[i];
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
 
-    rec2p->v[i]=vmul(rec2p->v[i],ssp.v[i]);
-    scalep->v[i]=vadd(scalep->v[i],ssps.v[i]);
-    rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
-    scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
+    d->l2p[i] *= ssp[i];
+    d->scp[i] += ssps[i];
+    d->l2m[i] *= scp[i];
+    d->scm[i] += scps[i];
     if (gen->preMinus_p)
-      rec2p.v[i]=vneg(rec2p.v[i]);
+      d->l2p[i] = vneg(d->l2p[i]);
     if (gen->preMinus_m)
-      rec2m.v[i]=vneg(rec2m.v[i]);
+      d->l2m[i] = vneg(d->l2m[i]);
     if (gen->s&1)
-      rec2p.v[i]=vneg(rec2p.v[i]);
+      d->l2p[i] = vneg(d->l2p[i]);
 
-    Tvnormalize(&rec2m.v[i],&scalem.v[i],sharp_ftol);
-    Tvnormalize(&rec2p.v[i],&scalep.v[i],sharp_ftol);
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
 
-    below_limit &= vallTrue(vand_mask(vlt(scalem.v[i],limscale),vlt(scalep.v[i],limscale)));
+    below_limit &= vallTrue(vand_mask(vlt(d->scm[i],limscale),vlt(d->scp[i],limscale)));
     }
 
   int l=gen->mhi;
@@ -269,48 +271,56 @@ NOINLINE static void iter_to_ieee_spin (const Tb cth, const Tb sth, int *l_,
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     for (int i=0; i<nv2; ++i)
       {
-      rec_step(&rec1p.v[i],&rec1m.v[i],&rec2p.v[i],&rec2m.v[i],cth.v[i],fx[l+1]);
-      rec_step(&rec2p.v[i],&rec2m.v[i],&rec1p.v[i],&rec1m.v[i],cth.v[i],fx[l+2]);
-      if (rescale(&rec1p.v[i],&rec2p.v[i],&scalep.v[i],vload(sharp_ftol)) ||
-          rescale(&rec1m.v[i],&rec2m.v[i],&scalem.v[i],vload(sharp_ftol)))
-      below_limit &= vallTrue(vlt(scalep.v[i],limscale)) &&
-                     vallTrue(vlt(scalem.v[i],limscale));
+      rec_step(&d->l1p[i],&d->l1m[i],&d->l2p[i],&d->l2m[i],d->cth[i],fx[l+1]);
+      rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
+      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
+          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
+      below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                     vallTrue(vlt(d->scm[i],limscale));
       }
     l+=2;
     }
 
   *l_=l;
-  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
-  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
   }
 
-NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
-  Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax, int nv2)
+NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx1=v1load(fx[l+1].f[1]),
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
        fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nvec; ++i)
       {
-      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
-      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
-      }
-    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)] NJ2);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
-      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw1=d->l2p[i]+d->l2m[i];
+      Tv lx2=d->l1m[i]-d->l1p[i];
+      d->p1pr[i] += agr1*lw1 - aci2*lx2;
+      d->p1pi[i] += agi1*lw1 + acr2*lx2;
+      d->p1mr[i] += acr1*lw1 + agi2*lx2;
+      d->p1mi[i] += aci1*lw1 - agr2*lx2;
+      Tv lx1=d->l2m[i]-d->l2p[i];
+      Tv lw2=d->l1p[i]+d->l1m[i];
+      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pi[i] += agi2*lw2 + acr1*lx1;
+      d->p2mr[i] += acr2*lw2 + agi1*lx1;
+      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       }
     l+=2;
     }
-  if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
+//  if (l==lmax)
+//    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
   }
 #endif
 

From 0b8222393fb9e60dc39cea6098ed543ac1874006 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 15 Dec 2018 13:02:56 +0100
Subject: [PATCH 28/85] cleanup

---
 Makefile.am                    |   2 -
 libsharp/sharp.h               | 243 ++++++++++++++++++++++++++++-
 libsharp/sharp_almhelpers.h    |   2 +-
 libsharp/sharp_complex_hacks.h |  71 ++++-----
 libsharp/sharp_core.c          |  12 +-
 libsharp/sharp_cxx.h           |   2 +-
 libsharp/sharp_geomhelpers.h   |   2 +-
 libsharp/sharp_internal.h      |   3 +-
 libsharp/sharp_lowlevel.h      | 270 ---------------------------------
 libsharp/sharp_mpi.h           |   2 +-
 libsharp/sharp_vecsupport.h    |  98 ------------
 libsharp/sharp_vecutil.h       |   2 +-
 12 files changed, 281 insertions(+), 428 deletions(-)
 delete mode 100644 libsharp/sharp_lowlevel.h

diff --git a/Makefile.am b/Makefile.am
index c738f29..a82583d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,14 +19,12 @@ src_sharp = \
   libsharp/sharp_core.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
-  libsharp/sharp_lowlevel.h \
   libsharp/sharp_vecsupport.h \
   libsharp/sharp_vecutil.h \
   libsharp/sharp_ylmgen_c.h
 
 include_HEADERS = \
   libsharp/sharp.h \
-  libsharp/sharp_lowlevel.h \
   libsharp/sharp_geomhelpers.h \
   libsharp/sharp_almhelpers.h \
   libsharp/sharp_cxx.h
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 9c5dd57..35a0cb5 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -23,21 +23,248 @@
  */
 
 /*! \file sharp.h
- *  Interface for the spherical transform library.
+ *  Portable interface for the spherical transform library.
  *
- *  Copyright (C) 2006-2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_H
-#define PLANCK_SHARP_H
+#ifndef PLANCK_SHARP_LOWLEVEL_H
+#define PLANCK_SHARP_LOWLEVEL_H
+
+#include <stddef.h>
 
 #ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
+extern "C" {
 #endif
 
-#include <complex.h>
+/*! \internal
+    Helper type containing information about a single ring. */
+typedef struct
+  {
+  double theta, phi0, weight, cth, sth;
+  ptrdiff_t ofs;
+  int nph, stride;
+  } sharp_ringinfo;
 
-#include "sharp_lowlevel.h"
+/*! \internal
+    Helper type containing information about a pair of rings with colatitudes
+    symmetric around the equator. */
+typedef struct
+  {
+  sharp_ringinfo r1,r2;
+  } sharp_ringpair;
+
+/*! \internal
+    Type holding all required information about a map geometry. */
+typedef struct
+  {
+  sharp_ringpair *pair;
+  int npairs, nphmax;
+  } sharp_geom_info;
+
+/*! \defgroup almgroup Helpers for dealing with a_lm */
+/*! \{ */
+
+/*! \internal
+    Helper type for index calculation in a_lm arrays. */
+typedef struct
+  {
+  /*! Maximum \a l index of the array */
+  int lmax;
+  /*! Number of different \a m values in this object */
+  int nm;
+  /*! Array with \a nm entries containing the individual m values */
+  int *mval;
+  /*! Combination of flags from sharp_almflags */
+  int flags;
+  /*! Array with \a nm entries containing the (hypothetical) indices of
+      the coefficients with quantum numbers 0,\a mval[i] */
+  ptrdiff_t *mvstart;
+  /*! Stride between a_lm and a_(l+1),m */
+  ptrdiff_t stride;
+  } sharp_alm_info;
+
+/*! alm_info flags */
+typedef enum { SHARP_PACKED = 1,
+               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
+                    not present. mvstart is in units of *real* float/double for all
+                    m; stride is in units of reals for m=0 and complex for m!=0 */
+               SHARP_REAL_HARMONICS  = 1<<6
+               /*!< Use the real spherical harmonic convention. For
+                    m==0, the alm are treated exactly the same as in
+                    the complex case.  For m!=0, alm[i] represent a
+                    pair (+abs(m), -abs(m)) instead of (real, imag),
+                    and the coefficients are scaled by a factor of
+                    sqrt(2) relative to the complex case.  In other
+                    words, (sqrt(.5) * alm[i]) recovers the
+                    corresponding complex coefficient (when accessed
+                    as complex).
+                */
+             } sharp_almflags;
+
+
+
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (>=0)
+    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mstart the index of the (hypothetical) coefficient with the
+      quantum numbers 0,\a m. Must have \a mmax+1 entries.
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_alm_info (int lmax, int mmax, int stride,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (\a >=0)
+    \param nm number of different \a m (\a 0<=nm<=lmax+1)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mval array with \a nm entries containing the individual m values
+    \param mvstart array with \a nm entries containing the (hypothetical)
+      indices of the coefficients with the quantum numbers 0,\a mval[i]
+    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
+  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
+/*! Returns the index of the coefficient with quantum numbers \a l,
+    \a mval[mi].
+    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
+    the index for the coefficient with the quantum numbers \a l, \a mi. */
+ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
+/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
+    flag is set, this is the number of "real" coefficients (for m < 0 and m >= 0),
+    otherwise it is the number of complex coefficients (with m>=0). */
+ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
+/*! Deallocates the a_lm info object. */
+void sharp_destroy_alm_info (sharp_alm_info *info);
+
+/*! \} */
+
+/*! \defgroup geominfogroup Functions for dealing with geometry information */
+/*! \{ */
+
+/*! Creates a geometry information object from a set of ring descriptions.
+    All arrays passed to this function must have \a nrings elements.
+    \param nrings the number of rings in the map
+    \param nph the number of pixels in each ring
+    \param ofs the index of the first pixel in each ring in the map array
+    \param stride the stride between consecutive pixels
+    \param phi0 the azimuth (in radians) of the first pixel in each ring
+    \param theta the colatitude (in radians) of each ring
+    \param wgt the pixel weight to be used for the ring in map2alm
+      and adjoint map2alm transforms.
+      Pass NULL to use 1.0 as weight for all rings.
+    \param geom_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
+  const int *stride, const double *phi0, const double *theta,
+  const double *wgt, sharp_geom_info **geom_info);
+
+/*! Counts the number of grid points needed for (the local part of) a map described
+    by \a info.
+ */
+ptrdiff_t sharp_map_size(const sharp_geom_info *info);
+
+/*! Deallocates the geometry information in \a info. */
+void sharp_destroy_geom_info (sharp_geom_info *info);
+
+/*! \} */
+
+/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
+/*! \{ */
+
+/*! Enumeration of SHARP job types. */
+typedef enum { SHARP_YtW=0,               /*!< analysis */
+               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
+               SHARP_Y=1,                 /*!< synthesis */
+               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
+               SHARP_Yt=2,                /*!< adjoint synthesis */
+               SHARP_WY=3,                /*!< adjoint analysis */
+               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
+             } sharp_jobtype;
+
+/*! Job flags */
+typedef enum { SHARP_DP              = 1<<4,
+               /*!< map and a_lm are in double precision */
+               SHARP_ADD             = 1<<5,
+               /*!< results are added to the output arrays, instead of
+                    overwriting them */
+
+               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
+                  but its use here is deprecated in favor of having it in the sharp_alm_info */
+
+               SHARP_NO_FFT          = 1<<7,
+
+               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
+               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
+             } sharp_jobflags;
+
+/*! Performs a libsharp SHT job. The interface deliberately does not use
+  the C99 "complex" data type, in order to be callable from C89 and C++.
+  \param type the type of SHT
+  \param spin the spin of the quantities to be transformed
+  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
+    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
+    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
+    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
+    depends on whether the SHARP_DP flag is set.
+  \param map contains pointers to the maps. If \a spin==0,
+    map[0] points to the map of the first SHT, map[1] to that of the second
+    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
+    point to the maps of the first SHT, map[2] and map[3] to those of the
+    second, etc. The exact data type of \a map depends on whether the SHARP_DP
+    flag is set.
+  \param geom_info A \c sharp_geom_info object compatible with the provided
+    \a map arrays.
+  \param alm_info A \c sharp_alm_info object compatible with the provided
+    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
+    exactly once.
+  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
+    \a alm is expected to have the type "complex double **" and \a map is
+    expected to have the type "double **"; otherwise, the expected
+    types are "complex float **" and "float **", respectively.
+  \param time If not NULL, the wall clock time required for this SHT
+    (in seconds) will be written here.
+  \param opcnt If not NULL, a conservative estimate of the total floating point
+    operation count for this SHT will be written here. */
+void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
+  int flags, double *time, unsigned long long *opcnt);
+
+void sharp_set_chunksize_min(int new_chunksize_min);
+void sharp_set_nchunks_max(int new_nchunks_max);
+
+
+typedef enum { SHARP_ERROR_NO_MPI = 1,
+               /*!< libsharp not compiled with MPI support */
+              } sharp_errors;
+
+/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
+    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
+
+    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
+    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
+    is *(MPI_Comm*)pcomm.
+
+    Other parameters are the same as sharp_execute_mpi.
+
+    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
+    (in which case nothing is done).
+ */
+int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int flags, double *time,
+  unsigned long long *opcnt);
+
+
+
+/*! \} */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 67016d7..c17028a 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_ALMHELPERS_H
 #define PLANCK_SHARP_ALMHELPERS_H
 
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
index 86d1153..6ec27bb 100644
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@@ -25,131 +25,126 @@
 /*  \file sharp_complex_hacks.h
  *  support for converting vector types and complex numbers
  *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
 #ifndef SHARP_COMPLEX_HACKS_H
 #define SHARP_COMPLEX_HACKS_H
 
-#ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
-#endif
-
 #include <math.h>
-#include <complex.h>
 #include "sharp_vecsupport.h"
 
 #define UNSAFE_CODE
 
 #if (VLEN==1)
 
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
   { return a+_Complex_I*b; }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
 
 #endif
 
 #if (VLEN==2)
 
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
   {
 #if defined(__SSE3__)
   Tv tmp = _mm_hadd_pd(a,b);
 #else
-  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
+  Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+           _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
 #endif
-  union {Tv v; complex double c; } u;
+  union {Tv v; _Complex double c; } u;
   u.v=tmp; return u.c;
   }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, complex double * restrict c1, complex double * restrict c2)
+  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
   {
 #ifdef UNSAFE_CODE
 #if defined(__SSE3__)
-  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
-  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
+  *((__m128d *)c1) += _mm_hadd_pd(a,b);
+  *((__m128d *)c2) += _mm_hadd_pd(c,d);
 #else
-  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
-  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
+  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
 #else
-  union {Tv v; complex double c; } u1, u2;
+  union {Tv v; _Complex double c; } u1, u2;
 #if defined(__SSE3__)
   u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
 #else
-  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
-  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
   *c1+=u1.c; *c2+=u2.c;
 #endif
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 
 #endif
 
 #if (VLEN==4)
 
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
   {
   Tv tmp=_mm256_hadd_pd(a,b);
   Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
   tmp=_mm256_add_pd(tmp,tmp2);
 #ifdef UNSAFE_CODE
-  complex double ret;
+  _Complex double ret;
   *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
   return ret;
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp; return u.c[0];
 #endif
   }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   {
   Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
   Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
      tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
   *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
   *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp1;
   *c1+=u.c[0]; *c2+=u.c[1];
 #endif
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   {
   Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
   Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
      tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
   _mm256_storeu_pd((double *)cc,
     _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp1;
   cc[0]+=u.c[0]; cc[1]+=u.c[1];
 #endif
@@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 
 #if (VLEN==8)
 
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
   { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   {
   *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
   *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 
 #endif
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2f88b90..36d4b43 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -114,8 +114,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
     do
       {
       if (npow&1)
-        vmuleq(res,val);
-      vmuleq(val,val);
+        res*=val;
+      val*=val;
       }
     while(npow>>=1);
     *resd=res;
@@ -129,12 +129,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
       {
       if (npow&1)
         {
-        vmuleq(res,val);
-        vaddeq(scale,scaleint);
+        res*=val;
+        scale+=scaleint;
         Tvnormalize(&res,&scale,sharp_fbighalf);
         }
-      vmuleq(val,val);
-      vaddeq(scaleint,scaleint);
+      val*=val;
+      scaleint+=scaleint;
       Tvnormalize(&val,&scaleint,sharp_fbighalf);
       }
     while(npow>>=1);
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index f0c2738..6d5a6e4 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -33,7 +33,7 @@
 #define PLANCK_SHARP_CXX_H
 
 #include <complex>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
 
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
index b7f98c4..1c77e27 100644
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_GEOMHELPERS_H
 #define PLANCK_SHARP_GEOMHELPERS_H
 
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 11f23cb..1c7f6b0 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -25,7 +25,7 @@
 /*! \file sharp_internal.h
  *  Internally used functionality for the spherical transform library.
  *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2018 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -36,6 +36,7 @@
 #error This header file cannot be included from C++, only from C
 #endif
 
+#include <complex.h>
 #include "sharp.h"
 
 #define SHARP_MAXTRANS 100
diff --git a/libsharp/sharp_lowlevel.h b/libsharp/sharp_lowlevel.h
deleted file mode 100644
index 2e7ab24..0000000
--- a/libsharp/sharp_lowlevel.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_lowlevel.h
- *  Low-level, portable interface for the spherical transform library.
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke \author Dag Sverre Seljebotn
- */
-
-#ifndef PLANCK_SHARP_LOWLEVEL_H
-#define PLANCK_SHARP_LOWLEVEL_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! \internal
-    Helper type containing information about a single ring. */
-typedef struct
-  {
-  double theta, phi0, weight, cth, sth;
-  ptrdiff_t ofs;
-  int nph, stride;
-  } sharp_ringinfo;
-
-/*! \internal
-    Helper type containing information about a pair of rings with colatitudes
-    symmetric around the equator. */
-typedef struct
-  {
-  sharp_ringinfo r1,r2;
-  } sharp_ringpair;
-
-/*! \internal
-    Type holding all required information about a map geometry. */
-typedef struct
-  {
-  sharp_ringpair *pair;
-  int npairs, nphmax;
-  } sharp_geom_info;
-
-/*! \defgroup almgroup Helpers for dealing with a_lm */
-/*! \{ */
-
-/*! \internal
-    Helper type for index calculation in a_lm arrays. */
-typedef struct
-  {
-  /*! Maximum \a l index of the array */
-  int lmax;
-  /*! Number of different \a m values in this object */
-  int nm;
-  /*! Array with \a nm entries containing the individual m values */
-  int *mval;
-  /*! Combination of flags from sharp_almflags */
-  int flags;
-  /*! Array with \a nm entries containing the (hypothetical) indices of
-      the coefficients with quantum numbers 0,\a mval[i] */
-  ptrdiff_t *mvstart;
-  /*! Stride between a_lm and a_(l+1),m */
-  ptrdiff_t stride;
-  } sharp_alm_info;
-
-/*! alm_info flags */
-typedef enum { SHARP_PACKED = 1,
-               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
-                    not present. mvstart is in units of *real* float/double for all
-                    m; stride is in units of reals for m=0 and complex for m!=0 */
-               SHARP_REAL_HARMONICS  = 1<<6
-               /*!< Use the real spherical harmonic convention. For
-                    m==0, the alm are treated exactly the same as in
-                    the complex case.  For m!=0, alm[i] represent a
-                    pair (+abs(m), -abs(m)) instead of (real, imag),
-                    and the coefficients are scaled by a factor of
-                    sqrt(2) relative to the complex case.  In other
-                    words, (sqrt(.5) * alm[i]) recovers the
-                    corresponding complex coefficient (when accessed
-                    as complex).
-                */
-             } sharp_almflags;
-
-
-
-/*! Creates an a_lm data structure from the following parameters:
-    \param lmax maximum \a l quantum number (>=0)
-    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mstart the index of the (hypothetical) coefficient with the
-      quantum numbers 0,\a m. Must have \a mmax+1 entries.
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_alm_info (int lmax, int mmax, int stride,
-  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
-/*! Creates an a_lm data structure which from the following parameters:
-    \param lmax maximum \a l quantum number (\a >=0)
-    \param nm number of different \a m (\a 0<=nm<=lmax+1)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mval array with \a nm entries containing the individual m values
-    \param mvstart array with \a nm entries containing the (hypothetical)
-      indices of the coefficients with the quantum numbers 0,\a mval[i]
-    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
-  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
-/*! Returns the index of the coefficient with quantum numbers \a l,
-    \a mval[mi].
-    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
-    the index for the coefficient with the quantum numbers \a l, \a mi. */
-ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
-/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
-    flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
-    otherwise it is the number of complex coefficients (with m>=0). */
-ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
-/*! Deallocates the a_lm info object. */
-void sharp_destroy_alm_info (sharp_alm_info *info);
-
-/*! \} */
-
-/*! \defgroup geominfogroup Functions for dealing with geometry information */
-/*! \{ */
-
-/*! Creates a geometry information from a set of ring descriptions.
-    All arrays passed to this function must have \a nrings elements.
-    \param nrings the number of rings in the map
-    \param nph the number of pixels in each ring
-    \param ofs the index of the first pixel in each ring in the map array
-    \param stride the stride between consecutive pixels
-    \param phi0 the azimuth (in radians) of the first pixel in each ring
-    \param theta the colatitude (in radians) of each ring
-    \param wgt the pixel weight to be used for the ring in map2alm
-      and adjoint map2alm transforms.
-      Pass NULL to use 1.0 as weight for all rings.
-    \param geom_info will hold a pointer to the newly created data structure
- */
-void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
-  const int *stride, const double *phi0, const double *theta,
-  const double *wgt, sharp_geom_info **geom_info);
-
-/*! Counts the number of grid points needed for (the local part of) a map described
-    by \a info.
- */
-ptrdiff_t sharp_map_size(const sharp_geom_info *info);
-
-/*! Deallocates the geometry information in \a info. */
-void sharp_destroy_geom_info (sharp_geom_info *info);
-
-/*! \} */
-
-/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
-/*! \{ */
-
-/*! Enumeration of SHARP job types. */
-typedef enum { SHARP_YtW=0,               /*!< analysis */
-               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
-               SHARP_Y=1,                 /*!< synthesis */
-               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
-               SHARP_Yt=2,                /*!< adjoint synthesis */
-               SHARP_WY=3,                /*!< adjoint analysis */
-               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
-             } sharp_jobtype;
-
-/*! Job flags */
-typedef enum { SHARP_DP              = 1<<4,
-               /*!< map and a_lm are in double precision */
-               SHARP_ADD             = 1<<5,
-               /*!< results are added to the output arrays, instead of
-                    overwriting them */
-
-               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
-                  but its use here is deprecated in favor of having it in the sharp_alm_info */
-
-               SHARP_NO_FFT          = 1<<7,
-
-               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
-               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
-             } sharp_jobflags;
-
-/*! Performs a libsharp SHT job. The interface deliberately does not use
-  the C99 "complex" data type, in order to be callable from C89 and C++.
-  \param type the type of SHT
-  \param spin the spin of the quantities to be transformed
-  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
-    depends on whether the SHARP_DP flag is set.
-  \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
-  \param geom_info A \c sharp_geom_info object compatible with the provided
-    \a map arrays.
-  \param alm_info A \c sharp_alm_info object compatible with the provided
-    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
-    exactly once.
-  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
-    \a alm is expected to have the type "complex double **" and \a map is
-    expected to have the type "double **"; otherwise, the expected
-    types are "complex float **" and "float **", respectively.
-  \param time If not NULL, the wall clock time required for this SHT
-    (in seconds) will be written here.
-  \param opcnt If not NULL, a conservative estimate of the total floating point
-    operation count for this SHT will be written here. */
-void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
-  int flags, double *time, unsigned long long *opcnt);
-
-void sharp_set_chunksize_min(int new_chunksize_min);
-void sharp_set_nchunks_max(int new_nchunks_max);
-
-
-typedef enum { SHARP_ERROR_NO_MPI = 1,
-               /*!< libsharp not compiled with MPI support */
-              } sharp_errors;
-
-/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
-    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
-
-    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
-    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
-    is *(MPI_Comm*)pcomm.
-
-    Other parameters are the same as sharp_execute_mpi.
-
-    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
-    (in which case nothing is done).
- */
-int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
-  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int flags, double *time,
-  unsigned long long *opcnt);
-
-
-
-/*! \} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index df07117..73a8aa0 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -33,7 +33,7 @@
 #define PLANCK_SHARP_MPI_H
 
 #include <mpi.h>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ff3f573..942e290 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -40,32 +40,13 @@ typedef double Ts;
 #if (VLEN==1)
 
 typedef double Tv;
-typedef float Tv_s;
 typedef int Tm;
 
-#define vadd(a,b) ((a)+(b))
-#define vadd_s(a,b) ((a)+(b))
-#define vaddeq(a,b) ((a)+=(b))
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
-#define vsub(a,b) ((a)-(b))
-#define vsub_s(a,b) ((a)-(b))
-#define vsubeq(a,b) ((a)-=(b))
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
-#define vmul(a,b) ((a)*(b))
-#define vmul_s(a,b) ((a)*(b))
-#define vmuleq(a,b) ((a)*=(b))
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
-#define vfmaeq(a,b,c) ((a)+=(b)*(c))
-#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
-#define vfmseq(a,b,c) ((a)-=(b)*(c))
-#define vabmc(a,b,c) ((a)*(b)-(c))
-#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
-#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
 #define vload(a) (a)
-#define vload_s(a) (a)
-#define vloadu(p) (*(p))
-#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -74,8 +55,6 @@ typedef int Tm;
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
-#define vstoreu(p, a) (*(p)=a)
-#define vstoreu_s(p, a) (*(p)=a)
 
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
@@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif
 
 typedef __m128d Tv;
-typedef __m128 Tv_s;
 typedef __m128d Tm;
 
 #if defined(__SSE4_1__)
@@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vzero _mm_setzero_pd()
 #define vone _mm_set1_pd(1.)
 
-#define vadd(a,b) _mm_add_pd(a,b)
-#define vadd_s(a,b) _mm_add_ps(a,b)
-#define vaddeq(a,b) a=_mm_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm_sub_pd(a,b)
-#define vsub_s(a,b) _mm_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm_mul_pd(a,b)
-#define vmul_s(a,b) _mm_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
-#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
-#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
-#define vload_s(a) _mm_set1_ps(a)
 #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
@@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
-#define vloadu(p) _mm_loadu_pd(p)
-#define vloadu_s(p) _mm_loadu_ps(p)
-#define vstoreu(p, v) _mm_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
 
 #endif
 
@@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #endif
 
 typedef __m256d Tv;
-typedef __m256 Tv_s;
 typedef __m256d Tm;
 
 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
 #define vzero _mm256_setzero_pd()
 #define vone _mm256_set1_pd(1.)
 
-#define vadd(a,b) _mm256_add_pd(a,b)
-#define vadd_s(a,b) _mm256_add_ps(a,b)
-#define vaddeq(a,b) a=_mm256_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm256_sub_pd(a,b)
-#define vsub_s(a,b) _mm256_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm256_mul_pd(a,b)
-#define vmul_s(a,b) _mm256_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#if (USE_FMA4)
-#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
-#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
-#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
-#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
-#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
-#else
-#if (USE_FMA)
-#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
-#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
-#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
-#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
-#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
-#else
-#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
-#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#endif
-#endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
-#define vload_s(a) _mm256_set1_ps(a)
 #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@@ -222,11 +141,6 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
 
-#define vloadu(p) _mm256_loadu_pd(p)
-#define vloadu_s(p) _mm256_loadu_ps(p)
-#define vstoreu(p, v) _mm256_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
-
 #endif
 
 #if (VLEN==8)
@@ -236,20 +150,8 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;
 
-#define vadd(a,b) _mm512_add_pd(a,b)
-#define vaddeq(a,b) a=_mm512_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
-#define vsub(a,b) _mm512_sub_pd(a,b)
-#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
-#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
-#define vmul(a,b) _mm512_mul_pd(a,b)
-#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
-#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
-//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
-//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
-#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
 #define vload(a) _mm512_set1_pd(a)
 #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 24a2e94..522cc5f 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -25,7 +25,7 @@
 /*! \file sharp_vecutil.h
  *  Functionality related to vector instruction support
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 

From 07a708dbc648087c8a2b5449f883ac264e3ab0e6 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 15 Dec 2018 15:39:09 +0100
Subject: [PATCH 29/85] spin>0 not yet working

---
 libsharp/sharp_core.c      | 312 ++++++++++++++++++++++++++++++++++++-
 libsharp/sharp_testsuite.c |   2 +-
 2 files changed, 306 insertions(+), 8 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 36d4b43..5060137 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -201,7 +201,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
   *l_=l;
   }
 
-#if 0
+#if 1
 static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
   Tv * restrict ryp, Tv * restrict rym, const Tv cth,
   const sharp_ylmgen_dbl3 fx)
@@ -261,7 +261,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
     Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
 
-    below_limit &= vallTrue(vand_mask(vlt(d->scm[i],limscale),vlt(d->scp[i],limscale)));
+    below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
+                   vallTrue(vlt(d->scp[i],limscale));
     }
 
   int l=gen->mhi;
@@ -298,7 +299,7 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
-    for (int i=0; i<nvec; ++i)
+    for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
       d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
@@ -319,8 +320,209 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       }
     l+=2;
     }
-//  if (l==lmax)
-//    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
+  }
+
+NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw = d->l2p[i] + d->l2m[i];
+      agr1 += d->p1pr[i]*lw;
+      agi1 += d->p1pi[i]*lw;
+      acr1 += d->p1mr[i]*lw;
+      aci1 += d->p1mi[i]*lw;
+      Tv lx = d->l2m[i] - d->l2p[i];
+      agr1 -= d->p2mi[i]*lx;
+      agi1 += d->p2mr[i]*lx;
+      acr1 += d->p2pi[i]*lx;
+      aci1 -= d->p2pr[i]*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      lw = d->l1p[i] + d->l1m[i];
+      agr2 += d->p2pr[i]*lw;
+      agi2 += d->p2pi[i]*lw;
+      acr2 += d->p2mr[i]*lw;
+      aci2 += d->p2mi[i]*lw;
+      lx = d->l1m[i] - d->l1p[i];
+      agr2 -= d->p1mi[i]*lx;
+      agi2 += d->p1mr[i]*lx;
+      acr2 += d->p1pi[i]*lx;
+      aci2 -= d->p1pr[i]*lx;
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->m) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
+      Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
+      d->p1pr[i] += agr1*lw1 - aci2*lx2;
+      d->p1pi[i] += agi1*lw1 + acr2*lx2;
+      d->p1mr[i] += acr1*lw1 + agi2*lx2;
+      d->p1mi[i] += aci1*lw1 - agr2*lx2;
+      Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
+      Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
+      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pi[i] += agi2*lw2 + acr1*lx1;
+      d->p2mr[i] += acr2*lw2 + agi1*lx1;
+      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
+NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->m) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
+      agr1 += d->p1pr[i]*lw;
+      agi1 += d->p1pi[i]*lw;
+      acr1 += d->p1mr[i]*lw;
+      aci1 += d->p1mi[i]*lw;
+      Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
+      agr1 -= d->p2mi[i]*lx;
+      agi1 += d->p2mr[i]*lx;
+      acr1 += d->p2pi[i]*lx;
+      aci1 -= d->p2pr[i]*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
+      agr2 += d->p2pr[i]*lw;
+      agi2 += d->p2pi[i]*lw;
+      acr2 += d->p2mr[i]*lw;
+      aci2 += d->p2mi[i]*lw;
+      lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
+      agr2 -= d->p1mi[i]*lx;
+      agi2 += d->p1mr[i]*lx;
+      acr2 += d->p1pi[i]*lx;
+      aci2 -= d->p1pr[i]*lx;
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
   }
 #endif
 
@@ -521,6 +723,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
               {
               d.s.cth[i]=d.s.cth[nth-1];
               d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
             calc_alm2map (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
@@ -538,7 +741,63 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         }
       else
         {
-        UTIL_FAIL("only spin==0 allowed at the moment");
+        int ith=0;
+        int itgt[nvec*VLEN];
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
+          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
+            {
+            if (mlim[ith]>=m)
+              {
+              itgt[nth] = ith;
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              ++nth;
+              }
+            else
+              {
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              job->phase[phas_idx  ] = job->phase[phas_idx+1] = 0;
+              job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            calc_alm2map_spin (job, gen, &d.v, nth);
+            for (int i=0; i<nth; ++i)
+              {
+              int tgt=itgt[i];
+              int phas_idx = tgt*job->s_th + mi*job->s_m;
+              complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
+                             q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
+                             u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
+                             u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
+              job->phase[phas_idx  ] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[tgt])
+                {
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1)
+                  { *phQ=-(*phQ); *phU=-(*phU); }
+                }
+              }
+            }
+          }
         }
       break;
       }
@@ -599,7 +858,46 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         }
       else
         {
-        UTIL_FAIL("only spin==0 allowed at the moment");
+        int ith=0;
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
+          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
+            {
+            if (mlim[ith]>=m)
+              {
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[ith] ? job->phase[phas_idx+1]:0.,
+                     p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1)
+                { p2Q=-p2Q; p2U=-p2U; }
+              d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
+              d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
+              d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
+              d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
+              ++nth;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            calc_map2alm_spin(job, gen, &d.v, nth);
+            }
+          }
         }
       break;
       }
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 4b124af..0171fe2 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -375,7 +375,7 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
     "error");
 
-#if 0
+#if 1
   sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
     NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),

From 846b37c231a4928f6161739509c14afa4834d6ff Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 15 Dec 2018 16:00:27 +0100
Subject: [PATCH 30/85] fixes, but still broken

---
 libsharp/sharp_core.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 5060137..f232018 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -241,9 +241,9 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     d->l1p[i] = vzero;
     d->l1m[i] = vzero;
     d->l2p[i] = prefac*ccp[i];
-    d->scp[i] = prescale*ccps[i];
+    d->scp[i] = prescale+ccps[i];
     d->l2m[i] = prefac*csp[i];
-    d->scm[i] = prescale*csps[i];
+    d->scm[i] = prescale+csps[i];
     Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
     Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
 
@@ -270,6 +270,7 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    below_limit=1;
     for (int i=0; i<nv2; ++i)
       {
       rec_step(&d->l1p[i],&d->l1m[i],&d->l2p[i],&d->l2m[i],d->cth[i],fx[l+1]);
@@ -827,7 +828,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         while (ith<ulim-llim)
           {
           s0data_u d;
-          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {
@@ -862,8 +862,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         while (ith<ulim-llim)
           {
           sxdata_u d;
-          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
-          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {

From 8c98d4624e41318217a70708da168546d9577655 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 10:32:49 +0100
Subject: [PATCH 31/85] works now

---
 libsharp/sharp_core.c     | 50 +++++++++++++++++----------------------
 libsharp/sharp_ylmgen_c.c | 16 ++++++-------
 2 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f232018..1e476d0 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -40,7 +40,7 @@
 
 typedef complex double dcmplx;
 
-#define nvec (256/VLEN)
+#define nvec (128/VLEN)
 
 typedef union
   { Tv v; double s[VLEN]; } Tvu;
@@ -215,42 +215,36 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tbv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
   int below_limit=1;
   for (int i=0; i<nv2; ++i)
     {
-    Tv cth2, sth2;
-    cth2=vsqrt((vone+d->cth[i])*vload(0.5));
-    cth2=vmax(cth2,vload(1e-15));
-    sth2=vsqrt((vone-d->cth[i])*vload(0.5));
-    sth2=vmax(sth2,vload(1e-15));
+    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
+    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
     Tm mask=vlt(d->sth[i],vzero);
-    Tm cmask=vand_mask(mask,vlt(d->cth[i],vzero));
-    vmuleq_mask(cmask,cth2,vload(-1.));
-    Tm smask=vand_mask(mask,vgt(d->cth[i],vzero));
-    vmuleq_mask(smask,sth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
 
-    mypow(cth2,gen->cosPow,gen->powlimit,&ccp[i],&ccps[i]);
-    mypow(sth2,gen->sinPow,gen->powlimit,&ssp[i],&ssps[i]);
-    mypow(cth2,gen->sinPow,gen->powlimit,&csp[i],&csps[i]);
-    mypow(sth2,gen->cosPow,gen->powlimit,&scp[i],&scps[i]);
+    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
 
     d->l1p[i] = vzero;
     d->l1m[i] = vzero;
-    d->l2p[i] = prefac*ccp[i];
-    d->scp[i] = prescale+ccps[i];
-    d->l2m[i] = prefac*csp[i];
-    d->scm[i] = prescale+csps[i];
+    d->l2p[i] = prefac*ccp;
+    d->scp[i] = prescale+ccps;
+    d->l2m[i] = prefac*csp;
+    d->scm[i] = prescale+csps;
     Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
     Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
-
-    d->l2p[i] *= ssp[i];
-    d->scp[i] += ssps[i];
-    d->l2m[i] *= scp[i];
-    d->scm[i] += scps[i];
+    d->l2p[i] *= ssp;
+    d->scp[i] += ssps;
+    d->l2m[i] *= scp;
+    d->scm[i] += scps;
     if (gen->preMinus_p)
       d->l2p[i] = vneg(d->l2p[i]);
     if (gen->preMinus_m)
@@ -277,8 +271,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
       rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
       if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
           rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
-      below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
-                     vallTrue(vlt(d->scm[i],limscale));
+        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                       vallTrue(vlt(d->scm[i],limscale));
       }
     l+=2;
     }
@@ -312,10 +306,10 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p1mi[i] += aci1*lw1 - agr2*lx2;
       Tv lx1=d->l2m[i]-d->l2p[i];
       Tv lw2=d->l1p[i]+d->l1m[i];
-      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       }
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e967773..9cb9dbb 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   else
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
-    for (int m=0; m<gen->lmax+2; ++m)
+    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
+    for (int m=0; m<gen->lmax+3; ++m)
       gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
-    ALLOC(gen->inv,double,gen->lmax+1);
+    ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
-    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
-    ALLOC(gen->flm1,double,2*gen->lmax+1);
-    ALLOC(gen->flm2,double,2*gen->lmax+1);
-    for (int m=0; m<2*gen->lmax+1; ++m)
+    for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+3);
+    ALLOC(gen->flm2,double,2*gen->lmax+3);
+    for (int m=0; m<2*gen->lmax+3; ++m)
       {
       gen->flm1[m] = sqrt(1./(m+1.));
       gen->flm2[m] = sqrt(m/(m+1.));
@@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
     if (!ms_similar)
       {
-      for (int l=gen->mhi; l<gen->lmax; ++l)
+      for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                   *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];

From ac3bf55ac501802c3aed9e22d98fca496bde23a0 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 15:08:51 +0100
Subject: [PATCH 32/85] fixes and cleanups

---
 libsharp/sharp.c           |  2 +-
 libsharp/sharp_core.c      | 73 ++++++++++++++--------------
 libsharp/sharp_testsuite.c | 99 ++++++--------------------------------
 3 files changed, 54 insertions(+), 120 deletions(-)

diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index f312fc3..aa680df 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -588,7 +588,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
     }
   else
     memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
-      job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
+      job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));
 
 #undef COPY_LOOP
   }
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 1e476d0..b1414f1 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -40,22 +40,20 @@
 
 typedef complex double dcmplx;
 
-#define nvec (128/VLEN)
+#define nv0 (64/VLEN)
+#define nvx (128/VLEN)
 
-typedef union
-  { Tv v; double s[VLEN]; } Tvu;
-
-typedef Tv Tbv[nvec];
-typedef double Tbs[nvec*VLEN];
+typedef Tv Tbv0[nv0];
+typedef double Tbs0[nv0*VLEN];
 
 typedef struct
   {
-  Tbv sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
   } s0data_v;
 
 typedef struct
   {
-  Tbs sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
   } s0data_s;
 
 typedef union
@@ -64,16 +62,19 @@ typedef union
   s0data_s s;
   } s0data_u;
 
+typedef Tv Tbvx[nvx];
+typedef double Tbsx[nvx*VLEN];
+
 typedef struct
   {
-  Tbv sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
-      p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
   } sxdata_v;
 
 typedef struct
   {
-  Tbs sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
-      p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
   } sxdata_s;
 
 typedef union
@@ -146,6 +147,9 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
 static inline void getCorfac(Tv scale, Tv * restrict corfac,
   const double * restrict cf)
   {
+  typedef union
+    { Tv v; double s[VLEN]; } Tvu;
+
   Tvu sc, corf;
   sc.v=scale;
   for (int i=0; i<VLEN; ++i)
@@ -201,16 +205,6 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
   *l_=l;
   }
 
-#if 1
-static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
-  Tv * restrict ryp, Tv * restrict rym, const Tv cth,
-  const sharp_ylmgen_dbl3 fx)
-  {
-  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
-  *rxp = (cth-fx1)*fx0* *ryp - fx2* *rxp;
-  *rxm = (cth+fx1)*fx0* *rym - fx2* *rxm;
-  }
-
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
@@ -265,10 +259,16 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
     for (int i=0; i<nv2; ++i)
       {
-      rec_step(&d->l1p[i],&d->l1m[i],&d->l2p[i],&d->l2m[i],d->cth[i],fx[l+1]);
-      rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
           rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
         below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
@@ -368,7 +368,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
   iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 10*nth;
+  job->opcnt += (l-gen->mhi) * 10*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 28*nth;
 
@@ -406,10 +406,10 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       d->p1mi[i] += aci1*lw1 - agr2*lx2;
       Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
       Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
-      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
@@ -443,7 +443,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
   iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 10*nth;
+  job->opcnt += (l-gen->mhi) * 10*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 28*nth;
 
@@ -519,7 +519,6 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
     }
   map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
   }
-#endif
 
 NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
@@ -678,19 +677,21 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
-  const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
   sharp_Ylmgen_prepare (gen, m);
 
   switch (job->type)
     {
-    case SHARP_ALM2MAP:
     case SHARP_ALM2MAP_DERIV1:
+      UTIL_FAIL("derivatives currently not supported");
+      break;
+    case SHARP_ALM2MAP:
       {
       if (job->spin==0)
         {
+        const int nval=nv0*VLEN;
         int ith=0;
-        int itgt[nvec*VLEN];
+        int itgt[nval];
         while (ith<ulim-llim)
           {
           s0data_u d;
@@ -736,8 +737,9 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         }
       else
         {
+        const int nval=nvx*VLEN;
         int ith=0;
-        int itgt[nvec*VLEN];
+        int itgt[nval];
         while (ith<ulim-llim)
           {
           sxdata_u d;
@@ -808,7 +810,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
-  const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
   sharp_Ylmgen_prepare (gen, m);
 
@@ -818,6 +819,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
+        const int nval=nv0*VLEN;
         int ith=0;
         while (ith<ulim-llim)
           {
@@ -852,6 +854,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
         }
       else
         {
+        const int nval=nvx*VLEN;
         int ith=0;
         while (ith<ulim-llim)
           {
@@ -919,5 +922,5 @@ int sharp_veclen(void)
 
 int sharp_max_nvec(void)
   {
-  return nvec;
+  return nv0;
   }
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 0171fe2..089324d 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -24,7 +24,7 @@
 
 /*  \file sharp_testsuite.c
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -375,7 +375,6 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
     "error");
 
-#if 1
   sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
     NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),
@@ -406,6 +405,7 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
     "error");
 
+#if 0
   sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
     SHARP_DP,NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ],-6.859393905369091105e-01,1e-11),
@@ -430,7 +430,7 @@ static void check_sign_scale(void)
   }
 
 static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int nv, double **err_abs, double **err_rel,
+  int spin, double **err_abs, double **err_rel,
   double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
   unsigned long long *op_m2a)
   {
@@ -450,20 +450,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
 
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
-    ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
+    ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m);
 #else
   sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
-    SHARP_DP|nv,t_a2m,op_a2m);
+    SHARP_DP,t_a2m,op_a2m);
 #endif
   if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
   if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
   double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
-    ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #else
   sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
-    SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #endif
   if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
   if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
@@ -475,11 +475,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
   }
 
 static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int nv)
+  int spin)
   {
   int ncomp = (spin==0) ? 1 : 2;
   double *err_abs, *err_rel;
-  do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
+  do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL,
     NULL, NULL);
   for (int i=0; i<ncomp; ++i)
     UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@@ -501,16 +501,11 @@ static void sharp_acctest(void)
   sharp_alm_info *ainfo;
   int lmax=127, mmax=127, nlat=128, nlon=256;
   get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
-  for (int nv=1; nv<=6; ++nv)
-    {
-    check_accuracy(ginfo,ainfo,0,nv);
-#if 0
-    check_accuracy(ginfo,ainfo,1,nv);
-    check_accuracy(ginfo,ainfo,2,nv);
-    check_accuracy(ginfo,ainfo,3,nv);
-    check_accuracy(ginfo,ainfo,30,nv);
-#endif
-    }
+  check_accuracy(ginfo,ainfo,0);
+  check_accuracy(ginfo,ainfo,1);
+  check_accuracy(ginfo,ainfo,2);
+  check_accuracy(ginfo,ainfo,3);
+  check_accuracy(ginfo,ainfo,30);
   sharp_destroy_alm_info(ainfo);
   sharp_destroy_geom_info(ginfo);
   if (mytask==0) printf("Passed.\n\n");
@@ -544,7 +539,7 @@ static void sharp_test (int argc, const char **argv)
     {
     ++nrpt;
     double ta2m2, tm2a2;
-    do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
+    do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, &ta2m2, &tm2a2,
       &op_a2m, &op_m2a);
     if (ta2m2<t_a2m) t_a2m=ta2m2;
     if (tm2a2<t_m2a) t_m2a=tm2a2;
@@ -604,68 +599,6 @@ static void sharp_test (int argc, const char **argv)
   DEALLOC(err_rel);
   }
 
-static void sharp_bench (int argc, const char **argv)
-  {
-  if (mytask==0) sharp_announce("sharp_bench");
-  UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
-  int lmax=atoi(argv[3]);
-  int mmax=atoi(argv[4]);
-  int gpar1=atoi(argv[5]);
-  int gpar2=atoi(argv[6]);
-  int spin=atoi(argv[7]);
-
-  if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d\n", spin);
-
-  sharp_geom_info *ginfo;
-  sharp_alm_info *ainfo;
-  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
-
-  double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30;
-  unsigned long long opa2m_min=0, opm2a_min=0;
-  int nvmin_a2m=-1, nvmin_m2a=-1;
-  for (int nv=0; nv<=6; ++nv)
-    {
-    int ntries=0;
-    double tacc=0;
-    do
-      {
-      double t_a2m, t_m2a;
-      unsigned long long op_a2m, op_m2a;
-      double *err_abs,*err_rel;
-      do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
-        &t_a2m, &t_m2a, &op_a2m, &op_m2a);
-
-      DEALLOC(err_abs);
-      DEALLOC(err_rel);
-      tacc+=t_a2m+t_m2a;
-      ++ntries;
-      if (nv==0)
-        {
-        if (t_a2m<ta2m_auto) ta2m_auto=t_a2m;
-        if (t_m2a<tm2a_auto) tm2a_auto=t_m2a;
-        }
-      else
-        {
-        if (t_a2m<ta2m_min) { nvmin_a2m=nv; ta2m_min=t_a2m; opa2m_min=op_a2m; }
-        if (t_m2a<tm2a_min) { nvmin_m2a=nv; tm2a_min=t_m2a; opm2a_min=op_m2a; }
-        }
-      } while((ntries<2)||(tacc<3.));
-    }
-  if (mytask==0)
-    {
-    printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_a2m,ta2m_min,100.*(ta2m_auto-ta2m_min)/ta2m_auto,
-      1e-9*opa2m_min/ta2m_min);
-    printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_m2a,tm2a_min,100.*(tm2a_auto-tm2a_min)/tm2a_auto,
-      1e-9*opm2a_min/tm2a_min);
-    }
-
-  sharp_destroy_alm_info(ainfo);
-  sharp_destroy_geom_info(ginfo);
-  }
-
 int main(int argc, const char **argv)
   {
 #ifdef USE_MPI
@@ -682,8 +615,6 @@ int main(int argc, const char **argv)
     sharp_acctest();
   else if (strcmp(argv[1],"test")==0)
     sharp_test(argc,argv);
-  else if (strcmp(argv[1],"bench")==0)
-    sharp_bench(argc,argv);
   else
     UTIL_FAIL("unknown command");
 

From 3efcfa1ef26cf26b12812bce866e4bb5efdeef18 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 15:25:21 +0100
Subject: [PATCH 33/85] cleanup

---
 Makefile.am                |  1 -
 libsharp/sharp.c           |  4 ---
 libsharp/sharp_announce.c  |  2 +-
 libsharp/sharp_core.c      |  2 +-
 libsharp/sharp_core.h      | 53 --------------------------------------
 libsharp/sharp_internal.h  | 10 +++++--
 libsharp/sharp_testsuite.c |  2 +-
 7 files changed, 11 insertions(+), 63 deletions(-)
 delete mode 100644 libsharp/sharp_core.h

diff --git a/Makefile.am b/Makefile.am
index a82583d..6370d95 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,7 +16,6 @@ src_sharp = \
   libsharp/sharp_ylmgen_c.c \
   libsharp/sharp_announce.h \
   libsharp/sharp_complex_hacks.h \
-  libsharp/sharp_core.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
   libsharp/sharp_vecsupport.h \
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index aa680df..bbb3872 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -35,7 +35,6 @@
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
-#include "sharp_core.h"
 #include "walltime_c.h"
 #include "sharp_almhelpers.h"
 #include "sharp_geomhelpers.h"
@@ -960,9 +959,6 @@ void sharp_set_chunksize_min(int new_chunksize_min)
 void sharp_set_nchunks_max(int new_nchunks_max)
   { nchunks_max=new_nchunks_max; }
 
-int sharp_get_nv_max (void)
-{ return 6; }
-
 #ifdef USE_MPI
 #include "sharp_mpi.c"
 
diff --git a/libsharp/sharp_announce.c b/libsharp/sharp_announce.c
index 7027167..a028258 100644
--- a/libsharp/sharp_announce.c
+++ b/libsharp/sharp_announce.c
@@ -40,7 +40,7 @@
 #endif
 
 #include "sharp_announce.h"
-#include "sharp_core.h"
+#include "sharp_internal.h"
 
 static void OpenMP_status(void)
   {
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index b1414f1..4d761fb 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -35,7 +35,7 @@
 #include "sharp_vecsupport.h"
 #include "sharp_complex_hacks.h"
 #include "sharp.h"
-#include "sharp_core.h"
+#include "sharp_internal.h"
 #include "c_utils.h"
 
 typedef complex double dcmplx;
diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h
deleted file mode 100644
index a9e509b..0000000
--- a/libsharp/sharp_core.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core.h
- *  Interface for the computational core
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_SHARP_CORE_H
-#define PLANCK_SHARP_CORE_H
-
-#include "sharp_internal.h"
-#include "sharp_ylmgen_c.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim);
-
-int sharp_veclen(void);
-int sharp_max_nvec(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 1c7f6b0..635aeb8 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -38,6 +38,7 @@
 
 #include <complex.h>
 #include "sharp.h"
+#include "sharp_ylmgen_c.h"
 
 #define SHARP_MAXTRANS 100
 
@@ -59,8 +60,13 @@ typedef struct
   unsigned long long opcnt;
   } sharp_job;
 
-int sharp_get_nv_max (void);
-int sharp_nv_oracle (sharp_jobtype type, int spin);
 int sharp_get_mlim (int lmax, int spin, double sth, double cth);
 
+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim);
+
+int sharp_veclen(void);
+int sharp_max_nvec(void);
+
 #endif
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 089324d..15c95b2 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -589,7 +589,7 @@ static void sharp_test (int argc, const char **argv)
     }
 
   if (mytask==0)
-    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
+    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %.2e %7.2f %.2e %7.2f"
            " %9.2f %6.2f %.2e %.2e\n",
       getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2,
       t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),

From fbb56d58c627eb213f8414f6f4448f63bebc3ee2 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 15:59:41 +0100
Subject: [PATCH 34/85] add derivatives

---
 libsharp/sharp_core.c      | 116 +++++++++++++++++++++++++++++++++++--
 libsharp/sharp_testsuite.c |   2 -
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 4d761fb..7d35325 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -670,6 +670,114 @@ NOINLINE static void calc_map2alm(sharp_job * restrict job,
   map2alm_kernel(d, rf, alm, l, lmax, nv2);
   }
 
+NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw=d->l2p[i]+d->l2m[i];
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]-d->l2p[i];
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]+d->l1m[i];
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]-d->l1p[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 20*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i];
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i];
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i];
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
 
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
@@ -682,10 +790,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
 
   switch (job->type)
     {
-    case SHARP_ALM2MAP_DERIV1:
-      UTIL_FAIL("derivatives currently not supported");
-      break;
     case SHARP_ALM2MAP:
+    case SHARP_ALM2MAP_DERIV1:
       {
       if (job->spin==0)
         {
@@ -772,7 +878,9 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
               d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
               d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
               }
-            calc_alm2map_spin (job, gen, &d.v, nth);
+            (job->type==SHARP_ALM2MAP) ?
+              calc_alm2map_spin  (job, gen, &d.v, nth) :
+              calc_alm2map_deriv1(job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
               int tgt=itgt[i];
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 15c95b2..f712ae4 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -405,7 +405,6 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
     "error");
 
-#if 0
   sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
     SHARP_DP,NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ],-6.859393905369091105e-01,1e-11),
@@ -420,7 +419,6 @@ static void check_sign_scale(void)
     "error");
   UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
     "error");
-#endif
 
   DEALLOC2D(map);
   DEALLOC2D(alm);

From f95699948c38162c8aa035d8e9b18b21959947d6 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 16:11:00 +0100
Subject: [PATCH 35/85] rearranging

---
 libsharp/sharp_core.c | 393 +++++++++++++++++++++---------------------
 1 file changed, 197 insertions(+), 196 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7d35325..e611805 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -40,8 +40,8 @@
 
 typedef complex double dcmplx;
 
-#define nv0 (64/VLEN)
-#define nvx (128/VLEN)
+#define nv0 (128/VLEN)
+#define nvx (64/VLEN)
 
 typedef Tv Tbv0[nv0];
 typedef double Tbs0[nv0*VLEN];
@@ -205,6 +205,156 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
   *l_=l;
   }
 
+NOINLINE static void alm2map_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      d->p2r[i] += d->lam1[i]*ar2;
+      d->p2i[i] += d->lam1[i]*ai2;
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee(gen, d, &l, nv2);
+  job->opcnt += (l-gen->m) * 4*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*nth;
+
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
+      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+        }
+      d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  alm2map_kernel(d, rf, alm, l, lmax, nv2);
+  }
+
+NOINLINE static void map2alm_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
+  int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      atmp[2] += d->lam1[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->p2i[i];
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_map2alm(sharp_job * restrict job,
+  const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth)
+  {
+  int lmax=gen->lmax;
+  int l=gen->m;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee(gen, d, &l, nv2);
+  job->opcnt += (l-gen->m) * 4*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 8*nth;
+
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+
+  while ((!full_ieee) && (l<=lmax))
+    {
+    full_ieee=1;
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+        }
+      atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    l+=2;
+    }
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  map2alm_kernel(d, rf, alm, l, lmax, nv2);
+  }
+
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
@@ -317,51 +467,6 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
     }
   }
 
-NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
-      Tv lw = d->l2p[i] + d->l2m[i];
-      agr1 += d->p1pr[i]*lw;
-      agi1 += d->p1pi[i]*lw;
-      acr1 += d->p1mr[i]*lw;
-      aci1 += d->p1mi[i]*lw;
-      Tv lx = d->l2m[i] - d->l2p[i];
-      agr1 -= d->p2mi[i]*lx;
-      agi1 += d->p2mr[i]*lx;
-      acr1 += d->p2pi[i]*lx;
-      aci1 -= d->p2pr[i]*lx;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
-      lw = d->l1p[i] + d->l1m[i];
-      agr2 += d->p2pr[i]*lw;
-      agi2 += d->p2pi[i]*lw;
-      acr2 += d->p2mr[i]*lw;
-      aci2 += d->p2mi[i]*lw;
-      lx = d->l1m[i] - d->l1p[i];
-      agr2 -= d->p1mi[i]*lx;
-      agi2 += d->p1mr[i]*lx;
-      acr2 += d->p1pi[i]*lx;
-      aci2 -= d->p1pr[i]*lx;
-      }
-    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
-    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
-    l+=2;
-    }
-  }
-
 NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
   {
@@ -437,6 +542,51 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
   }
 
+NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw = d->l2p[i] + d->l2m[i];
+      agr1 += d->p1pr[i]*lw;
+      agi1 += d->p1pi[i]*lw;
+      acr1 += d->p1mr[i]*lw;
+      aci1 += d->p1mi[i]*lw;
+      Tv lx = d->l2m[i] - d->l2p[i];
+      agr1 -= d->p2mi[i]*lx;
+      agi1 += d->p2mr[i]*lx;
+      acr1 += d->p2pi[i]*lx;
+      aci1 -= d->p2pr[i]*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      lw = d->l1p[i] + d->l1m[i];
+      agr2 += d->p2pr[i]*lw;
+      agi2 += d->p2pi[i]*lw;
+      acr2 += d->p2mr[i]*lw;
+      aci2 += d->p2mi[i]*lw;
+      lx = d->l1m[i] - d->l1p[i];
+      agr2 -= d->p1mi[i]*lx;
+      agi2 += d->p1mr[i]*lx;
+      acr2 += d->p1pi[i]*lx;
+      aci2 -= d->p1pr[i]*lx;
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  }
+
 NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
   {
@@ -520,155 +670,6 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
   }
 
-NOINLINE static void alm2map_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      d->p2r[i] += d->lam1[i]*ar2;
-      d->p2i[i] += d->lam1[i]*ai2;
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void map2alm_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
-  int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      atmp[0] += d->lam2[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->p1i[i];
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      atmp[2] += d->lam1[i]*d->p2r[i];
-      atmp[3] += d->lam1[i]*d->p2i[i];
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
-  {
-  int l,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*nth;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
-      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
-      d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
-      d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
-      }
-    l+=2;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  alm2map_kernel(d, rf, alm, l, lmax, nv2);
-  }
-
-NOINLINE static void calc_map2alm(sharp_job * restrict job,
-  const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth)
-  {
-  int lmax=gen->lmax;
-  int l=gen->m;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*nth;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while ((!full_ieee) && (l<=lmax))
-    {
-    full_ieee=1;
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
-      atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
-      atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
-    }
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  map2alm_kernel(d, rf, alm, l, lmax, nv2);
-  }
 
 NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,

From d257e92e54718ae5dfc8b4b0824bb69ac8a31fe1 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 17 Dec 2018 16:53:28 +0100
Subject: [PATCH 36/85] compactification

---
 libsharp/sharp_core.c | 48 +++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index e611805..2dcee90 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -559,27 +559,19 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
       d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
       d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
       Tv lw = d->l2p[i] + d->l2m[i];
-      agr1 += d->p1pr[i]*lw;
-      agi1 += d->p1pi[i]*lw;
-      acr1 += d->p1mr[i]*lw;
-      aci1 += d->p1mi[i]*lw;
       Tv lx = d->l2m[i] - d->l2p[i];
-      agr1 -= d->p2mi[i]*lx;
-      agi1 += d->p2mr[i]*lx;
-      acr1 += d->p2pi[i]*lx;
-      aci1 -= d->p2pr[i]*lx;
+      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;;
+      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
+      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
+      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       lw = d->l1p[i] + d->l1m[i];
-      agr2 += d->p2pr[i]*lw;
-      agi2 += d->p2pi[i]*lw;
-      acr2 += d->p2mr[i]*lw;
-      aci2 += d->p2mi[i]*lw;
       lx = d->l1m[i] - d->l1p[i];
-      agr2 -= d->p1mi[i]*lx;
-      agi2 += d->p1mr[i]*lx;
-      acr2 += d->p1pi[i]*lx;
-      aci2 -= d->p1pr[i]*lx;
+      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
+      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
+      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
+      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@@ -622,27 +614,19 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
       d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
       d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
       Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
-      agr1 += d->p1pr[i]*lw;
-      agi1 += d->p1pi[i]*lw;
-      acr1 += d->p1mr[i]*lw;
-      aci1 += d->p1mi[i]*lw;
       Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
-      agr1 -= d->p2mi[i]*lx;
-      agi1 += d->p2mr[i]*lx;
-      acr1 += d->p2pi[i]*lx;
-      aci1 -= d->p2pr[i]*lx;
+      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
+      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
+      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
+      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
-      agr2 += d->p2pr[i]*lw;
-      agi2 += d->p2pi[i]*lw;
-      acr2 += d->p2mr[i]*lw;
-      aci2 += d->p2mi[i]*lw;
       lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
-      agr2 -= d->p1mi[i]*lx;
-      agi2 += d->p1mr[i]*lx;
-      acr2 += d->p1pi[i]*lx;
-      aci2 -= d->p1pr[i]*lx;
+      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
+      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
+      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
+      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
         {
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);

From 202507ecaa80020918a2324cf01ef5bd0eccc121 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 18 Dec 2018 13:45:25 +0100
Subject: [PATCH 37/85] first try; broken

---
 libsharp/sharp_core.c     | 54 ++++++++++++++++++++++++++++++++++++++-
 libsharp/sharp_ylmgen_c.c | 26 ++++++++++++++++---
 libsharp/sharp_ylmgen_c.h |  2 ++
 3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2dcee90..690bf2a 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -280,6 +280,58 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
   alm2map_kernel(d, rf, alm, l, lmax, nv2);
   }
 
+NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+  {
+  int l=gen->m, lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  job->opcnt += (lmax+1-l) * 6*nth;
+
+const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
+  Tv mfac = vload(gen->mfac[gen->m]);
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i]=vzero;
+    mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]);
+    d->lam2[i] *= mfac;
+    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    }
+
+  const dcmplx * restrict alm=job->almtmp;
+  dcmplx * restrict alm2=RALLOC(dcmplx, gen->lmax+5);
+  {
+  for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+    {
+    dcmplx al = alm[l];
+    dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
+    dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
+    alm2[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
+    alm2[l+1] = gen->alpha[il]*al1;
+    }
+  }
+  for (int i=0; i<nv2; ++i)
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+
+  for (int il=0, l=gen->m; l<=lmax; ++il, l+=2)
+    {
+    Tv ar1=vload(creal(alm2[l  ])), ai1=vload(cimag(alm2[l  ]));
+    Tv ar2=vload(creal(alm2[l+1])), ai2=vload(cimag(alm2[l+1]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
+      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
+      d->p2r[i] += d->cth[i]*d->lam2[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->cth[i]*d->lam2[i]*d->corfac[i]*ai2;
+      Tv tmp = (gen->a[il]*d->cth[i]*d->cth[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+      }
+    }
+  DEALLOC(alm2);
+  }
+
 NOINLINE static void map2alm_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
   int lmax, int nv2)
@@ -812,7 +864,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
               d.s.sth[i]=d.s.sth[nth-1];
               d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_alm2map (job, gen, &d.v, nth);
+            calc_alm2map_alt (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
               int tgt=itgt[i];
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 9cb9dbb..e44d88b 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -74,13 +74,17 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
       gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-    gen->root = RALLOC(double,2*gen->lmax+5);
-    gen->iroot = RALLOC(double,2*gen->lmax+5);
-    for (int m=0; m<2*gen->lmax+5; ++m)
+    gen->root = RALLOC(double,2*gen->lmax+6);
+    gen->iroot = RALLOC(double,2*gen->lmax+6);
+    for (int m=0; m<2*gen->lmax+6; ++m)
       {
       gen->root[m] = sqrt(m);
       gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
       }
+gen->eps=RALLOC(double, gen->lmax+10);
+gen->alpha=RALLOC(double, gen->lmax/2+10);
+gen->a=RALLOC(double, gen->lmax/2+10);
+gen->b=RALLOC(double, gen->lmax/2+10);
     }
   else
     {
@@ -137,6 +141,10 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
     DEALLOC(gen->mfac);
     DEALLOC(gen->root);
     DEALLOC(gen->iroot);
+DEALLOC(gen->eps);
+DEALLOC(gen->alpha);
+DEALLOC(gen->a);
+DEALLOC(gen->b);
     }
   else
     {
@@ -165,6 +173,18 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
       gen->rf[l].f[0] = tmp*gen->root[2*l+1];
       gen->rf[l].f[1] = tmp*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1];
       }
+for (int l=m; l<gen->lmax+10; ++l)
+  gen->eps[l] = sqrt((l*l-m*m)/(4.*l*l-1));
+gen->alpha[0] = 1./gen->eps[m+1];
+gen->alpha[1] = gen->eps[m+1]/(gen->eps[m+2]*gen->eps[m+3]);
+for (int il=1, l=m+2; l<gen->lmax+5; ++il, l+=2)
+  gen->alpha[il+1]= ((il&1) ? -1 : 1)/(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
+for (int il=0, l=m; l<gen->lmax+5; ++il, l+=2)
+  {
+  gen->a[il] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+  double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
+  gen->b[il] = -gen->a[il]*(t1*t1+t2*t2);
+  }
     }
   else
     {
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 63b23cd..2e9e5ff 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -58,6 +58,8 @@ typedef struct
   double *mfac;
   sharp_ylmgen_dbl2 *rf;
 
+double *eps, *alpha, *a, *b;
+
 /* used if s!=0 */
   int sinPow, cosPow, preMinus_p, preMinus_m;
   double *prefac;

From b80c2a55437100f8dab76b4fb68a0b4183a3d9c8 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 18 Dec 2018 14:06:14 +0100
Subject: [PATCH 38/85] seems to work

---
 libsharp/sharp_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 690bf2a..86a40c2 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -289,6 +289,7 @@ NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
 
 const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
   Tv mfac = vload(gen->mfac[gen->m]);
+  if (gen->m&1) mfac=-mfac;
   for (int i=0; i<nv2; ++i)
     {
     d->lam1[i]=vzero;

From 0ee3a87e2d1caebc5d956e097645c9d258396b91 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 18 Dec 2018 15:45:55 +0100
Subject: [PATCH 39/85] better

---
 libsharp/sharp_core.c | 125 +++++++++++++++++++++++++++++++++---------
 1 file changed, 99 insertions(+), 26 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 86a40c2..afeded5 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -283,54 +283,113 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
 NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
-  int l=gen->m, lmax=gen->lmax;
+  int lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
-  job->opcnt += (lmax+1-l) * 6*nth;
-
-const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
+  job->opcnt += (lmax+1-gen->m) * 6*nth;
+  int full_ieee;
+Tv csq[nv0];
   Tv mfac = vload(gen->mfac[gen->m]);
   if (gen->m&1) mfac=-mfac;
+  int below_limit = 1;
   for (int i=0; i<nv2; ++i)
     {
+csq[i] = d->cth[i]*d->cth[i];
     d->lam1[i]=vzero;
-    mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]);
+    mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
     d->lam2[i] *= mfac;
     Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+    }
+  int l=gen->m, il=0;
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {l=gen->lmax+1;return;}
+    below_limit=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+      }
+    l+=2; ++il;
     }
 
-  const dcmplx * restrict alm=job->almtmp;
-  dcmplx * restrict alm2=RALLOC(dcmplx, gen->lmax+5);
+   const dcmplx * restrict alm=job->almtmp;
+   full_ieee=1;
+   for (int i=0; i<nv2; ++i)
+     {
+     getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+     full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+     }
   {
-  for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+ // int il=0, l=gen->m;
+  for (; (l<=lmax) && (!full_ieee); ++il, l+=2)
     {
-    dcmplx al = alm[l];
-    dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-    dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
-    alm2[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
-    alm2[l+1] = gen->alpha[il]*al1;
-    }
-  }
-  for (int i=0; i<nv2; ++i)
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-
-  for (int il=0, l=gen->m; l<=lmax; ++il, l+=2)
-    {
-    Tv ar1=vload(creal(alm2[l  ])), ai1=vload(cimag(alm2[l  ]));
-    Tv ar2=vload(creal(alm2[l+1])), ai2=vload(cimag(alm2[l+1]));
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
       d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
       d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
-      d->p2r[i] += d->cth[i]*d->lam2[i]*d->corfac[i]*ar2;
-      d->p2i[i] += d->cth[i]*d->lam2[i]*d->corfac[i]*ai2;
-      Tv tmp = (gen->a[il]*d->cth[i]*d->cth[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
+      d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2;
+      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
       d->lam1[i] = d->lam2[i];
       d->lam2[i] = tmp;
       if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        {
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+        }
+   //   full_ieee=0;
       }
     }
-  DEALLOC(alm2);
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  for (; l<=lmax-2; il+=2, l+=4)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+    Tv a1=vload(gen->a[il]), a2=vload(gen->a[il+1]),
+       b1=vload(gen->b[il]), b2=vload(gen->b[il+1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      d->lam1[i] = (a1*csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      d->p1r[i] += d->lam1[i]*ar3;
+      d->p1i[i] += d->lam1[i]*ai3;
+      d->p2r[i] += d->lam1[i]*ar4;
+      d->p2i[i] += d->lam1[i]*ai4;
+      d->lam2[i] = (a2*csq[i] + b2)*d->lam1[i] + d->lam2[i];
+     }
+    }
+  for (; l<=lmax; ++il, l+=2)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      }
+    }
+  }
   }
 
 NOINLINE static void map2alm_kernel(s0data_v * restrict d,
@@ -833,6 +892,17 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
+//adjust the a_lm for the new algorithm
+dcmplx * restrict alm=job->almtmp;
+for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+  {
+  dcmplx al = alm[l];
+  dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
+  dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
+  alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
+  alm[l+1] = gen->alpha[il]*al1;
+  }
+
         const int nval=nv0*VLEN;
         int ith=0;
         int itgt[nval];
@@ -868,6 +938,9 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
             calc_alm2map_alt (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
+//adjust for new algorithm
+d.s.p2r[i]*=d.s.cth[i];
+d.s.p2i[i]*=d.s.cth[i];
               int tgt=itgt[i];
               int phas_idx = tgt*job->s_th + mi*job->s_m;
               complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,

From b8effac3b390f57f3cccf6cb517221c344190709 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 18 Dec 2018 16:12:57 +0100
Subject: [PATCH 40/85] standardize

---
 libsharp/sharp_core.c     | 194 +++++++++++++++++++++-----------------
 libsharp/sharp_ylmgen_c.c |  10 +-
 libsharp/sharp_ylmgen_c.h |   3 +-
 3 files changed, 113 insertions(+), 94 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index afeded5..03e142f 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -48,12 +48,12 @@ typedef double Tbs0[nv0*VLEN];
 
 typedef struct
   {
-  Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  Tbv0 sth, corfac, scale, lam1, lam2, cth, csq, p1r, p1i, p2r, p2i;
   } s0data_v;
 
 typedef struct
   {
-  Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  Tbs0 sth, corfac, scale, lam1, lam2, cth, csq, p1r, p1i, p2r, p2i;
   } s0data_s;
 
 typedef union
@@ -205,6 +205,40 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
   *l_=l;
   }
 
+NOINLINE static void iter_to_ieee_alt(const sharp_Ylmgen_C * restrict gen,
+  s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
+  {
+  int l=gen->m, il=0;
+  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
+  Tv limscale=vload(sharp_limscale);
+  int below_limit = 1;
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i]=vzero;
+    mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
+    d->lam2[i] *= mfac;
+    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+    }
+
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    below_limit=1;
+    Tv a=vload(gen->ab[il].f[0]), b=vload(gen->ab[il].f[1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+      }
+    l+=2; ++il;
+    }
+  *l_=l; *il_=il;
+  }
+
 NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
@@ -280,55 +314,76 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
   alm2map_kernel(d, rf, alm, l, lmax, nv2);
   }
 
-NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+NOINLINE static void alm2map_alt_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
+  int l, int il, int lmax, int nv2)
   {
-  int lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  job->opcnt += (lmax+1-gen->m) * 6*nth;
-  int full_ieee;
-Tv csq[nv0];
-  Tv mfac = vload(gen->mfac[gen->m]);
-  if (gen->m&1) mfac=-mfac;
-  int below_limit = 1;
-  for (int i=0; i<nv2; ++i)
-    {
-csq[i] = d->cth[i]*d->cth[i];
-    d->lam1[i]=vzero;
-    mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
-    d->lam2[i] *= mfac;
-    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
-    }
-  int l=gen->m, il=0;
-  while (below_limit)
-    {
-    if (l+2>gen->lmax) {l=gen->lmax+1;return;}
-    below_limit=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
-      }
-    l+=2; ++il;
-    }
-
-   const dcmplx * restrict alm=job->almtmp;
-   full_ieee=1;
-   for (int i=0; i<nv2; ++i)
-     {
-     getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-     full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-     }
-  {
- // int il=0, l=gen->m;
-  for (; (l<=lmax) && (!full_ieee); ++il, l+=2)
+  for (; l<=lmax-2; il+=2, l+=4)
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      d->p1r[i] += d->lam1[i]*ar3;
+      d->p1i[i] += d->lam1[i]*ai3;
+      d->p2r[i] += d->lam1[i]*ar4;
+      d->p2i[i] += d->lam1[i]*ai4;
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+     }
+    }
+  for (; l<=lmax; ++il, l+=2)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      }
+    }
+  }
+
+NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+  {
+  int l,il,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  for (int i=0; i<nv2; ++i)
+    d->csq[i] = d->cth[i]*d->cth[i];
+  iter_to_ieee_alt(gen, d, &l, &il, nv2);
+  job->opcnt += il * 4*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 6*nth;
+
+  const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -336,7 +391,7 @@ csq[i] = d->cth[i]*d->cth[i];
       d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
       d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2;
       d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2;
-      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
       d->lam1[i] = d->lam2[i];
       d->lam2[i] = tmp;
       if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
@@ -344,52 +399,17 @@ csq[i] = d->cth[i]*d->cth[i];
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
         full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
         }
-   //   full_ieee=0;
       }
+    l+=2; ++il;
     }
+  if (l>lmax) return;
+
   for (int i=0; i<nv2; ++i)
     {
     d->lam1[i] *= d->corfac[i];
     d->lam2[i] *= d->corfac[i];
     }
-  for (; l<=lmax-2; il+=2, l+=4)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-    Tv a1=vload(gen->a[il]), a2=vload(gen->a[il+1]),
-       b1=vload(gen->b[il]), b2=vload(gen->b[il+1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->p2r[i] += d->lam2[i]*ar2;
-      d->p2i[i] += d->lam2[i]*ai2;
-      d->lam1[i] = (a1*csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      d->p1r[i] += d->lam1[i]*ar3;
-      d->p1i[i] += d->lam1[i]*ai3;
-      d->p2r[i] += d->lam1[i]*ar4;
-      d->p2i[i] += d->lam1[i]*ai4;
-      d->lam2[i] = (a2*csq[i] + b2)*d->lam1[i] + d->lam2[i];
-     }
-    }
-  for (; l<=lmax; ++il, l+=2)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    for (int i=0; i<nv2; ++i)
-      {
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->p2r[i] += d->lam2[i]*ar2;
-      d->p2i[i] += d->lam2[i]*ai2;
-      Tv tmp = (gen->a[il]*csq[i] + gen->b[il])*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      }
-    }
-  }
+  alm2map_alt_kernel(d, ab, alm, l, il, lmax, nv2);
   }
 
 NOINLINE static void map2alm_kernel(s0data_v * restrict d,
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e44d88b..a8e8d61 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -83,8 +83,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
       }
 gen->eps=RALLOC(double, gen->lmax+10);
 gen->alpha=RALLOC(double, gen->lmax/2+10);
-gen->a=RALLOC(double, gen->lmax/2+10);
-gen->b=RALLOC(double, gen->lmax/2+10);
+gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+10);
     }
   else
     {
@@ -143,8 +142,7 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
     DEALLOC(gen->iroot);
 DEALLOC(gen->eps);
 DEALLOC(gen->alpha);
-DEALLOC(gen->a);
-DEALLOC(gen->b);
+DEALLOC(gen->ab);
     }
   else
     {
@@ -181,9 +179,9 @@ for (int il=1, l=m+2; l<gen->lmax+5; ++il, l+=2)
   gen->alpha[il+1]= ((il&1) ? -1 : 1)/(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
 for (int il=0, l=m; l<gen->lmax+5; ++il, l+=2)
   {
-  gen->a[il] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+  gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
   double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
-  gen->b[il] = -gen->a[il]*(t1*t1+t2*t2);
+  gen->ab[il].f[1] = -gen->ab[il].f[0]*(t1*t1+t2*t2);
   }
     }
   else
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 2e9e5ff..606ed9a 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -58,7 +58,8 @@ typedef struct
   double *mfac;
   sharp_ylmgen_dbl2 *rf;
 
-double *eps, *alpha, *a, *b;
+double *eps, *alpha;
+sharp_ylmgen_dbl2 *ab;
 
 /* used if s!=0 */
   int sinPow, cosPow, preMinus_p, preMinus_m;

From 354d2ec286712b0b264aae1f8e1cd796a4613069 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 10:05:49 +0100
Subject: [PATCH 41/85] more

---
 libsharp/sharp_core.c | 117 +++++++++++++++++++++++++++++++++---------
 1 file changed, 94 insertions(+), 23 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 03e142f..03083f2 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -318,28 +318,6 @@ NOINLINE static void alm2map_alt_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
   int l, int il, int lmax, int nv2)
   {
-  for (; l<=lmax-2; il+=2, l+=4)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->p2r[i] += d->lam2[i]*ar2;
-      d->p2i[i] += d->lam2[i]*ai2;
-      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      d->p1r[i] += d->lam1[i]*ar3;
-      d->p1i[i] += d->lam1[i]*ai3;
-      d->p2r[i] += d->lam1[i]*ar4;
-      d->p2i[i] += d->lam1[i]*ai4;
-      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-     }
-    }
   for (; l<=lmax; ++il, l+=2)
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
@@ -358,6 +336,29 @@ NOINLINE static void alm2map_alt_kernel(s0data_v * restrict d,
     }
   }
 
+NOINLINE static void map2alm_alt_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
+  int il, int lmax, int nv2)
+  {
+  for (; l<=lmax; ++il, l+=2)
+    {
+    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp[0] += d->lam2[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->p1i[i];
+      atmp[2] += d->lam2[i]*d->p2r[i];
+      atmp[3] += d->lam2[i]*d->p2i[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    l+=2;
+    }
+  }
+
 NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
@@ -487,6 +488,63 @@ NOINLINE static void calc_map2alm(sharp_job * restrict job,
   map2alm_kernel(d, rf, alm, l, lmax, nv2);
   }
 
+NOINLINE static void calc_map2alm_alt (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+  {
+  int l,il,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  for (int i=0; i<nv2; ++i)
+    d->csq[i] = d->cth[i]*d->cth[i];
+  iter_to_ieee_alt(gen, d, &l, &il, nv2);
+  job->opcnt += il * 4*nth;
+  if (l>lmax) return;
+//  printf("beep\n");
+  job->opcnt += (lmax+1-l) * 6*nth;
+
+  const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
+      atmp[2] += d->lam2[i]*d->corfac[i]*d->p2r[i];
+      atmp[3] += d->lam2[i]*d->corfac[i]*d->p2i[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+        }
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    l+=2; ++il;
+    full_ieee=0;
+    }
+  if (l>lmax) return;
+//  printf("boop\n");
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  map2alm_alt_kernel(d, ab, alm, l, il, lmax, nv2);
+  }
+
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
@@ -1074,6 +1132,9 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
               dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
               d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
               d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
+//adjust for new algorithm
+d.s.p2r[nth]*=d.s.cth[nth];
+d.s.p2i[nth]*=d.s.cth[nth];
               ++nth;
               }
             ++ith;
@@ -1087,9 +1148,19 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
               d.s.sth[i]=d.s.sth[nth-1];
               d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_map2alm (job, gen, &d.v, nth);
+            calc_map2alm_alt (job, gen, &d.v, nth);
             }
           }
+//adjust the a_lm for the new algorithm
+dcmplx * restrict alm=job->almtmp;
+for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+  {
+  dcmplx al = alm[l];
+  dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
+  dcmplx alm2 = (l<gen->m+2) ? 0. : alm[l-2];
+  alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l]*alm2);
+  alm[l+1] = gen->alpha[il]*al1;
+  }
         }
       else
         {

From 9a572e33b5a014bf96cd7d8e60b00e4235152952 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 11:22:17 +0100
Subject: [PATCH 42/85] seems to work

---
 libsharp/sharp_core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 03083f2..ff5dd83 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -355,7 +355,6 @@ NOINLINE static void map2alm_alt_kernel(s0data_v * restrict d,
       d->lam2[i] = tmp;
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
     }
   }
 
@@ -498,7 +497,6 @@ NOINLINE static void calc_map2alm_alt (sharp_job * restrict job,
   iter_to_ieee_alt(gen, d, &l, &il, nv2);
   job->opcnt += il * 4*nth;
   if (l>lmax) return;
-//  printf("beep\n");
   job->opcnt += (lmax+1-l) * 6*nth;
 
   const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
@@ -532,10 +530,8 @@ NOINLINE static void calc_map2alm_alt (sharp_job * restrict job,
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2; ++il;
-    full_ieee=0;
     }
   if (l>lmax) return;
-//  printf("boop\n");
 
   for (int i=0; i<nv2; ++i)
     {
@@ -1153,13 +1149,16 @@ d.s.p2i[nth]*=d.s.cth[nth];
           }
 //adjust the a_lm for the new algorithm
 dcmplx * restrict alm=job->almtmp;
+dcmplx alm2 = 0.;
+double alold=0;
 for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
   {
   dcmplx al = alm[l];
   dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-  dcmplx alm2 = (l<gen->m+2) ? 0. : alm[l-2];
-  alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l]*alm2);
+  alm[l  ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2;
   alm[l+1] = gen->alpha[il]*al1;
+  alm2=al;
+  alold=gen->alpha[il];
   }
         }
       else

From ec05c52653ddb1bc2694bc39c3c6c370dd9cb10e Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 11:37:47 +0100
Subject: [PATCH 43/85] performance tweak

---
 libsharp/sharp_core.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index ff5dd83..22eff09 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -223,18 +223,18 @@ NOINLINE static void iter_to_ieee_alt(const sharp_Ylmgen_C * restrict gen,
 
   while (below_limit)
     {
-    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv a=vload(gen->ab[il].f[0]), b=vload(gen->ab[il].f[1]);
+    Tv a1=vload(gen->ab[il  ].f[0]), b1=vload(gen->ab[il  ].f[1]);
+    Tv a2=vload(gen->ab[il+1].f[0]), b2=vload(gen->ab[il+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
       if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
         below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
       }
-    l+=2; ++il;
+    l+=4; il+=2;
     }
   *l_=l; *il_=il;
   }
@@ -340,6 +340,28 @@ NOINLINE static void map2alm_alt_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
   int il, int lmax, int nv2)
   {
+  for (; l<=lmax-2; il+=2, l+=4)
+    {
+    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+    Tv atmp1[4] = {vzero, vzero, vzero, vzero};
+    Tv atmp2[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp1[0] += d->lam2[i]*d->p1r[i];
+      atmp1[1] += d->lam2[i]*d->p1i[i];
+      atmp1[2] += d->lam2[i]*d->p2r[i];
+      atmp1[3] += d->lam2[i]*d->p2i[i];
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      atmp2[0] += d->lam1[i]*d->p1r[i];
+      atmp2[1] += d->lam1[i]*d->p1i[i];
+      atmp2[2] += d->lam1[i]*d->p2r[i];
+      atmp2[3] += d->lam1[i]*d->p2i[i];
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+      }
+    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]);
+    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
+    }
   for (; l<=lmax; ++il, l+=2)
     {
     Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);

From c118c4fd611fc2799697a120f4b2da48c08b1c3e Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 11:50:52 +0100
Subject: [PATCH 44/85] cleanup

---
 libsharp/sharp_core.c | 289 ++++++++----------------------------------
 1 file changed, 51 insertions(+), 238 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 22eff09..538198b 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -48,12 +48,12 @@ typedef double Tbs0[nv0*VLEN];
 
 typedef struct
   {
-  Tbv0 sth, corfac, scale, lam1, lam2, cth, csq, p1r, p1i, p2r, p2i;
+  Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
   } s0data_v;
 
 typedef struct
   {
-  Tbs0 sth, corfac, scale, lam1, lam2, cth, csq, p1r, p1i, p2r, p2i;
+  Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
   } s0data_s;
 
 typedef union
@@ -172,40 +172,6 @@ static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, T
   }
 
 NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
-  s0data_v * restrict d, int * restrict l_, int nv2)
-  {
-  int l=gen->m;
-  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tv limscale=vload(sharp_limscale);
-  int below_limit = 1;
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i]=vzero;
-    mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]);
-    d->lam2[i] *= mfac;
-    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(d->scale[i],limscale));
-    }
-
-  while (below_limit)
-    {
-    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    below_limit=1;
-    Tv r10=vload(gen->rf[l  ].f[0]), r11=vload(gen->rf[l  ].f[1]),
-       r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = r10*d->cth[i]*d->lam2[i] - r11*d->lam1[i];
-      d->lam2[i] = r20*d->cth[i]*d->lam1[i] - r21*d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scale[i],limscale));
-      }
-    l+=2;
-    }
-  *l_=l;
-  }
-
-NOINLINE static void iter_to_ieee_alt(const sharp_Ylmgen_C * restrict gen,
   s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
   {
   int l=gen->m, il=0;
@@ -240,81 +206,6 @@ NOINLINE static void iter_to_ieee_alt(const sharp_Ylmgen_C * restrict gen,
   }
 
 NOINLINE static void alm2map_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      d->p2r[i] += d->lam1[i]*ar2;
-      d->p2i[i] += d->lam1[i]*ai2;
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
-  {
-  int l,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*nth;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
-      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
-      d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
-      d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
-      }
-    l+=2;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  alm2map_kernel(d, rf, alm, l, lmax, nv2);
-  }
-
-NOINLINE static void alm2map_alt_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
   int l, int il, int lmax, int nv2)
   {
@@ -336,58 +227,12 @@ NOINLINE static void alm2map_alt_kernel(s0data_v * restrict d,
     }
   }
 
-NOINLINE static void map2alm_alt_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
-  int il, int lmax, int nv2)
-  {
-  for (; l<=lmax-2; il+=2, l+=4)
-    {
-    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
-    Tv atmp1[4] = {vzero, vzero, vzero, vzero};
-    Tv atmp2[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      atmp1[0] += d->lam2[i]*d->p1r[i];
-      atmp1[1] += d->lam2[i]*d->p1i[i];
-      atmp1[2] += d->lam2[i]*d->p2r[i];
-      atmp1[3] += d->lam2[i]*d->p2i[i];
-      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      atmp2[0] += d->lam1[i]*d->p1r[i];
-      atmp2[1] += d->lam1[i]*d->p1i[i];
-      atmp2[2] += d->lam1[i]*d->p2r[i];
-      atmp2[3] += d->lam1[i]*d->p2i[i];
-      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-      }
-    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]);
-    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
-    }
-  for (; l<=lmax; ++il, l+=2)
-    {
-    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      atmp[0] += d->lam2[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->p1i[i];
-      atmp[2] += d->lam2[i]*d->p2r[i];
-      atmp[3] += d->lam2[i]*d->p2i[i];
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    }
-  }
-
-NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
+NOINLINE static void calc_alm2map (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
   int l,il,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
-  for (int i=0; i<nv2; ++i)
-    d->csq[i] = d->cth[i]*d->cth[i];
-  iter_to_ieee_alt(gen, d, &l, &il, nv2);
+  iter_to_ieee(gen, d, &l, &il, nv2);
   job->opcnt += il * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
@@ -431,92 +276,59 @@ NOINLINE static void calc_alm2map_alt (sharp_job * restrict job,
     d->lam1[i] *= d->corfac[i];
     d->lam2[i] *= d->corfac[i];
     }
-  alm2map_alt_kernel(d, ab, alm, l, il, lmax, nv2);
+  alm2map_kernel(d, ab, alm, l, il, lmax, nv2);
   }
 
 NOINLINE static void map2alm_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
-  int lmax, int nv2)
+  const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
+  int il, int lmax, int nv2)
   {
-  while (l<=lmax)
+  for (; l<=lmax-2; il+=2, l+=4)
     {
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+    Tv atmp1[4] = {vzero, vzero, vzero, vzero};
+    Tv atmp2[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp1[0] += d->lam2[i]*d->p1r[i];
+      atmp1[1] += d->lam2[i]*d->p1i[i];
+      atmp1[2] += d->lam2[i]*d->p2r[i];
+      atmp1[3] += d->lam2[i]*d->p2i[i];
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      atmp2[0] += d->lam1[i]*d->p1r[i];
+      atmp2[1] += d->lam1[i]*d->p1i[i];
+      atmp2[2] += d->lam1[i]*d->p2r[i];
+      atmp2[3] += d->lam1[i]*d->p2i[i];
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+      }
+    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]);
+    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
+    }
+  for (; l<=lmax; ++il, l+=2)
+    {
+    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
       atmp[0] += d->lam2[i]*d->p1r[i];
       atmp[1] += d->lam2[i]*d->p1i[i];
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      atmp[2] += d->lam1[i]*d->p2r[i];
-      atmp[3] += d->lam1[i]*d->p2i[i];
+      atmp[2] += d->lam2[i]*d->p2r[i];
+      atmp[3] += d->lam2[i]*d->p2i[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
     }
   }
 
-NOINLINE static void calc_map2alm(sharp_job * restrict job,
-  const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth)
-  {
-  int lmax=gen->lmax;
-  int l=gen->m;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, nv2);
-  job->opcnt += (l-gen->m) * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 8*nth;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while ((!full_ieee) && (l<=lmax))
-    {
-    full_ieee=1;
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
-      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
-      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
-      atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
-      atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
-    }
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  map2alm_kernel(d, rf, alm, l, lmax, nv2);
-  }
-
-NOINLINE static void calc_map2alm_alt (sharp_job * restrict job,
+NOINLINE static void calc_map2alm (sharp_job * restrict job,
   const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
   int l,il,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
-  for (int i=0; i<nv2; ++i)
-    d->csq[i] = d->cth[i]*d->cth[i];
-  iter_to_ieee_alt(gen, d, &l, &il, nv2);
+  iter_to_ieee(gen, d, &l, &il, nv2);
   job->opcnt += il * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
@@ -560,7 +372,7 @@ NOINLINE static void calc_map2alm_alt (sharp_job * restrict job,
     d->lam1[i] *= d->corfac[i];
     d->lam2[i] *= d->corfac[i];
     }
-  map2alm_alt_kernel(d, ab, alm, l, il, lmax, nv2);
+  map2alm_kernel(d, ab, alm, l, il, lmax, nv2);
   }
 
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
@@ -1012,7 +824,8 @@ for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
             if (mlim[ith]>=m)
               {
               itgt[nth] = ith;
-              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              d.s.csq[nth]=cth_[ith]*cth_[ith];
+              d.s.sth[nth]=sth_[ith];
               ++nth;
               }
             else
@@ -1027,17 +840,17 @@ for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.csq[i]=d.s.csq[nth-1];
               d.s.sth[i]=d.s.sth[nth-1];
               d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_alm2map_alt (job, gen, &d.v, nth);
+            calc_alm2map (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
-//adjust for new algorithm
-d.s.p2r[i]*=d.s.cth[i];
-d.s.p2i[i]*=d.s.cth[i];
               int tgt=itgt[i];
+//adjust for new algorithm
+d.s.p2r[i]*=cth_[tgt];
+d.s.p2i[i]*=cth_[tgt];
               int phas_idx = tgt*job->s_th + mi*job->s_m;
               complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
                              r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
@@ -1144,15 +957,15 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
             {
             if (mlim[ith]>=m)
               {
-              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              d.s.csq[nth]=cth_[ith]*cth_[ith]; d.s.sth[nth]=sth_[ith];
               int phas_idx = ith*job->s_th + mi*job->s_m;
               dcmplx ph1=job->phase[phas_idx];
               dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
               d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
               d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
 //adjust for new algorithm
-d.s.p2r[nth]*=d.s.cth[nth];
-d.s.p2i[nth]*=d.s.cth[nth];
+d.s.p2r[nth]*=cth_[ith];
+d.s.p2i[nth]*=cth_[ith];
               ++nth;
               }
             ++ith;
@@ -1162,11 +975,11 @@ d.s.p2i[nth]*=d.s.cth[nth];
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.csq[i]=d.s.csq[nth-1];
               d.s.sth[i]=d.s.sth[nth-1];
               d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_map2alm_alt (job, gen, &d.v, nth);
+            calc_map2alm (job, gen, &d.v, nth);
             }
           }
 //adjust the a_lm for the new algorithm

From b545c1cfe087b15e87915448cb80161385b706f5 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 12:10:12 +0100
Subject: [PATCH 45/85] cleanup

---
 libsharp/sharp_core.c     | 58 +++++++++++++++++++--------------------
 libsharp/sharp_ylmgen_c.c | 44 ++++++++++++-----------------
 libsharp/sharp_ylmgen_c.h |  7 ++---
 3 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 538198b..94b426e 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -800,16 +800,16 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
-//adjust the a_lm for the new algorithm
-dcmplx * restrict alm=job->almtmp;
-for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
-  {
-  dcmplx al = alm[l];
-  dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-  dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
-  alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
-  alm[l+1] = gen->alpha[il]*al1;
-  }
+        //adjust the a_lm for the new algorithm
+        dcmplx * restrict alm=job->almtmp;
+        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+          {
+          dcmplx al = alm[l];
+          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
+          dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
+          alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
+          alm[l+1] = gen->alpha[il]*al1;
+          }
 
         const int nval=nv0*VLEN;
         int ith=0;
@@ -848,9 +848,9 @@ for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
             for (int i=0; i<nth; ++i)
               {
               int tgt=itgt[i];
-//adjust for new algorithm
-d.s.p2r[i]*=cth_[tgt];
-d.s.p2i[i]*=cth_[tgt];
+              //adjust for new algorithm
+              d.s.p2r[i]*=cth_[tgt];
+              d.s.p2i[i]*=cth_[tgt];
               int phas_idx = tgt*job->s_th + mi*job->s_m;
               complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
                              r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
@@ -963,9 +963,9 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
               dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
               d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
               d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
-//adjust for new algorithm
-d.s.p2r[nth]*=cth_[ith];
-d.s.p2i[nth]*=cth_[ith];
+              //adjust for new algorithm
+              d.s.p2r[nth]*=cth_[ith];
+              d.s.p2i[nth]*=cth_[ith];
               ++nth;
               }
             ++ith;
@@ -982,19 +982,19 @@ d.s.p2i[nth]*=cth_[ith];
             calc_map2alm (job, gen, &d.v, nth);
             }
           }
-//adjust the a_lm for the new algorithm
-dcmplx * restrict alm=job->almtmp;
-dcmplx alm2 = 0.;
-double alold=0;
-for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
-  {
-  dcmplx al = alm[l];
-  dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-  alm[l  ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2;
-  alm[l+1] = gen->alpha[il]*al1;
-  alm2=al;
-  alold=gen->alpha[il];
-  }
+        //adjust the a_lm for the new algorithm
+        dcmplx * restrict alm=job->almtmp;
+        dcmplx alm2 = 0.;
+        double alold=0;
+        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
+          {
+          dcmplx al = alm[l];
+          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
+          alm[l  ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2;
+          alm[l+1] = gen->alpha[il]*al1;
+          alm2=al;
+          alold=gen->alpha[il];
+          }
         }
       else
         {
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index a8e8d61..622fc30 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -69,7 +69,6 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   gen->m = -1;
   if (spin==0)
     {
-    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+2);
     gen->mfac = RALLOC(double,gen->mmax+1);
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
@@ -81,9 +80,9 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
       gen->root[m] = sqrt(m);
       gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
       }
-gen->eps=RALLOC(double, gen->lmax+10);
-gen->alpha=RALLOC(double, gen->lmax/2+10);
-gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+10);
+    gen->eps=RALLOC(double, gen->lmax+10);
+    gen->alpha=RALLOC(double, gen->lmax/2+10);
+    gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+10);
     }
   else
     {
@@ -136,13 +135,12 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   DEALLOC(gen->powlimit);
   if (gen->s==0)
     {
-    DEALLOC(gen->rf);
     DEALLOC(gen->mfac);
     DEALLOC(gen->root);
     DEALLOC(gen->iroot);
-DEALLOC(gen->eps);
-DEALLOC(gen->alpha);
-DEALLOC(gen->ab);
+    DEALLOC(gen->eps);
+    DEALLOC(gen->alpha);
+    DEALLOC(gen->ab);
     }
   else
     {
@@ -163,26 +161,20 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
   if (gen->s==0)
     {
-    gen->rf[m].f[0] = gen->root[2*m+3];
-    gen->rf[m].f[1] = 0.;
-    for (int l=m+1; l<=gen->lmax+1; ++l)
+    for (int l=m; l<gen->lmax+10; ++l)
+      gen->eps[l] = gen->root[l+m]*gen->root[l-m]
+                   *gen->iroot[2*l+1]*gen->iroot[2*l-1];
+    gen->alpha[0] = 1./gen->eps[m+1];
+    gen->alpha[1] = gen->eps[m+1]/(gen->eps[m+2]*gen->eps[m+3]);
+    for (int il=1, l=m+2; l<gen->lmax+5; ++il, l+=2)
+      gen->alpha[il+1]= ((il&1) ? -1 : 1)
+                       /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
+    for (int il=0, l=m; l<gen->lmax+5; ++il, l+=2)
       {
-      double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
-      gen->rf[l].f[0] = tmp*gen->root[2*l+1];
-      gen->rf[l].f[1] = tmp*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1];
+      gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+      double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
+      gen->ab[il].f[1] = -gen->ab[il].f[0]*(t1*t1+t2*t2);
       }
-for (int l=m; l<gen->lmax+10; ++l)
-  gen->eps[l] = sqrt((l*l-m*m)/(4.*l*l-1));
-gen->alpha[0] = 1./gen->eps[m+1];
-gen->alpha[1] = gen->eps[m+1]/(gen->eps[m+2]*gen->eps[m+3]);
-for (int il=1, l=m+2; l<gen->lmax+5; ++il, l+=2)
-  gen->alpha[il+1]= ((il&1) ? -1 : 1)/(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
-for (int il=0, l=m; l<gen->lmax+5; ++il, l+=2)
-  {
-  gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
-  double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
-  gen->ab[il].f[1] = -gen->ab[il].f[0]*(t1*t1+t2*t2);
-  }
     }
   else
     {
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 606ed9a..487d207 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -55,11 +55,8 @@ typedef struct
   int m;
 
 /* used if s==0 */
-  double *mfac;
-  sharp_ylmgen_dbl2 *rf;
-
-double *eps, *alpha;
-sharp_ylmgen_dbl2 *ab;
+  double *mfac, *eps, *alpha;
+  sharp_ylmgen_dbl2 *ab;
 
 /* used if s!=0 */
   int sinPow, cosPow, preMinus_p, preMinus_m;

From 5d560c2b2c585792067b12601981d04638e8bd1b Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 14:12:58 +0100
Subject: [PATCH 46/85] adjust array size

---
 libsharp/sharp_ylmgen_c.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e967773..548f061 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -74,9 +74,9 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
       gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-    gen->root = RALLOC(double,2*gen->lmax+5);
-    gen->iroot = RALLOC(double,2*gen->lmax+5);
-    for (int m=0; m<2*gen->lmax+5; ++m)
+    gen->root = RALLOC(double,2*gen->lmax+7);
+    gen->iroot = RALLOC(double,2*gen->lmax+7);
+    for (int m=0; m<2*gen->lmax+7; ++m)
       {
       gen->root[m] = sqrt(m);
       gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];

From eaa4eaf0c0c936645b48e5ce0fc877064fd92089 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 14:19:53 +0100
Subject: [PATCH 47/85] safety margin

---
 libsharp/sharp_ylmgen_c.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 622fc30..f0672b0 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -73,9 +73,9 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
       gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-    gen->root = RALLOC(double,2*gen->lmax+6);
-    gen->iroot = RALLOC(double,2*gen->lmax+6);
-    for (int m=0; m<2*gen->lmax+6; ++m)
+    gen->root = RALLOC(double,2*gen->lmax+7);
+    gen->iroot = RALLOC(double,2*gen->lmax+7);
+    for (int m=0; m<2*gen->lmax+7; ++m)
       {
       gen->root[m] = sqrt(m);
       gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];

From d295e4c60990459eed84a84955989a63ec7dab59 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 15:07:17 +0100
Subject: [PATCH 48/85] fix

---
 libsharp/sharp_core.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2dcee90..37f7710 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -261,10 +261,8 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
       d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
       d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
       if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
       d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
       d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
       }
@@ -336,16 +334,15 @@ NOINLINE static void calc_map2alm(sharp_job * restrict job,
       atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
       d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
       if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
       atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
       atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
     }
+  if (l>lmax) return;
 
   for (int i=0; i<nv2; ++i)
     {

From fb11092bb1bb957643222a8b6eb58cf75fe34540 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 15:14:05 +0100
Subject: [PATCH 49/85] more fixes

---
 libsharp/sharp_core.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 37f7710..7e08b94 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -515,15 +515,11 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
       }
     l+=2;
     }
@@ -625,15 +621,11 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
       acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
       aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
-        {
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-        }
+      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);

From 3890bf174b0853072f095f7bc51d924ec8940462 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 19 Dec 2018 17:27:38 +0100
Subject: [PATCH 50/85] tweaks for clang

---
 libsharp/sharp_core.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 16979e4..00ff6b3 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -184,7 +184,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
     mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
     d->lam2[i] *= mfac;
     Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+    below_limit &= vallTrue(vlt(d->scale[i],limscale));
     }
 
   while (below_limit)
@@ -209,6 +209,28 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
   int l, int il, int lmax, int nv2)
   {
+  for (; l<=lmax-2; il+=2, l+=4)
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      d->p1r[i] += d->lam1[i]*ar3;
+      d->p1i[i] += d->lam1[i]*ai3;
+      d->p2r[i] += d->lam1[i]*ar4;
+      d->p2i[i] += d->lam1[i]*ai4;
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+      }
+    }
   for (; l<=lmax; ++il, l+=2)
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));

From 04fbe296b21834fbca146116562bde654dd1be7d Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 20 Dec 2018 13:34:48 +0100
Subject: [PATCH 51/85] cleanups and fixes

---
 libsharp/sharp_core.c      | 12 ++++------
 libsharp/sharp_testsuite.c | 49 ++++++++++++++++++++++----------------
 libsharp/sharp_ylmgen_c.c  | 19 ++++++++-------
 3 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 00ff6b3..b9489a9 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -219,15 +219,11 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->p2r[i] += d->lam2[i]*ar2;
-      d->p2i[i] += d->lam2[i]*ai2;
       d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      d->p1r[i] += d->lam1[i]*ar3;
-      d->p1i[i] += d->lam1[i]*ai3;
-      d->p2r[i] += d->lam1[i]*ar4;
-      d->p2i[i] += d->lam1[i]*ai4;
+      d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
+      d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
+      d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
+      d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
       d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
       }
     }
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index f712ae4..fdc842a 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -241,13 +241,14 @@ static int good_fft_size(int n)
   }
 
 static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
-  int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo)
+  int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo, int verbose)
   {
   UTIL_ASSERT(lmax>=0,"lmax must not be negative");
   if (*mmax<0) *mmax=lmax;
   UTIL_ASSERT(*mmax<=lmax,"mmax larger than lmax");
 
-  if (mytask==0) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);
+  verbose &= (mytask==0);
+  if (verbose) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);
 
   sharp_make_triangular_alm_info(lmax,*mmax,1,ainfo);
 #ifdef USE_MPI
@@ -259,14 +260,14 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
     if (*gpar1<1) *gpar1=lmax/2;
     if (*gpar1==0) ++(*gpar1);
     sharp_make_healpix_geom_info (*gpar1, 1, ginfo);
-    if (mytask==0) printf ("HEALPix grid, nside=%d\n",*gpar1);
+    if (verbose) printf ("HEALPix grid, nside=%d\n",*gpar1);
     }
   else if (strcmp(gname,"gauss")==0)
     {
     if (*gpar1<1) *gpar1=lmax+1;
     if (*gpar2<1) *gpar2=2*(*mmax)+1;
     sharp_make_gauss_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0)
+    if (verbose)
       printf ("Gauss-Legendre grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
     }
   else if (strcmp(gname,"fejer1")==0)
@@ -274,21 +275,21 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
     if (*gpar1<1) *gpar1=2*lmax+1;
     if (*gpar2<1) *gpar2=2*(*mmax)+1;
     sharp_make_fejer1_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    if (verbose) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
     }
   else if (strcmp(gname,"fejer2")==0)
     {
     if (*gpar1<1) *gpar1=2*lmax+1;
     if (*gpar2<1) *gpar2=2*(*mmax)+1;
     sharp_make_fejer2_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    if (verbose) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
     }
   else if (strcmp(gname,"cc")==0)
     {
     if (*gpar1<1) *gpar1=2*lmax+1;
     if (*gpar2<1) *gpar2=2*(*mmax)+1;
     sharp_make_cc_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0)
+    if (verbose)
       printf("Clenshaw-Curtis grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
     }
   else if (strcmp(gname,"smallgauss")==0)
@@ -318,7 +319,7 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
         ofs+=pring;
         }
       }
-    if (mytask==0)
+    if (verbose)
       {
       ptrdiff_t npix=get_npix(*ginfo);
       printf("Small Gauss grid, nlat=%d, npix=%ld, savings=%.2f%%\n",
@@ -485,6 +486,16 @@ static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
   DEALLOC(err_abs);
   }
 
+static void run(int lmax, int mmax, int nlat, int nlon, int spin)
+  {
+  sharp_geom_info *ginfo;
+  sharp_alm_info *ainfo;
+  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo, 0);
+  check_accuracy(ginfo,ainfo,spin);
+  sharp_destroy_alm_info(ainfo);
+  sharp_destroy_geom_info(ginfo);
+  }
+
 static void sharp_acctest(void)
   {
   if (mytask==0) sharp_module_startup("sharp_acctest",1,1,"",1);
@@ -495,17 +506,15 @@ static void sharp_acctest(void)
 
   if (mytask==0) printf("Testing map analysis accuracy.\n");
 
-  sharp_geom_info *ginfo;
-  sharp_alm_info *ainfo;
-  int lmax=127, mmax=127, nlat=128, nlon=256;
-  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
-  check_accuracy(ginfo,ainfo,0);
-  check_accuracy(ginfo,ainfo,1);
-  check_accuracy(ginfo,ainfo,2);
-  check_accuracy(ginfo,ainfo,3);
-  check_accuracy(ginfo,ainfo,30);
-  sharp_destroy_alm_info(ainfo);
-  sharp_destroy_geom_info(ginfo);
+  run(127, 127, 128, 256, 0);
+  run(127, 127, 128, 256, 1);
+  run(127, 127, 128, 256, 2);
+  run(127, 127, 128, 256, 3);
+  run(127, 127, 128, 256, 30);
+  run(5, 0, 6, 1, 0);
+  run(5, 0, 7, 2, 0);
+  run(8, 8, 9, 17, 0);
+  run(8, 8, 9, 17, 2);
   if (mytask==0) printf("Passed.\n\n");
   }
 
@@ -524,7 +533,7 @@ static void sharp_test (int argc, const char **argv)
 
   sharp_geom_info *ginfo;
   sharp_alm_info *ainfo;
-  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
+  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo, 1);
 
   int ncomp = (spin==0) ? 1 : 2;
   double t_a2m=1e30, t_m2a=1e30;
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index f0672b0..f9de49c 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -73,16 +73,16 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->mfac[0] = inv_sqrt4pi;
     for (int m=1; m<=gen->mmax; ++m)
       gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-    gen->root = RALLOC(double,2*gen->lmax+7);
-    gen->iroot = RALLOC(double,2*gen->lmax+7);
-    for (int m=0; m<2*gen->lmax+7; ++m)
+    gen->root = RALLOC(double,2*gen->lmax+8);
+    gen->iroot = RALLOC(double,2*gen->lmax+8);
+    for (int m=0; m<2*gen->lmax+8; ++m)
       {
       gen->root[m] = sqrt(m);
       gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
       }
-    gen->eps=RALLOC(double, gen->lmax+10);
-    gen->alpha=RALLOC(double, gen->lmax/2+10);
-    gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+10);
+    gen->eps=RALLOC(double, gen->lmax+4);
+    gen->alpha=RALLOC(double, gen->lmax/2+2);
+    gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2);
     }
   else
     {
@@ -161,15 +161,16 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
   if (gen->s==0)
     {
-    for (int l=m; l<gen->lmax+10; ++l)
+    gen->eps[m] = 0.;
+    for (int l=m+1; l<gen->lmax+4; ++l)
       gen->eps[l] = gen->root[l+m]*gen->root[l-m]
                    *gen->iroot[2*l+1]*gen->iroot[2*l-1];
     gen->alpha[0] = 1./gen->eps[m+1];
     gen->alpha[1] = gen->eps[m+1]/(gen->eps[m+2]*gen->eps[m+3]);
-    for (int il=1, l=m+2; l<gen->lmax+5; ++il, l+=2)
+    for (int il=1, l=m+2; l<gen->lmax+1; ++il, l+=2)
       gen->alpha[il+1]= ((il&1) ? -1 : 1)
                        /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
-    for (int il=0, l=m; l<gen->lmax+5; ++il, l+=2)
+    for (int il=0, l=m; l<gen->lmax+2; ++il, l+=2)
       {
       gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
       double t1 = gen->eps[l+2], t2 = gen->eps[l+1];

From 63c3066c2cbe0d31f91d0a7bc8494e379ca63475 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 11:29:43 +0100
Subject: [PATCH 52/85] AVX512

---
 libsharp/sharp_vecsupport.h |  3 ---
 libsharp/sharp_vecutil.h    | 10 +---------
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 942e290..914a899 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -112,9 +112,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #if (VLEN==4)
 
 #include <immintrin.h>
-#if (USE_FMA4)
-#include <x86intrin.h>
-#endif
 
 typedef __m256d Tv;
 typedef __m256d Tm;
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 522cc5f..e872bb3 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -34,7 +34,7 @@
 
 #ifndef VLEN
 
-#if (defined (__MIC__))
+#if (defined (__MIC__) || defined(__AVX512F__))
 #define VLEN 8
 #elif (defined (__AVX__))
 #define VLEN 4
@@ -46,12 +46,4 @@
 
 #endif
 
-#ifndef USE_FMA4
-#ifdef __FMA4__
-#define USE_FMA4 1
-#else
-#define USE_FMA4 0
-#endif
-#endif
-
 #endif

From 5adb6a7e38799d5a37b744926c450bfd7d2511dc Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 11:38:25 +0100
Subject: [PATCH 53/85] AVX512

---
 libsharp/sharp_vecsupport.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 914a899..ab18d0a 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -153,10 +153,10 @@ typedef __mmask8 Tm;
 #define vload(a) _mm512_set1_pd(a)
 #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
 #define vsqrt(a) _mm512_sqrt_pd(a)
-#define vlt(a,b) _mm512_cmplt_pd_mask(a,b)
-#define vgt(a,b) _mm512_cmpnle_pd_mask(a,b)
-#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
-#define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
+#define vlt(a,b) _mm512_cmp_pd_mask(a,b,CMP_LT_OQ)
+#define vgt(a,b) _mm512_cmp_pd_mask(a,b,CMP_GT_OQ)
+#define vge(a,b) _mm512_cmp_pd_mask(a,b,CMP_GE_OQ)
+#define vne(a,b) _mm512_cmp_pd_mask(a,b,CMP_NE_OQ)
 #define vand_mask(a,b) ((a)&(b))
 #define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)

From e4eb8f61ce064c750d67c5d7e02343cebbe41f3b Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 11:38:56 +0100
Subject: [PATCH 54/85] AVX512

---
 libsharp/sharp_vecsupport.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ab18d0a..6119d09 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -153,10 +153,10 @@ typedef __mmask8 Tm;
 #define vload(a) _mm512_set1_pd(a)
 #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
 #define vsqrt(a) _mm512_sqrt_pd(a)
-#define vlt(a,b) _mm512_cmp_pd_mask(a,b,CMP_LT_OQ)
-#define vgt(a,b) _mm512_cmp_pd_mask(a,b,CMP_GT_OQ)
-#define vge(a,b) _mm512_cmp_pd_mask(a,b,CMP_GE_OQ)
-#define vne(a,b) _mm512_cmp_pd_mask(a,b,CMP_NE_OQ)
+#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
+#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
+#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
+#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NE_OQ)
 #define vand_mask(a,b) ((a)&(b))
 #define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)

From 42b666dd429fc168ca6d2d7c67c1fd382aeae0cd Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 11:39:51 +0100
Subject: [PATCH 55/85] AVX512

---
 libsharp/sharp_vecsupport.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 6119d09..d5aa78b 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -148,6 +148,7 @@ typedef __m512d Tv;
 typedef __mmask8 Tm;
 
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
+#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
 #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
 #define vload(a) _mm512_set1_pd(a)

From 286732fabc2edfa4d6240a07c8ba07061a40b758 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 11:40:15 +0100
Subject: [PATCH 56/85] AVX512

---
 libsharp/sharp_vecsupport.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index d5aa78b..43200f8 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -157,7 +157,7 @@ typedef __mmask8 Tm;
 #define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
 #define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
-#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NE_OQ)
+#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) ((a)&(b))
 #define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)

From 51d393ff8b67e7f271ebea913d040e515269dad4 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 21 Dec 2018 12:01:13 +0100
Subject: [PATCH 57/85] AVX512

---
 libsharp/sharp_vecutil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index e872bb3..29485f7 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -34,7 +34,7 @@
 
 #ifndef VLEN
 
-#if (defined (__MIC__) || defined(__AVX512F__))
+#if (defined(__AVX512F__))
 #define VLEN 8
 #elif (defined (__AVX__))
 #define VLEN 4

From abf3b053d7c48668542bb6e22f10e64960378127 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 22 Dec 2018 11:57:33 +0100
Subject: [PATCH 58/85] remark

---
 libsharp/sharp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index bbb3872..d4a5d20 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -108,15 +108,16 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
       RESIZE (self->shiftarr,dcmplx,mmax+1);
       self->s_shift = mmax+1;
       self->phi0_ = phi0;
+// FIXME: improve this by using sincos2pibyn(nph) etc.
       for (int m=0; m<=mmax; ++m)
         self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
 //      double *tmp=(double *) self->shiftarr;
 //      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
       }
-  if (!self->plan) self->plan=make_rfft_plan(nph);
+//  if (!self->plan) self->plan=make_rfft_plan(nph);
   if (nph!=(int)self->length)
     {
-    destroy_rfft_plan(self->plan);
+    if (self->plan) destroy_rfft_plan(self->plan);
     self->plan=make_rfft_plan(nph);
     self->length=nph;
     }

From c89efbec62e49d2c888a1e68cd1fb1a5540c31da Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 8 Jan 2019 15:36:59 +0100
Subject: [PATCH 59/85] first try, not working

---
 libsharp/sharp_core.c     | 48 +++++++++++++++++----------------------
 libsharp/sharp_ylmgen_c.c | 19 ++++++++++++++++
 libsharp/sharp_ylmgen_c.h |  2 ++
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index b9489a9..80f6a87 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -392,7 +392,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fxx = gen->fxx;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
@@ -443,16 +443,14 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fxx[l+1].f[0]),fx11=vload(fxx[l+1].f[1]);
+    Tv fx20=vload(fxx[l+2].f[0]),fx21=vload(fxx[l+2].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
           rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
         below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
@@ -465,23 +463,21 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   }
 
 NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw1=d->l2p[i]+d->l2m[i];
       Tv lx2=d->l1m[i]-d->l1p[i];
       d->p1pr[i] += agr1*lw1 - aci2*lx2;
@@ -494,8 +490,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
       d->p2mi[i] += aci2*lw2 - agr1*lx1;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }
@@ -511,7 +507,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 28*nth;
 
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -524,10 +520,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -535,8 +529,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
       Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
       d->p1pr[i] += agr1*lw1 - aci2*lx2;
@@ -549,8 +543,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
       d->p2mi[i] += aci2*lw2 - agr1*lx1;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index f9de49c..cadcde7 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -88,6 +88,8 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
     ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
+ALLOC(gen->alpha,double,gen->lmax+3);
+ALLOC(gen->fxx,sharp_ylmgen_dbl2,gen->lmax+3);
     for (int m=0; m<gen->lmax+3; ++m)
       gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
     ALLOC(gen->inv,double,gen->lmax+2);
@@ -145,6 +147,8 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   else
     {
     DEALLOC(gen->fx);
+DEALLOC(gen->alpha);
+DEALLOC(gen->fxx);
     DEALLOC(gen->prefac);
     DEALLOC(gen->fscale);
     DEALLOC(gen->flm1);
@@ -199,6 +203,21 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
            *gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
         gen->fx[l+1].f[2]=t*l1*gen->inv[l];
         }
+// calculate alpha <=> index 3
+gen->alpha[gen->mhi]=gen->alpha[gen->mhi+1]=1.;
+for (int l=gen->mhi+2; l<gen->lmax; ++l)
+{
+  gen->alpha[l] = gen->alpha[l-2]*gen->fx[l+1].f[2];
+//  printf("%d %e %e\n", l, gen->fx[l].f[2], gen->alpha[l]);
+}
+gen->fxx[gen->mhi].f[0] = 0;
+gen->fxx[gen->mhi].f[0] = 0;
+for (int l=gen->mhi+1; l<gen->lmax+1; ++l)
+{
+  gen->fxx[l].f[0] = gen->fx[l].f[0]*gen->alpha[l-1]/gen->alpha[l];
+  gen->fxx[l].f[1] = gen->fx[l].f[1]*gen->fxx[l].f[0];
+}
+
       }
 
     gen->preMinus_p = gen->preMinus_m = 0;
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 487d207..0f8bfac 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -63,6 +63,8 @@ typedef struct
   double *prefac;
   int *fscale;
   sharp_ylmgen_dbl3 *fx;
+//double *alpha;
+sharp_ylmgen_dbl2 *fxx;
 
 /* internal usage only */
 /* used if s==0 */

From 766ef6a84879240d52e10f0ffbfef162af292465 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 9 Jan 2019 10:06:29 +0100
Subject: [PATCH 60/85] seems to work

---
 libsharp/sharp_core.c     | 100 +++++++++++++++++++++-----------------
 libsharp/sharp_ylmgen_c.c |  22 ++++-----
 2 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 80f6a87..ad3be7a 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -392,7 +392,8 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
-  const sharp_ylmgen_dbl2 * restrict fxx = gen->fxx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
+  const sharp_ylmgen_dbl3 * restrict fxo = gen->fx;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
@@ -443,8 +444,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv fx10=vload(fxx[l+1].f[0]),fx11=vload(fxx[l+1].f[1]);
-    Tv fx20=vload(fxx[l+2].f[0]),fx21=vload(fxx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@@ -503,9 +504,9 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
   iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 10*nth;
+  job->opcnt += (l-gen->mhi) * 7*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*nth;
+  job->opcnt += (lmax+1-l) * 25*nth;
 
   const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
   const dcmplx * restrict alm=job->almtmp;
@@ -567,29 +568,27 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   }
 
 NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw = d->l2p[i] + d->l2m[i];
       Tv lx = d->l2m[i] - d->l2p[i];
       agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;;
       agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
       acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
       aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       lw = d->l1p[i] + d->l1m[i];
       lx = d->l1m[i] - d->l1p[i];
       agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
@@ -609,11 +608,11 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
   iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 10*nth;
+  job->opcnt += (l-gen->mhi) * 7*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 28*nth;
+  job->opcnt += (lmax+1-l) * 25*nth;
 
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -626,25 +625,23 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
       Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
       agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
       agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
       acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
       aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
       lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
       agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
@@ -676,21 +673,19 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
 
 NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw=d->l2p[i]+d->l2m[i];
       d->p1pr[i] += ar1*lw;
       d->p1pi[i] += ai1*lw;
@@ -703,8 +698,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
       lx=d->l1m[i]-d->l1p[i];
       d->p1mr[i] += ai2*lx;
       d->p1mi[i] -= ar2*lx;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }
@@ -716,11 +711,11 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
   iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 10*nth;
+  job->opcnt += (l-gen->mhi) * 7*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 20*nth;
+  job->opcnt += (lmax+1-l) * 17*nth;
 
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -733,17 +728,15 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
-       fx12=vload(fx[l+1].f[2]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
-       fx22=vload(fx[l+2].f[2]);
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
-      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i];
       d->p1pr[i] += ar1*lw;
       d->p1pi[i] += ai1*lw;
@@ -756,8 +749,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
       lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
       d->p1mr[i] += ai2*lx;
       d->p1mi[i] -= ar2*lx;
-      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
-      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
         {
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
@@ -863,6 +856,17 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         }
       else
         {
+        //adjust the a_lm for the new algorithm
+        if (job->nalm==2)
+          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
+            {
+            job->almtmp[2*l  ]*=gen->alpha[l];
+            job->almtmp[2*l+1]*=gen->alpha[l];
+            }
+        else
+          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
+            job->almtmp[l]*=gen->alpha[l];
+
         const int nval=nvx*VLEN;
         int ith=0;
         int itgt[nval];
@@ -1037,6 +1041,12 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
             calc_map2alm_spin(job, gen, &d.v, nth);
             }
           }
+        //adjust the a_lm for the new algorithm
+        for (int l=gen->mhi; l<=gen->lmax; ++l)
+          {
+          job->almtmp[2*l  ]*=gen->alpha[l];
+          job->almtmp[2*l+1]*=gen->alpha[l];
+          }
         }
       break;
       }
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index cadcde7..e0dafa5 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -203,21 +203,21 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
            *gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
         gen->fx[l+1].f[2]=t*l1*gen->inv[l];
         }
-// calculate alpha <=> index 3
+for (int l=0; l<gen->lmax+3; ++l)
+  {gen->alpha[l] = gen->fxx[l].f[0] = gen->fxx[l].f[1] = 0;}
 gen->alpha[gen->mhi]=gen->alpha[gen->mhi+1]=1.;
-for (int l=gen->mhi+2; l<gen->lmax; ++l)
-{
-  gen->alpha[l] = gen->alpha[l-2]*gen->fx[l+1].f[2];
-//  printf("%d %e %e\n", l, gen->fx[l].f[2], gen->alpha[l]);
-}
+for (int l=gen->mhi+2; l<gen->lmax+1; ++l)
+  gen->alpha[l] = gen->alpha[l-2]*gen->fx[l].f[2];
+gen->alpha[gen->lmax+1] = gen->alpha[gen->lmax+2] = 0;
 gen->fxx[gen->mhi].f[0] = 0;
-gen->fxx[gen->mhi].f[0] = 0;
-for (int l=gen->mhi+1; l<gen->lmax+1; ++l)
+gen->fxx[gen->mhi].f[1] = 0;
+for (int l=gen->mhi; l<gen->lmax+1; ++l)
 {
-  gen->fxx[l].f[0] = gen->fx[l].f[0]*gen->alpha[l-1]/gen->alpha[l];
-  gen->fxx[l].f[1] = gen->fx[l].f[1]*gen->fxx[l].f[0];
+  gen->fxx[l+1].f[0] = gen->fx[l+1].f[0]*gen->alpha[l]/gen->alpha[l+1];
+  gen->fxx[l+1].f[1] = gen->fx[l+1].f[1]*gen->fxx[l+1].f[0];
 }
-
+for (int l=gen->lmax+1; l<gen->lmax+3; ++l)
+  gen->fxx[l].f[0] = gen->fxx[l].f[1] = 0.;
       }
 
     gen->preMinus_p = gen->preMinus_m = 0;

From 24359cdbe75c2dc199b4249045bc29be61f08ebb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 9 Jan 2019 11:18:15 +0100
Subject: [PATCH 61/85] cleanup

---
 libsharp/sharp_core.c     |  9 ++++-----
 libsharp/sharp_ylmgen_c.c | 39 +++++++++++++++------------------------
 libsharp/sharp_ylmgen_c.h |  8 ++++----
 3 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index ad3be7a..2b8c3af 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -392,8 +392,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
-  const sharp_ylmgen_dbl3 * restrict fxo = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
@@ -508,7 +507,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 25*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -612,7 +611,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 25*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -715,7 +714,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 17*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e0dafa5..5bea19e 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -87,11 +87,10 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   else
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
-ALLOC(gen->alpha,double,gen->lmax+3);
-ALLOC(gen->fxx,sharp_ylmgen_dbl2,gen->lmax+3);
+    ALLOC(gen->fx,sharp_ylmgen_dbl2,gen->lmax+3);
     for (int m=0; m<gen->lmax+3; ++m)
-      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
+      gen->fx[m].f[0]=gen->fx[m].f[1]=0.;
+    ALLOC(gen->alpha,double,gen->lmax+3);
     ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
     for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
@@ -147,8 +146,7 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   else
     {
     DEALLOC(gen->fx);
-DEALLOC(gen->alpha);
-DEALLOC(gen->fxx);
+    DEALLOC(gen->alpha);
     DEALLOC(gen->prefac);
     DEALLOC(gen->fscale);
     DEALLOC(gen->flm1);
@@ -191,33 +189,26 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
     if (!ms_similar)
       {
+      gen->alpha[gen->mhi] = 1.;
+      gen->fx[gen->mhi].f[0] = gen->fx[gen->mhi].f[1] = 0.;
       for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                   *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];
         double lt = 2*l+1;
         double l1 = l+1;
-        gen->fx[l+1].f[0]=l1*lt*t;
-        gen->fx[l+1].f[1]=gen->m*gen->s*gen->inv[l]*gen->inv[l+1];
+        double flp10=l1*lt*t;
+        double flp11=gen->m*gen->s*gen->inv[l]*gen->inv[l+1];
         t = gen->flm2[l+gen->m]*gen->flm2[l-gen->m]
            *gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
-        gen->fx[l+1].f[2]=t*l1*gen->inv[l];
+        double flp12=t*l1*gen->inv[l];
+        if (l>gen->mhi)
+          gen->alpha[l+1] = gen->alpha[l-1]*flp12;
+        else
+          gen->alpha[l+1] = 1.;
+        gen->fx[l+1].f[0] = flp10*gen->alpha[l]/gen->alpha[l+1];
+        gen->fx[l+1].f[1] = flp11*gen->fx[l+1].f[0];
         }
-for (int l=0; l<gen->lmax+3; ++l)
-  {gen->alpha[l] = gen->fxx[l].f[0] = gen->fxx[l].f[1] = 0;}
-gen->alpha[gen->mhi]=gen->alpha[gen->mhi+1]=1.;
-for (int l=gen->mhi+2; l<gen->lmax+1; ++l)
-  gen->alpha[l] = gen->alpha[l-2]*gen->fx[l].f[2];
-gen->alpha[gen->lmax+1] = gen->alpha[gen->lmax+2] = 0;
-gen->fxx[gen->mhi].f[0] = 0;
-gen->fxx[gen->mhi].f[1] = 0;
-for (int l=gen->mhi; l<gen->lmax+1; ++l)
-{
-  gen->fxx[l+1].f[0] = gen->fx[l+1].f[0]*gen->alpha[l]/gen->alpha[l+1];
-  gen->fxx[l+1].f[1] = gen->fx[l+1].f[1]*gen->fxx[l+1].f[0];
-}
-for (int l=gen->lmax+1; l<gen->lmax+3; ++l)
-  gen->fxx[l].f[0] = gen->fxx[l].f[1] = 0.;
       }
 
     gen->preMinus_p = gen->preMinus_m = 0;
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 0f8bfac..5fd7f93 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -54,17 +54,17 @@ typedef struct
 /* for public use; will typically change after call to Ylmgen_prepare() */
   int m;
 
+  double *alpha;
+
 /* used if s==0 */
-  double *mfac, *eps, *alpha;
+  double *mfac, *eps;
   sharp_ylmgen_dbl2 *ab;
 
 /* used if s!=0 */
   int sinPow, cosPow, preMinus_p, preMinus_m;
   double *prefac;
   int *fscale;
-  sharp_ylmgen_dbl3 *fx;
-//double *alpha;
-sharp_ylmgen_dbl2 *fxx;
+  sharp_ylmgen_dbl2 *fx;
 
 /* internal usage only */
 /* used if s==0 */

From 1f5874ecc004b25a6aa167a2f9cf2e8c277c3410 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 9 Jan 2019 15:53:01 +0100
Subject: [PATCH 62/85] streamlining

---
 libsharp/sharp_core.c       |  2 +-
 libsharp/sharp_vecsupport.h | 55 ++++++++++++++++++-------------------
 libsharp/sharp_ylmgen_c.c   |  2 +-
 libsharp/sharp_ylmgen_c.h   |  2 +-
 4 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2b8c3af..5bf74a8 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core.c
  *  Computational core
  *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 43200f8..ee09adf 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
  *  Convenience functions for vector arithmetics
  *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -42,11 +42,14 @@ typedef double Ts;
 typedef double Tv;
 typedef int Tm;
 
+#define vload(a) (a)
+#define vzero 0.
+#define vone 1.
+
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
 #define vneg(a) (-(a))
-#define vload(a) (a)
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -55,14 +58,10 @@ typedef int Tm;
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
-
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
-
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)
-#define vzero 0.
-#define vone 1.
 
 #endif
 
@@ -86,15 +85,15 @@ typedef __m128d Tm;
 static inline Tv vblend__(Tv m, Tv a, Tv b)
   { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
 #endif
-#define vzero _mm_setzero_pd()
-#define vone _mm_set1_pd(1.)
-
-#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
-#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
-#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
-#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
-#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
+#define vzero _mm_setzero_pd()
+#define vone vload(1.)
+
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm_xor_pd(vload(-0.),a)
+#define vabs(a) _mm_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
 #define vgt(a,b) _mm_cmpgt_pd(a,b)
@@ -117,15 +116,15 @@ typedef __m256d Tv;
 typedef __m256d Tm;
 
 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
-#define vzero _mm256_setzero_pd()
-#define vone _mm256_set1_pd(1.)
-
-#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
-#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
-#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
-#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
+#define vzero _mm256_setzero_pd()
+#define vone vload(1.)
+
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm256_xor_pd(vload(-0.),a)
+#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
@@ -147,12 +146,15 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;
 
+#define vload(a) _mm512_set1_pd(a)
+#define vzero _mm512_setzero_pd()
+#define vone vload(1.)
+
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
 #define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
-#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
-#define vload(a) _mm512_set1_pd(a)
-#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
+#define vneg(a) _mm512_mul_pd(a,vload(-1.))
+#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
 #define vsqrt(a) _mm512_sqrt_pd(a)
 #define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
@@ -165,9 +167,6 @@ typedef __mmask8 Tm;
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)
 
-#define vzero _mm512_setzero_pd()
-#define vone _mm512_set1_pd(1.)
-
 #endif
 
 #endif
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 5bea19e..e3c055b 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -25,7 +25,7 @@
 /*
  *  Helper code for efficient calculation of Y_lm(theta,phi=0)
  *
- *  Copyright (C) 2005-2016 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 5fd7f93..b1d9cbc 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
  *  Code for efficient calculation of Y_lm(phi=0,theta)
  *
- *  Copyright (C) 2005-2016 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 

From 48cafe5c1cd1c643a5a2ef0aa3d38c6d932a578f Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 10:19:25 +0100
Subject: [PATCH 63/85] performance tweaks

---
 libsharp/sharp_core.c | 56 ++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 5bf74a8..7307202 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -209,22 +209,46 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
   int l, int il, int lmax, int nv2)
   {
-  for (; l<=lmax-2; il+=2, l+=4)
+  if (nv2==nv0)
     {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-    Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
-    for (int i=0; i<nv2; ++i)
+    for (; l<=lmax-2; il+=2, l+=4)
       {
-      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
-      d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
-      d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
-      d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
-      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+      Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+      Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+      for (int i=0; i<nv0; ++i)
+        {
+        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+        d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
+        d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
+        d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
+        d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
+        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+        }
+      }
+    }
+  else
+    {
+    for (; l<=lmax-2; il+=2, l+=4)
+      {
+      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+      Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
+      Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+      for (int i=0; i<nv2; ++i)
+        {
+        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+        d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
+        d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
+        d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
+        d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
+        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+        }
       }
     }
   for (; l<=lmax; ++il, l+=2)
@@ -486,12 +510,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p1mi[i] += aci1*lw1 - agr2*lx2;
       Tv lx1=d->l2m[i]-d->l2p[i];
       Tv lw2=d->l1p[i]+d->l1m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       d->p2pr[i] += agr2*lw2 - aci1*lx1;
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
       d->p2mi[i] += aci2*lw2 - agr1*lx1;
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }

From ecd6c1b48bb29358e0ce12e74f284934c2082520 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 10:30:07 +0100
Subject: [PATCH 64/85] cleanup

---
 Makefile.am                    |  1 -
 libsharp/sharp_complex_hacks.h | 61 ----------------------------------
 libsharp/sharp_vecsupport.h    | 15 ++++++++-
 libsharp/sharp_vecutil.h       | 49 ---------------------------
 libsharp/sharp_ylmgen_c.h      |  1 -
 5 files changed, 14 insertions(+), 113 deletions(-)
 delete mode 100644 libsharp/sharp_vecutil.h

diff --git a/Makefile.am b/Makefile.am
index 6370d95..163fcd0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,7 +19,6 @@ src_sharp = \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
   libsharp/sharp_vecsupport.h \
-  libsharp/sharp_vecutil.h \
   libsharp/sharp_ylmgen_c.h
 
 include_HEADERS = \
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
index 6ec27bb..d50eabe 100644
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@@ -39,13 +39,6 @@
 
 #if (VLEN==1)
 
-static inline _Complex double vhsum_cmplx(Tv a, Tv b)
-  { return a+_Complex_I*b; }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict c1, _Complex double * restrict c2)
-  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
-
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
   _Complex double * restrict cc)
   { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
@@ -54,18 +47,6 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 
 #if (VLEN==2)
 
-static inline _Complex double vhsum_cmplx (Tv a, Tv b)
-  {
-#if defined(__SSE3__)
-  Tv tmp = _mm_hadd_pd(a,b);
-#else
-  Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-           _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-#endif
-  union {Tv v; _Complex double c; } u;
-  u.v=tmp; return u.c;
-  }
-
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
   Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
   {
@@ -101,38 +82,6 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 
 #if (VLEN==4)
 
-static inline _Complex double vhsum_cmplx (Tv a, Tv b)
-  {
-  Tv tmp=_mm256_hadd_pd(a,b);
-  Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
-  tmp=_mm256_add_pd(tmp,tmp2);
-#ifdef UNSAFE_CODE
-  _Complex double ret;
-  *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
-  return ret;
-#else
-  union {Tv v; _Complex double c[2]; } u;
-  u.v=tmp; return u.c[0];
-#endif
-  }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict c1, _Complex double * restrict c2)
-  {
-  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
-  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
-     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=tmp3+tmp4;
-#ifdef UNSAFE_CODE
-  *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
-  *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
-#else
-  union {Tv v; _Complex double c[2]; } u;
-  u.v=tmp1;
-  *c1+=u.c[0]; *c2+=u.c[1];
-#endif
-  }
-
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
   _Complex double * restrict cc)
   {
@@ -154,16 +103,6 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 
 #if (VLEN==8)
 
-static inline _Complex double vhsum_cmplx(Tv a, Tv b)
-  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict c1, _Complex double * restrict c2)
-  {
-  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
-  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
-  }
-
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
   _Complex double * restrict cc)
   { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ee09adf..b70143d 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -33,7 +33,20 @@
 #define SHARP_VECSUPPORT_H
 
 #include <math.h>
-#include "sharp_vecutil.h"
+
+#ifndef VLEN
+
+#if (defined(__AVX512F__))
+#define VLEN 8
+#elif (defined (__AVX__))
+#define VLEN 4
+#elif (defined (__SSE2__))
+#define VLEN 2
+#else
+#define VLEN 1
+#endif
+
+#endif
 
 typedef double Ts;
 
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
deleted file mode 100644
index 29485f7..0000000
--- a/libsharp/sharp_vecutil.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_vecutil.h
- *  Functionality related to vector instruction support
- *
- *  Copyright (C) 2012-2018 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef SHARP_VECUTIL_H
-#define SHARP_VECUTIL_H
-
-#ifndef VLEN
-
-#if (defined(__AVX512F__))
-#define VLEN 8
-#elif (defined (__AVX__))
-#define VLEN 4
-#elif (defined (__SSE2__))
-#define VLEN 2
-#else
-#define VLEN 1
-#endif
-
-#endif
-
-#endif
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index b1d9cbc..cc9260f 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -42,7 +42,6 @@ static const double sharp_ftol=0x1p-60;
 static const double sharp_fbighalf=0x1p+400;
 
 typedef struct { double f[2]; } sharp_ylmgen_dbl2;
-typedef struct { double f[3]; } sharp_ylmgen_dbl3;
 
 typedef struct
   {

From c9684732b84e39c87fc21288049dc0b46bf128e6 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 13:30:29 +0100
Subject: [PATCH 65/85] cleanup

---
 Makefile.am                    |   1 -
 libsharp/sharp.h               |   4 +-
 libsharp/sharp_almhelpers.h    |   2 +-
 libsharp/sharp_complex_hacks.h | 112 ---------------------------------
 libsharp/sharp_core.c          |   1 -
 libsharp/sharp_vecsupport.h    |  39 ++++++++++++
 6 files changed, 41 insertions(+), 118 deletions(-)
 delete mode 100644 libsharp/sharp_complex_hacks.h

diff --git a/Makefile.am b/Makefile.am
index 163fcd0..26b41ad 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -15,7 +15,6 @@ src_sharp = \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
   libsharp/sharp_announce.h \
-  libsharp/sharp_complex_hacks.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
   libsharp/sharp_vecsupport.h \
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 35a0cb5..ef9cafb 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -25,7 +25,7 @@
 /*! \file sharp.h
  *  Portable interface for the spherical transform library.
  *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -259,8 +259,6 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
   const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt);
 
-
-
 /*! \} */
 
 #ifdef __cplusplus
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index c17028a..06bee8f 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
  *  SHARP helper function for the creation of a_lm data structures
  *
- *  Copyright (C) 2008-2016 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
deleted file mode 100644
index d50eabe..0000000
--- a/libsharp/sharp_complex_hacks.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*  \file sharp_complex_hacks.h
- *  support for converting vector types and complex numbers
- *
- *  Copyright (C) 2012-2018 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#ifndef SHARP_COMPLEX_HACKS_H
-#define SHARP_COMPLEX_HACKS_H
-
-#include <math.h>
-#include "sharp_vecsupport.h"
-
-#define UNSAFE_CODE
-
-#if (VLEN==1)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
-
-#endif
-
-#if (VLEN==2)
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
-  {
-#ifdef UNSAFE_CODE
-#if defined(__SSE3__)
-  *((__m128d *)c1) += _mm_hadd_pd(a,b);
-  *((__m128d *)c2) += _mm_hadd_pd(c,d);
-#else
-  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
-#endif
-#else
-  union {Tv v; _Complex double c; } u1, u2;
-#if defined(__SSE3__)
-  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
-#else
-  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
-#endif
-  *c1+=u1.c; *c2+=u2.c;
-#endif
-  }
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
-
-#endif
-
-#if (VLEN==4)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  {
-  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
-  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
-     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=tmp3+tmp4;
-#ifdef UNSAFE_CODE
-  _mm256_storeu_pd((double *)cc,
-    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
-#else
-  union {Tv v; _Complex double c[2]; } u;
-  u.v=tmp1;
-  cc[0]+=u.c[0]; cc[1]+=u.c[1];
-#endif
-  }
-
-#endif
-
-#if (VLEN==8)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
-
-#endif
-
-#endif
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7307202..b619ed3 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -33,7 +33,6 @@
 #include <math.h>
 #include <string.h>
 #include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
 #include "sharp.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index b70143d..e4bfc4f 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -76,6 +76,11 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
+
+
 #endif
 
 #if (VLEN==2)
@@ -119,6 +124,21 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
+  Tv d, _Complex double * restrict cc)
+  {
+  union {Tv v; _Complex double c; } u1, u2;
+#if defined(__SSE3__)
+  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
+#else
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
+#endif
+  cc[0]+=u1.c; cc[1]+=u2.c;
+  }
+
 #endif
 
 #if (VLEN==4)
@@ -150,6 +170,18 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  {
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
+  tmp1=tmp3+tmp4;
+  union {Tv v; _Complex double c[2]; } u;
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1];
+  }
+
 #endif
 
 #if (VLEN==8)
@@ -180,6 +212,13 @@ typedef __mmask8 Tm;
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  {
+  cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
+  cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
+  }
+
 #endif
 
 #endif

From ef2907f050ec5287065c8f5362add7f6036fa578 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 13:59:36 +0100
Subject: [PATCH 66/85] cleanup

---
 Makefile.am                     |  2 -
 libsharp/sharp.c                |  3 +-
 libsharp/sharp.h                | 17 +++---
 libsharp/sharp_announce.c       | 98 ---------------------------------
 libsharp/sharp_announce.h       | 39 -------------
 libsharp/sharp_core.c           |  4 +-
 libsharp/sharp_cxx.h            | 26 ++++-----
 libsharp/sharp_geomhelpers.h    |  2 +-
 libsharp/sharp_internal.h       |  4 +-
 libsharp/sharp_legendre_roots.h |  2 +-
 libsharp/sharp_mpi.c            |  2 +-
 libsharp/sharp_mpi.h            | 17 +++---
 libsharp/sharp_testsuite.c      | 58 ++++++++++++++++++-
 13 files changed, 91 insertions(+), 183 deletions(-)
 delete mode 100644 libsharp/sharp_announce.c
 delete mode 100644 libsharp/sharp_announce.h

diff --git a/Makefile.am b/Makefile.am
index 26b41ad..2649be1 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -9,12 +9,10 @@ src_sharp = \
   pocketfft/pocketfft.h \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
-  libsharp/sharp_announce.c \
   libsharp/sharp_core.c \
   libsharp/sharp_geomhelpers.c \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
-  libsharp/sharp_announce.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
   libsharp/sharp_vecsupport.h \
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index d4a5d20..456b69c 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -853,7 +853,8 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
   init_output (job);
 
   int nchunks, chunksize;
-  get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(),&nchunks,&chunksize);
+  get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(job->spin),
+                 &nchunks,&chunksize);
 //FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
 
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index ef9cafb..0e43ba9 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -29,8 +29,8 @@
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_LOWLEVEL_H
-#define PLANCK_SHARP_LOWLEVEL_H
+#ifndef PLANCK_SHARP_H
+#define PLANCK_SHARP_H
 
 #include <stddef.h>
 
@@ -207,16 +207,13 @@ typedef enum { SHARP_DP              = 1<<4,
   \param type the type of SHT
   \param spin the spin of the quantities to be transformed
   \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
+    alm[0] points to the a_lm of the SHT. If \a spin>0, alm[0] and alm[1]
+    point to the two a_lm sets of the SHT. The exact data type of \a alm
     depends on whether the SHARP_DP flag is set.
   \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
+    map[0] points to the map of the SHT. If \a spin>0, or \a type is
+    SHARP_ALM2MAP_DERIV1, map[0] and map[1] point to the two maps of the SHT.
+    The exact data type of \a map depends on whether the SHARP_DP flag is set.
   \param geom_info A \c sharp_geom_info object compatible with the provided
     \a map arrays.
   \param alm_info A \c sharp_alm_info object compatible with the provided
diff --git a/libsharp/sharp_announce.c b/libsharp/sharp_announce.c
deleted file mode 100644
index a028258..0000000
--- a/libsharp/sharp_announce.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_announce.c
- *  Banner for module startup
- *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#ifdef USE_MPI
-#include <mpi.h>
-#endif
-
-#include "sharp_announce.h"
-#include "sharp_internal.h"
-
-static void OpenMP_status(void)
-  {
-#ifndef _OPENMP
-  printf("OpenMP: not supported by this binary\n");
-#else
-  int threads = omp_get_max_threads();
-  if (threads>1)
-    printf("OpenMP active: max. %d threads.\n",threads);
-  else
-    printf("OpenMP active, but running with 1 thread only.\n");
-#endif
-  }
-
-static void MPI_status(void)
-  {
-#ifndef USE_MPI
-  printf("MPI: not supported by this binary\n");
-#else
-  int tasks;
-  MPI_Comm_size(MPI_COMM_WORLD,&tasks);
-  if (tasks>1)
-    printf("MPI active with %d tasks.\n",tasks);
-  else
-    printf("MPI active, but running with 1 task only.\n");
-#endif
-  }
-
-static void vecmath_status(void)
-  { printf("Supported vector length: %d\n",sharp_veclen()); }
-
-void sharp_announce (const char *name)
-  {
-  size_t m, nlen=strlen(name);
-  printf("\n+-");
-  for (m=0; m<nlen; ++m) printf("-");
-  printf("-+\n");
-  printf("| %s |\n", name);
-  printf("+-");
-  for (m=0; m<nlen; ++m) printf("-");
-  printf("-+\n\n");
-  vecmath_status();
-  OpenMP_status();
-  MPI_status();
-  printf("\n");
-  }
-
-void sharp_module_startup (const char *name, int argc, int argc_expected,
-  const char *argv_expected, int verbose)
-  {
-  if (verbose) sharp_announce (name);
-  if (argc==argc_expected) return;
-  if (verbose) fprintf(stderr, "Usage: %s %s\n", name, argv_expected);
-  exit(1);
-  }
diff --git a/libsharp/sharp_announce.h b/libsharp/sharp_announce.h
deleted file mode 100644
index e446d37..0000000
--- a/libsharp/sharp_announce.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_announce.h
- *  Banner for module startup
- *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef SHARP_ANNOUNCE_H
-#define SHARP_ANNOUNCE_H
-
-void sharp_announce (const char *name);
-void sharp_module_startup (const char *name, int argc, int argc_expected,
-  const char *argv_expected, int verbose);
-
-#endif
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index b619ed3..8d17a4d 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -1096,7 +1096,7 @@ int sharp_veclen(void)
   return VLEN;
   }
 
-int sharp_max_nvec(void)
+int sharp_max_nvec(int spin) // vector count for the given spin; see return below
   {
-  return nv0;
+  return (spin==0) ? nv0 : nvx; // nv0 for spin 0, nvx for every other spin
   }
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index 6d5a6e4..049d89e 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -25,7 +25,7 @@
 /*! \file sharp_cxx.h
  *  Spherical transform library
  *
- *  Copyright (C) 2012-2017 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -120,15 +120,13 @@ template<typename T> class sharp_cxxjob: public sharp_base
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
-        flags,0,0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, flags, 0, 0);
       }
     void alm2map (const std::complex<T> *alm, T *map, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
-        flags,0,0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, flags, 0, 0);
       }
     void alm2map_spin (const T *alm1, const T *alm2,
       T *map1, T *map2, int spin, bool add) const
@@ -137,7 +135,7 @@ template<typename T> class sharp_cxxjob: public sharp_base
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,flags, 0, 0);
       }
     void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
       T *map1, T *map2, int spin, bool add) const
@@ -146,14 +144,14 @@ template<typename T> class sharp_cxxjob: public sharp_base
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP, spin, aptr, mptr, ginfo, ainfo, flags,0,0);
       }
     void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
       {
       void *aptr=conv(alm), *mptr[2];
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,flags,0,0);
       }
     void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
       const
@@ -161,7 +159,7 @@ template<typename T> class sharp_cxxjob: public sharp_base
       void *aptr=conv(alm), *mptr[2];
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,flags,0,0);
       }
     void alm2map_adjoint (const T *map, T *alm, bool add) const
       {
@@ -173,19 +171,19 @@ template<typename T> class sharp_cxxjob: public sharp_base
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
       }
     void map2alm (const T *map, T *alm, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
       }
     void map2alm (const T *map, std::complex<T> *alm, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
       }
     void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
       int spin, bool add) const
@@ -194,7 +192,7 @@ template<typename T> class sharp_cxxjob: public sharp_base
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,flags,0,0);
       }
     void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
       std::complex<T> *alm2, int spin, bool add) const
@@ -203,7 +201,7 @@ template<typename T> class sharp_cxxjob: public sharp_base
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,flags,0,0);
       }
   };
 
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
index 1c77e27..a59af9b 100644
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@@ -25,7 +25,7 @@
 /*! \file sharp_geomhelpers.h
  *  SHARP helper function for the creation of grid geometries
  *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 635aeb8..4c67cc8 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -25,7 +25,7 @@
 /*! \file sharp_internal.h
  *  Internally used functionality for the spherical transform library.
  *
- *  Copyright (C) 2006-2018 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -67,6 +67,6 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const int *mlim);
 
 int sharp_veclen(void);
-int sharp_max_nvec(void);
+int sharp_max_nvec(int spin);
 
 #endif
diff --git a/libsharp/sharp_legendre_roots.h b/libsharp/sharp_legendre_roots.h
index 2a056b2..19f8ec8 100644
--- a/libsharp/sharp_legendre_roots.h
+++ b/libsharp/sharp_legendre_roots.h
@@ -24,7 +24,7 @@
 
 /*! \file sharp_legendre_roots.h
  *
- *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
index b23409a..a90ec69 100644
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@@ -25,7 +25,7 @@
 /*! \file sharp_mpi.c
  *  Functionality only needed for MPI-parallel transforms
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index 73a8aa0..5de37bf 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -25,7 +25,7 @@
 /*! \file sharp_mpi.h
  *  Interface for the spherical transform library with MPI support.
  *
- *  Copyright (C) 2011,2012 Max-Planck-Society
+ *  Copyright (C) 2011-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -40,21 +40,18 @@ extern "C" {
 #endif
 
 /*! Performs an MPI parallel libsharp SHT job. The interface deliberately does
-  not use the C99 "complex" data type, in order to be callable from C.
+  not use the C99 "complex" data type, in order to be callable from C89 and C++.
   \param comm the MPI communicator to be used for this SHT
   \param type the type of SHT
   \param spin the spin of the quantities to be transformed
   \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
+    alm[0] points to the a_lm of the SHT. If \a spin>0, alm[0] and alm[1]
+    point to the two a_lm sets of the SHT. The exact data type of \a alm
     depends on whether the SHARP_DP flag is set.
   \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
+    map[0] points to the map of the SHT. If \a spin>0, or \a type is
+    SHARP_ALM2MAP_DERIV1, map[0] and map[1] point to the two maps of the SHT.
+    The exact data type of \a map depends on whether the SHARP_DP flag is set.
   \param geom_info A \c sharp_geom_info object compatible with the provided
     \a map arrays. The total map geometry is the union of all \a geom_info
     objects over the participating MPI tasks.
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index fdc842a..1a05bdc 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -24,7 +24,7 @@
 
 /*  \file sharp_testsuite.c
  *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -42,9 +42,63 @@
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
 #include "c_utils.h"
-#include "sharp_announce.h"
 #include "memusage.h"
 
+static void OpenMP_status(void) // print one line describing OpenMP availability
+  {
+#ifndef _OPENMP
+  printf("OpenMP: not supported by this binary\n");
+#else
+  int threads = omp_get_max_threads(); // value reported by the OpenMP runtime
+  if (threads>1)
+    printf("OpenMP active: max. %d threads.\n",threads);
+  else
+    printf("OpenMP active, but running with 1 thread only.\n");
+#endif
+  }
+
+static void MPI_status(void) // print one line describing MPI availability
+  {
+#ifndef USE_MPI
+  printf("MPI: not supported by this binary\n");
+#else
+  int tasks;
+  MPI_Comm_size(MPI_COMM_WORLD,&tasks); // size of the MPI_COMM_WORLD communicator
+  if (tasks>1)
+    printf("MPI active with %d tasks.\n",tasks);
+  else
+    printf("MPI active, but running with 1 task only.\n");
+#endif
+  }
+
+static void vecmath_status(void) // print the value returned by sharp_veclen()
+  { printf("Supported vector length: %d\n",sharp_veclen()); }
+
+static void sharp_announce (const char *name) // print boxed banner, then status lines
+  {
+  size_t m, nlen=strlen(name);
+  printf("\n+-"); // top border: one '-' per character of name
+  for (m=0; m<nlen; ++m) printf("-");
+  printf("-+\n");
+  printf("| %s |\n", name);
+  printf("+-"); // bottom border, same width as the top one
+  for (m=0; m<nlen; ++m) printf("-");
+  printf("-+\n\n");
+  vecmath_status(); // status lines: vector length, OpenMP, MPI
+  OpenMP_status();
+  MPI_status();
+  printf("\n");
+  }
+
+static void sharp_module_startup (const char *name, int argc, int argc_expected,
+  const char *argv_expected, int verbose) // optional banner, then argc check
+  {
+  if (verbose) sharp_announce (name);
+  if (argc==argc_expected) return; // argument count matches: nothing more to do
+  if (verbose) fprintf(stderr, "Usage: %s %s\n", name, argv_expected); // no message if !verbose
+  exit(1); // any argc mismatch terminates the program
+  }
+
 typedef complex double dcmplx;
 
 int ntasks, mytask;

From 4d91dd88889ef25bc3d91c517cc8dca59807fda5 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 15:33:24 +0100
Subject: [PATCH 67/85] temporary commit for future reference

---
 libsharp/sharp_core.c | 129 +++++++++++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 33 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 8d17a4d..c428702 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -220,11 +220,15 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
       for (int i=0; i<nv0; ++i)
         {
+        d->p1r[i] += d->lam2[i]*ar1;
+        d->p1i[i] += d->lam2[i]*ai1;
+        d->p2r[i] += d->lam2[i]*ar2;
+        d->p2i[i] += d->lam2[i]*ai2;
         d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-        d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
-        d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
-        d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
-        d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
+        d->p1r[i] += d->lam1[i]*ar3;
+        d->p1i[i] += d->lam1[i]*ai3;
+        d->p2r[i] += d->lam1[i]*ar4;
+        d->p2i[i] += d->lam1[i]*ai4;
         d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
         }
       }
@@ -241,11 +245,15 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
       for (int i=0; i<nv2; ++i)
         {
+        d->p1r[i] += d->lam2[i]*ar1;
+        d->p1i[i] += d->lam2[i]*ai1;
+        d->p2r[i] += d->lam2[i]*ar2;
+        d->p2i[i] += d->lam2[i]*ai2;
         d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-        d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3;
-        d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3;
-        d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4;
-        d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4;
+        d->p1r[i] += d->lam1[i]*ar3;
+        d->p1i[i] += d->lam1[i]*ai3;
+        d->p2r[i] += d->lam1[i]*ar4;
+        d->p2i[i] += d->lam1[i]*ai4;
         d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
         }
       }
@@ -501,20 +509,56 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw1=d->l2p[i]+d->l2m[i];
-      Tv lx2=d->l1m[i]-d->l1p[i];
-      d->p1pr[i] += agr1*lw1 - aci2*lx2;
-      d->p1pi[i] += agi1*lw1 + acr2*lx2;
-      d->p1mr[i] += acr1*lw1 + agi2*lx2;
-      d->p1mi[i] += aci1*lw1 - agr2*lx2;
-      Tv lx1=d->l2m[i]-d->l2p[i];
-      Tv lw2=d->l1p[i]+d->l1m[i];
+
+// p1pr = a + b - c + d
+// p2mi = a + b + c - d
+//
+// p1pi = a - b + c + d
+// p2mr =-a + b + c + d
+//
+// p1mr = a - b + c + d
+// p2pi =-a + b + c + d
+//
+// p1mi = a + b - c + d
+// p2pr = a + b + c - d
+      d->p1pr[i] += agr1*d->l2p[i];
+      d->p1pi[i] += agi1*d->l2p[i];
+      d->p1mr[i] += acr1*d->l2p[i];
+      d->p1mi[i] += aci1*d->l2p[i];
+//      d->p2pr[i] += aci1*d->l2p[i];
+//      d->p2pi[i] -= acr1*d->l2p[i];
+//      d->p2mr[i] -= agi1*d->l2p[i];
+//      d->p2mi[i] += agr1*d->l2p[i];
+
+      d->p1pr[i] += aci2*d->l1p[i];
+      d->p1pi[i] -= acr2*d->l1p[i];
+      d->p1mr[i] -= agi2*d->l1p[i];
+      d->p1mi[i] += agr2*d->l1p[i];
+//      d->p2pr[i] += agr2*d->l1p[i];
+//      d->p2pi[i] += agi2*d->l1p[i];
+//      d->p2mr[i] += acr2*d->l1p[i];
+//      d->p2mi[i] += aci2*d->l1p[i];
+
+//      d->p1pr[i] -= aci2*d->l1m[i];
+//      d->p1pi[i] += acr2*d->l1m[i];
+//      d->p1mr[i] += agi2*d->l1m[i];
+//      d->p1mi[i] -= agr2*d->l1m[i];
+      d->p2pr[i] += agr2*d->l1m[i];
+      d->p2pi[i] += agi2*d->l1m[i];
+      d->p2mr[i] += acr2*d->l1m[i];
+      d->p2mi[i] += aci2*d->l1m[i];
+
+//      d->p1pr[i] += agr1*d->l2m[i];
+//      d->p1pi[i] += agi1*d->l2m[i];
+//      d->p1mr[i] += acr1*d->l2m[i];
+//      d->p1mi[i] += aci1*d->l2m[i];
+      d->p2pr[i] -= aci1*d->l2m[i];
+      d->p2pi[i] += acr1*d->l2m[i];
+      d->p2mr[i] += agi1*d->l2m[i];
+      d->p2mi[i] -= agr1*d->l2m[i];
+
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      d->p2pr[i] += agr2*lw2 - aci1*lx1;
-      d->p2pi[i] += agi2*lw2 + acr1*lx1;
-      d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] += aci2*lw2 - agr1*lx1;
       }
     l+=2;
     }
@@ -528,7 +572,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   iter_to_ieee_spin(gen, d, &l, nv2);
   job->opcnt += (l-gen->mhi) * 7*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 25*nth;
+  job->opcnt += (lmax+1-l) * 23*nth;
 
   const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   const dcmplx * restrict alm=job->almtmp;
@@ -554,18 +598,28 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
-      Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
-      d->p1pr[i] += agr1*lw1 - aci2*lx2;
-      d->p1pi[i] += agi1*lw1 + acr2*lx2;
-      d->p1mr[i] += acr1*lw1 + agi2*lx2;
-      d->p1mi[i] += aci1*lw1 - agr2*lx2;
-      Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
-      Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
-      d->p2pr[i] += agr2*lw2 - aci1*lx1;
-      d->p2pi[i] += agi2*lw2 + acr1*lx1;
-      d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] += aci2*lw2 - agr1*lx1;
+
+      Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i];
+      Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
+      d->p1pr[i] += agr1*l2p;
+      d->p1pi[i] += agi1*l2p;
+      d->p1mr[i] += acr1*l2p;
+      d->p1mi[i] += aci1*l2p;
+
+      d->p1pr[i] += aci2*l1p;
+      d->p1pi[i] -= acr2*l1p;
+      d->p1mr[i] -= agi2*l1p;
+      d->p1mi[i] += agr2*l1p;
+
+      d->p2pr[i] += agr2*l1m;
+      d->p2pi[i] += agi2*l1m;
+      d->p2mr[i] += acr2*l1m;
+      d->p2mi[i] += aci2*l1m;
+
+      d->p2pr[i] -= aci1*l2m;
+      d->p2pi[i] += acr1*l2m;
+      d->p2mr[i] += agi1*l2m;
+      d->p2mi[i] -= agr1*l2m;
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
@@ -587,6 +641,15 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
     d->l2m[i] *= d->cfm[i];
     }
   alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
+
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv tmp;
+    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
+    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
+    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
+    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
+    }
   }
 
 NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,

From 721a8e3b9304299724f228ece9ad9fb0ad2caf4f Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 15:35:23 +0100
Subject: [PATCH 68/85] temporary commit for future reference

---
 libsharp/sharp_core.c | 31 ++++++++-----------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index c428702..4a6967e 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -507,8 +507,6 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nv2; ++i)
       {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
 
 // p1pr = a + b - c + d
 // p2mi = a + b + c - d
@@ -525,38 +523,25 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p1pi[i] += agi1*d->l2p[i];
       d->p1mr[i] += acr1*d->l2p[i];
       d->p1mi[i] += aci1*d->l2p[i];
-//      d->p2pr[i] += aci1*d->l2p[i];
-//      d->p2pi[i] -= acr1*d->l2p[i];
-//      d->p2mr[i] -= agi1*d->l2p[i];
-//      d->p2mi[i] += agr1*d->l2p[i];
+
+      d->p2pr[i] -= aci1*d->l2m[i];
+      d->p2pi[i] += acr1*d->l2m[i];
+      d->p2mr[i] += agi1*d->l2m[i];
+      d->p2mi[i] -= agr1*d->l2m[i];
+
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
 
       d->p1pr[i] += aci2*d->l1p[i];
       d->p1pi[i] -= acr2*d->l1p[i];
       d->p1mr[i] -= agi2*d->l1p[i];
       d->p1mi[i] += agr2*d->l1p[i];
-//      d->p2pr[i] += agr2*d->l1p[i];
-//      d->p2pi[i] += agi2*d->l1p[i];
-//      d->p2mr[i] += acr2*d->l1p[i];
-//      d->p2mi[i] += aci2*d->l1p[i];
 
-//      d->p1pr[i] -= aci2*d->l1m[i];
-//      d->p1pi[i] += acr2*d->l1m[i];
-//      d->p1mr[i] += agi2*d->l1m[i];
-//      d->p1mi[i] -= agr2*d->l1m[i];
       d->p2pr[i] += agr2*d->l1m[i];
       d->p2pi[i] += agi2*d->l1m[i];
       d->p2mr[i] += acr2*d->l1m[i];
       d->p2mi[i] += aci2*d->l1m[i];
 
-//      d->p1pr[i] += agr1*d->l2m[i];
-//      d->p1pi[i] += agi1*d->l2m[i];
-//      d->p1mr[i] += acr1*d->l2m[i];
-//      d->p1mi[i] += aci1*d->l2m[i];
-      d->p2pr[i] -= aci1*d->l2m[i];
-      d->p2pi[i] += acr1*d->l2m[i];
-      d->p2mr[i] += agi1*d->l2m[i];
-      d->p2mi[i] -= agr1*d->l2m[i];
-
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }

From 14955e74f1010a89e817cfcf6dd26bfc2df9fac7 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 15:35:58 +0100
Subject: [PATCH 69/85] temporary commit for future reference

---
 libsharp/sharp_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 4a6967e..f8bc16d 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -531,6 +531,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
 
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
 
       d->p1pr[i] += aci2*d->l1p[i];
       d->p1pi[i] -= acr2*d->l1p[i];
@@ -542,8 +544,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p2mr[i] += acr2*d->l1m[i];
       d->p2mi[i] += aci2*d->l1m[i];
 
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+//      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+//      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }

From 0a0a4e2b61e960c01fa27a3082608627ca655f98 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 16:46:52 +0100
Subject: [PATCH 70/85] crazy hacks; still broken

---
 libsharp/sharp_core.c | 157 +++++++++++++++++++++++++++++-------------
 1 file changed, 108 insertions(+), 49 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f8bc16d..58efdc0 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -497,6 +497,7 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
+  int lsave = l;
   while (l<=lmax)
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -507,45 +508,44 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nv2; ++i)
       {
-
-// p1pr = a + b - c + d
-// p2mi = a + b + c - d
-//
-// p1pi = a - b + c + d
-// p2mr =-a + b + c + d
-//
-// p1mr = a - b + c + d
-// p2pi =-a + b + c + d
-//
-// p1mi = a + b - c + d
-// p2pr = a + b + c - d
       d->p1pr[i] += agr1*d->l2p[i];
       d->p1pi[i] += agi1*d->l2p[i];
       d->p1mr[i] += acr1*d->l2p[i];
       d->p1mi[i] += aci1*d->l2p[i];
 
-      d->p2pr[i] -= aci1*d->l2m[i];
-      d->p2pi[i] += acr1*d->l2m[i];
-      d->p2mr[i] += agi1*d->l2m[i];
-      d->p2mi[i] -= agr1*d->l2m[i];
-
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
 
       d->p1pr[i] += aci2*d->l1p[i];
       d->p1pi[i] -= acr2*d->l1p[i];
       d->p1mr[i] -= agi2*d->l1p[i];
       d->p1mi[i] += agr2*d->l1p[i];
+      }
+    l+=2;
+    }
+  l=lsave;
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p2pr[i] -= aci1*d->l2m[i];
+      d->p2pi[i] += acr1*d->l2m[i];
+      d->p2mr[i] += agi1*d->l2m[i];
+      d->p2mi[i] -= agr1*d->l2m[i];
+
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
 
       d->p2pr[i] += agr2*d->l1m[i];
       d->p2pi[i] += agi2*d->l1m[i];
       d->p2mr[i] += acr2*d->l1m[i];
       d->p2mi[i] += aci2*d->l1m[i];
-
-//      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-//      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }
@@ -571,7 +571,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
     full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
                  vallTrue(vge(d->scm[i],vload(sharp_minscale)));
     }
-
+full_ieee=0;
   while((!full_ieee) && (l<=lmax))
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -607,6 +607,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       d->p2pi[i] += acr1*l2m;
       d->p2mr[i] += agi1*l2m;
       d->p2mi[i] -= agr1*l2m;
+
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
@@ -615,6 +616,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      full_ieee=0;
       }
     l+=2;
     }
@@ -653,20 +655,40 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw = d->l2p[i] + d->l2m[i];
-      Tv lx = d->l2m[i] - d->l2p[i];
-      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;;
-      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
-      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
-      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
+      agr1 += d->p1pr[i]*d->l2p[i];
+      agr1 += d->p1pr[i]*d->l2m[i];
+      agr1 -= d->p2mi[i]*d->l2m[i];
+      agr1 += d->p2mi[i]*d->l2p[i];
+      agi1 += d->p1pi[i]*d->l2p[i];
+      agi1 += d->p1pi[i]*d->l2m[i];
+      agi1 += d->p2mr[i]*d->l2m[i];
+      agi1 -= d->p2mr[i]*d->l2p[i];
+      acr1 += d->p1mr[i]*d->l2p[i];
+      acr1 += d->p1mr[i]*d->l2m[i];
+      acr1 += d->p2pi[i]*d->l2m[i];
+      acr1 -= d->p2pi[i]*d->l2p[i];
+      aci1 += d->p1mi[i]*d->l2p[i];
+      aci1 += d->p1mi[i]*d->l2m[i];
+      aci1 -= d->p2pr[i]*d->l2m[i];
+      aci1 += d->p2pr[i]*d->l2p[i];
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      lw = d->l1p[i] + d->l1m[i];
-      lx = d->l1m[i] - d->l1p[i];
-      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
-      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
-      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
-      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
+      agr2 += d->p2pr[i]*d->l1p[i];
+      agr2 += d->p2pr[i]*d->l1m[i];
+      agr2 -= d->p1mi[i]*d->l1m[i];
+      agr2 += d->p1mi[i]*d->l1p[i];
+      agi2 += d->p2pi[i]*d->l1p[i];
+      agi2 += d->p2pi[i]*d->l1m[i];
+      agi2 += d->p1mr[i]*d->l1m[i];
+      agi2 -= d->p1mr[i]*d->l1p[i];
+      acr2 += d->p2mr[i]*d->l1p[i];
+      acr2 += d->p2mr[i]*d->l1m[i];
+      acr2 += d->p1pi[i]*d->l1m[i];
+      acr2 -= d->p1pi[i]*d->l1p[i];
+      aci2 += d->p2mi[i]*d->l1p[i];
+      aci2 += d->p2mi[i]*d->l1m[i];
+      aci2 -= d->p1pr[i]*d->l1m[i];
+      aci2 += d->p1pr[i]*d->l1p[i];
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@@ -695,6 +717,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
                  vallTrue(vge(d->scm[i],vload(sharp_minscale)));
     }
 
+full_ieee=0;
   while((!full_ieee) && (l<=lmax))
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -706,26 +729,62 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
-      Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
-      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
-      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
-      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
-      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
-      lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
-      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
-      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
-      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
-      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
+      Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i];
+      Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i];
+      agr1 += d->p1pr[i]*l2p;
+      agr1 += d->p1pr[i]*l2m;
+      agr1 -= d->p2mi[i]*l2m;
+      agr1 += d->p2mi[i]*l2p;
+      agi1 += d->p1pi[i]*l2p;
+      agi1 += d->p1pi[i]*l2m;
+      agi1 += d->p2mr[i]*l2m;
+      agi1 -= d->p2mr[i]*l2p;
+      acr1 += d->p1mr[i]*l2p;
+      acr1 += d->p1mr[i]*l2m;
+      acr1 += d->p2pi[i]*l2m;
+      acr1 -= d->p2pi[i]*l2p;
+      aci1 += d->p1mi[i]*l2p;
+      aci1 += d->p1mi[i]*l2m;
+      aci1 -= d->p2pr[i]*l2m;
+      aci1 += d->p2pr[i]*l2p;
+      agr2 += d->p2pr[i]*l1p;
+      agr2 += d->p2pr[i]*l1m;
+      agr2 -= d->p1mi[i]*l1m;
+      agr2 += d->p1mi[i]*l1p;
+      agi2 += d->p2pi[i]*l1p;
+      agi2 += d->p2pi[i]*l1m;
+      agi2 += d->p1mr[i]*l1m;
+      agi2 -= d->p1mr[i]*l1p;
+      acr2 += d->p2mr[i]*l1p;
+      acr2 += d->p2mr[i]*l1m;
+      acr2 += d->p1pi[i]*l1m;
+      acr2 -= d->p1pi[i]*l1p;
+      aci2 += d->p2mi[i]*l1p;
+      aci2 += d->p2mi[i]*l1m;
+      aci2 -= d->p1pr[i]*l1m;
+      aci2 += d->p1pr[i]*l1p;
+
+//       Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
+//       Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
+//       agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
+//       agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
+//       acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
+//       aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
+       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+//       lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
+//       lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
+//       agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
+//       agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
+//       acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
+//       aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      full_ieee=0;
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);

From 8fd091bf88fb7e526863ff62b6e945e0a76d5c95 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 16:47:10 +0100
Subject: [PATCH 71/85] crazy hacks; still broken

---
 libsharp/sharp_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 58efdc0..7d6709c 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -588,6 +588,7 @@ full_ieee=0;
 
       Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i];
       Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
+
       d->p1pr[i] += agr1*l2p;
       d->p1pi[i] += agi1*l2p;
       d->p1mr[i] += acr1*l2p;

From d2cd9a08550bfa6f67d0417be556b05bb790e14f Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 18:38:46 +0100
Subject: [PATCH 72/85] tweaks

---
 libsharp/sharp_core.c | 110 ++++++++++++++++--------------------------
 1 file changed, 42 insertions(+), 68 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7d6709c..f0d6354 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -508,18 +508,17 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nv2; ++i)
       {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->p1pr[i] += agr1*d->l2p[i];
       d->p1pi[i] += agi1*d->l2p[i];
       d->p1mr[i] += acr1*d->l2p[i];
       d->p1mi[i] += aci1*d->l2p[i];
 
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-
       d->p1pr[i] += aci2*d->l1p[i];
       d->p1pi[i] -= acr2*d->l1p[i];
       d->p1mr[i] -= agi2*d->l1p[i];
       d->p1mi[i] += agr2*d->l1p[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       }
     l+=2;
     }
@@ -534,18 +533,17 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
        acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
     for (int i=0; i<nv2; ++i)
       {
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       d->p2pr[i] -= aci1*d->l2m[i];
       d->p2pi[i] += acr1*d->l2m[i];
       d->p2mr[i] += agi1*d->l2m[i];
       d->p2mi[i] -= agr1*d->l2m[i];
 
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-
       d->p2pr[i] += agr2*d->l1m[i];
       d->p2pi[i] += agi2*d->l1m[i];
       d->p2mr[i] += acr2*d->l1m[i];
       d->p2mi[i] += aci2*d->l1m[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     l+=2;
     }
@@ -571,7 +569,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
     full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
                  vallTrue(vge(d->scm[i],vload(sharp_minscale)));
     }
-full_ieee=0;
+
   while((!full_ieee) && (l<=lmax))
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -617,11 +615,10 @@ full_ieee=0;
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-      full_ieee=0;
       }
     l+=2;
     }
-  if (l>lmax) return;
+//  if (l>lmax) return;
 
   for (int i=0; i<nv2; ++i)
     {
@@ -646,6 +643,7 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
+  int lsave=l;
   while (l<=lmax)
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -655,41 +653,39 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      agr1 += d->p1pr[i]*d->l2p[i];
-      agr1 += d->p1pr[i]*d->l2m[i];
-      agr1 -= d->p2mi[i]*d->l2m[i];
       agr1 += d->p2mi[i]*d->l2p[i];
-      agi1 += d->p1pi[i]*d->l2p[i];
-      agi1 += d->p1pi[i]*d->l2m[i];
-      agi1 += d->p2mr[i]*d->l2m[i];
       agi1 -= d->p2mr[i]*d->l2p[i];
-      acr1 += d->p1mr[i]*d->l2p[i];
-      acr1 += d->p1mr[i]*d->l2m[i];
-      acr1 += d->p2pi[i]*d->l2m[i];
       acr1 -= d->p2pi[i]*d->l2p[i];
-      aci1 += d->p1mi[i]*d->l2p[i];
-      aci1 += d->p1mi[i]*d->l2m[i];
-      aci1 -= d->p2pr[i]*d->l2m[i];
       aci1 += d->p2pr[i]*d->l2p[i];
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       agr2 += d->p2pr[i]*d->l1p[i];
-      agr2 += d->p2pr[i]*d->l1m[i];
-      agr2 -= d->p1mi[i]*d->l1m[i];
-      agr2 += d->p1mi[i]*d->l1p[i];
       agi2 += d->p2pi[i]*d->l1p[i];
-      agi2 += d->p2pi[i]*d->l1m[i];
-      agi2 += d->p1mr[i]*d->l1m[i];
-      agi2 -= d->p1mr[i]*d->l1p[i];
       acr2 += d->p2mr[i]*d->l1p[i];
-      acr2 += d->p2mr[i]*d->l1m[i];
-      acr2 += d->p1pi[i]*d->l1m[i];
-      acr2 -= d->p1pi[i]*d->l1p[i];
       aci2 += d->p2mi[i]*d->l1p[i];
-      aci2 += d->p2mi[i]*d->l1m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  l=lsave;
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      agr1 += d->p1pr[i]*d->l2m[i];
+      agi1 += d->p1pi[i]*d->l2m[i];
+      acr1 += d->p1mr[i]*d->l2m[i];
+      aci1 += d->p1mi[i]*d->l2m[i];
+      agr2 -= d->p1mi[i]*d->l1m[i];
+      agi2 += d->p1mr[i]*d->l1m[i];
+      acr2 += d->p1pi[i]*d->l1m[i];
       aci2 -= d->p1pr[i]*d->l1m[i];
-      aci2 += d->p1pr[i]*d->l1p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@@ -705,7 +701,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   iter_to_ieee_spin(gen, d, &l, nv2);
   job->opcnt += (l-gen->mhi) * 7*nth;
   if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 25*nth;
+  job->opcnt += (lmax+1-l) * 23*nth;
 
   const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
   dcmplx * restrict alm=job->almtmp;
@@ -717,8 +713,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
     full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
                  vallTrue(vge(d->scm[i],vload(sharp_minscale)));
     }
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv tmp;
+    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
+    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
+    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
+    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
+    }
 
-full_ieee=0;
   while((!full_ieee) && (l<=lmax))
     {
     Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
@@ -732,60 +735,31 @@ full_ieee=0;
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i];
       Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i];
-      agr1 += d->p1pr[i]*l2p;
       agr1 += d->p1pr[i]*l2m;
-      agr1 -= d->p2mi[i]*l2m;
       agr1 += d->p2mi[i]*l2p;
-      agi1 += d->p1pi[i]*l2p;
       agi1 += d->p1pi[i]*l2m;
-      agi1 += d->p2mr[i]*l2m;
       agi1 -= d->p2mr[i]*l2p;
-      acr1 += d->p1mr[i]*l2p;
       acr1 += d->p1mr[i]*l2m;
-      acr1 += d->p2pi[i]*l2m;
       acr1 -= d->p2pi[i]*l2p;
-      aci1 += d->p1mi[i]*l2p;
       aci1 += d->p1mi[i]*l2m;
-      aci1 -= d->p2pr[i]*l2m;
       aci1 += d->p2pr[i]*l2p;
       agr2 += d->p2pr[i]*l1p;
-      agr2 += d->p2pr[i]*l1m;
       agr2 -= d->p1mi[i]*l1m;
-      agr2 += d->p1mi[i]*l1p;
       agi2 += d->p2pi[i]*l1p;
-      agi2 += d->p2pi[i]*l1m;
       agi2 += d->p1mr[i]*l1m;
-      agi2 -= d->p1mr[i]*l1p;
       acr2 += d->p2mr[i]*l1p;
-      acr2 += d->p2mr[i]*l1m;
       acr2 += d->p1pi[i]*l1m;
-      acr2 -= d->p1pi[i]*l1p;
       aci2 += d->p2mi[i]*l1p;
-      aci2 += d->p2mi[i]*l1m;
       aci2 -= d->p1pr[i]*l1m;
-      aci2 += d->p1pr[i]*l1p;
 
-//       Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
-//       Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
-//       agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
-//       agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
-//       acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
-//       aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
-       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-//       lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
-//       lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
-//       agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
-//       agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
-//       acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
-//       aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
       if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
       if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
       full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-      full_ieee=0;
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);

From b0b0875def31c20009f5b278be9232d4a693376c Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 18:50:52 +0100
Subject: [PATCH 73/85] shortening

---
 libsharp/sharp_core.c | 50 ++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f0d6354..3cd2b35 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -587,25 +587,15 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
       Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i];
       Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
 
-      d->p1pr[i] += agr1*l2p;
-      d->p1pi[i] += agi1*l2p;
-      d->p1mr[i] += acr1*l2p;
-      d->p1mi[i] += aci1*l2p;
+      d->p1pr[i] += agr1*l2p + aci2*l1p;
+      d->p1pi[i] += agi1*l2p - acr2*l1p;
+      d->p1mr[i] += acr1*l2p - agi2*l1p;
+      d->p1mi[i] += aci1*l2p + agr2*l1p;
 
-      d->p1pr[i] += aci2*l1p;
-      d->p1pi[i] -= acr2*l1p;
-      d->p1mr[i] -= agi2*l1p;
-      d->p1mi[i] += agr2*l1p;
-
-      d->p2pr[i] += agr2*l1m;
-      d->p2pi[i] += agi2*l1m;
-      d->p2mr[i] += acr2*l1m;
-      d->p2mi[i] += aci2*l1m;
-
-      d->p2pr[i] -= aci1*l2m;
-      d->p2pi[i] += acr1*l2m;
-      d->p2mr[i] += agi1*l2m;
-      d->p2mi[i] -= agr1*l2m;
+      d->p2pr[i] += agr2*l1m - aci1*l2m;
+      d->p2pi[i] += agi2*l1m + acr1*l2m;
+      d->p2mr[i] += acr2*l1m + agi1*l2m;
+      d->p2mi[i] += aci2*l1m - agr1*l2m;
 
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
@@ -735,22 +725,14 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i];
       Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i];
-      agr1 += d->p1pr[i]*l2m;
-      agr1 += d->p2mi[i]*l2p;
-      agi1 += d->p1pi[i]*l2m;
-      agi1 -= d->p2mr[i]*l2p;
-      acr1 += d->p1mr[i]*l2m;
-      acr1 -= d->p2pi[i]*l2p;
-      aci1 += d->p1mi[i]*l2m;
-      aci1 += d->p2pr[i]*l2p;
-      agr2 += d->p2pr[i]*l1p;
-      agr2 -= d->p1mi[i]*l1m;
-      agi2 += d->p2pi[i]*l1p;
-      agi2 += d->p1mr[i]*l1m;
-      acr2 += d->p2mr[i]*l1p;
-      acr2 += d->p1pi[i]*l1m;
-      aci2 += d->p2mi[i]*l1p;
-      aci2 -= d->p1pr[i]*l1m;
+      agr1 += d->p1pr[i]*l2m + d->p2mi[i]*l2p;
+      agi1 += d->p1pi[i]*l2m - d->p2mr[i]*l2p;
+      acr1 += d->p1mr[i]*l2m - d->p2pi[i]*l2p;
+      aci1 += d->p1mi[i]*l2m + d->p2pr[i]*l2p;
+      agr2 += d->p2pr[i]*l1p - d->p1mi[i]*l1m;
+      agi2 += d->p2pi[i]*l1p + d->p1mr[i]*l1m;
+      acr2 += d->p2mr[i]*l1p + d->p1pi[i]*l1m;
+      aci2 += d->p2mi[i]*l1p - d->p1pr[i]*l1m;
 
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];

From 253b2534676e3407d89e110fa1800a60e3260eba Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 09:27:04 +0100
Subject: [PATCH 74/85] simplify Ylm data structures

---
 libsharp/sharp_core.c     | 80 +++++++++++++++++++--------------------
 libsharp/sharp_ylmgen_c.c | 22 +++++------
 libsharp/sharp_ylmgen_c.h |  5 +--
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 3cd2b35..146be76 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -190,8 +190,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
     {
     if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv a1=vload(gen->ab[il  ].f[0]), b1=vload(gen->ab[il  ].f[1]);
-    Tv a2=vload(gen->ab[il+1].f[0]), b2=vload(gen->ab[il+1].f[1]);
+    Tv a1=vload(gen->coef[il  ][0]), b1=vload(gen->coef[il  ][1]);
+    Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]);
     for (int i=0; i<nv2; ++i)
       {
       d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
@@ -205,7 +205,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
   }
 
 NOINLINE static void alm2map_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
   int l, int il, int lmax, int nv2)
   {
   if (nv2==nv0)
@@ -216,8 +216,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
       Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
       Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-      Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
       for (int i=0; i<nv0; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -241,8 +241,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
       Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
       Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-      Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
       for (int i=0; i<nv2; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -262,7 +262,7 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
     for (int i=0; i<nv2; ++i)
       {
       d->p1r[i] += d->lam2[i]*ar1;
@@ -286,7 +286,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
 
-  const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
+  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -299,7 +299,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -323,17 +323,17 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
     d->lam1[i] *= d->corfac[i];
     d->lam2[i] *= d->corfac[i];
     }
-  alm2map_kernel(d, ab, alm, l, il, lmax, nv2);
+  alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
   }
 
 NOINLINE static void map2alm_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
+  const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
   int il, int lmax, int nv2)
   {
   for (; l<=lmax-2; il+=2, l+=4)
     {
-    Tv a1=vload(ab[il  ].f[0]), b1=vload(ab[il  ].f[1]);
-    Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
+    Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+    Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
     Tv atmp1[4] = {vzero, vzero, vzero, vzero};
     Tv atmp2[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
@@ -354,7 +354,7 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
     }
   for (; l<=lmax; ++il, l+=2)
     {
-    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
@@ -380,7 +380,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
 
-  const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
+  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -391,7 +391,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
@@ -417,13 +417,13 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
     d->lam1[i] *= d->corfac[i];
     d->lam2[i] *= d->corfac[i];
     }
-  map2alm_kernel(d, ab, alm, l, il, lmax, nv2);
+  map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
   }
 
 NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
@@ -474,8 +474,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@@ -500,8 +500,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   int lsave = l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -525,8 +525,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -559,7 +559,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 23*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -572,8 +572,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -636,8 +636,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   int lsave=l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     for (int i=0; i<nv2; ++i)
@@ -660,8 +660,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     for (int i=0; i<nv2; ++i)
@@ -693,7 +693,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 23*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -714,8 +714,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     full_ieee=1;
@@ -766,8 +766,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     for (int i=0; i<nv2; ++i)
@@ -803,7 +803,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 17*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
@@ -816,8 +816,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
-    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     full_ieee=1;
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e3c055b..ffa3e0f 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -82,14 +82,14 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
       }
     gen->eps=RALLOC(double, gen->lmax+4);
     gen->alpha=RALLOC(double, gen->lmax/2+2);
-    gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2);
+    gen->coef=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2);
     }
   else
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl2,gen->lmax+3);
+    ALLOC(gen->coef,sharp_ylmgen_dbl2,gen->lmax+3);
     for (int m=0; m<gen->lmax+3; ++m)
-      gen->fx[m].f[0]=gen->fx[m].f[1]=0.;
+      gen->coef[m][0]=gen->coef[m][1]=0.;
     ALLOC(gen->alpha,double,gen->lmax+3);
     ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
@@ -134,19 +134,17 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   {
   DEALLOC(gen->cf);
   DEALLOC(gen->powlimit);
+  DEALLOC(gen->alpha);
+  DEALLOC(gen->coef);
   if (gen->s==0)
     {
     DEALLOC(gen->mfac);
     DEALLOC(gen->root);
     DEALLOC(gen->iroot);
     DEALLOC(gen->eps);
-    DEALLOC(gen->alpha);
-    DEALLOC(gen->ab);
     }
   else
     {
-    DEALLOC(gen->fx);
-    DEALLOC(gen->alpha);
     DEALLOC(gen->prefac);
     DEALLOC(gen->fscale);
     DEALLOC(gen->flm1);
@@ -174,9 +172,9 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
                        /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
     for (int il=0, l=m; l<gen->lmax+2; ++il, l+=2)
       {
-      gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+      gen->coef[il][0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
       double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
-      gen->ab[il].f[1] = -gen->ab[il].f[0]*(t1*t1+t2*t2);
+      gen->coef[il][1] = -gen->coef[il][0]*(t1*t1+t2*t2);
       }
     }
   else
@@ -190,7 +188,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
     if (!ms_similar)
       {
       gen->alpha[gen->mhi] = 1.;
-      gen->fx[gen->mhi].f[0] = gen->fx[gen->mhi].f[1] = 0.;
+      gen->coef[gen->mhi][0] = gen->coef[gen->mhi][1] = 0.;
       for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
@@ -206,8 +204,8 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
           gen->alpha[l+1] = gen->alpha[l-1]*flp12;
         else
           gen->alpha[l+1] = 1.;
-        gen->fx[l+1].f[0] = flp10*gen->alpha[l]/gen->alpha[l+1];
-        gen->fx[l+1].f[1] = flp11*gen->fx[l+1].f[0];
+        gen->coef[l+1][0] = flp10*gen->alpha[l]/gen->alpha[l+1];
+        gen->coef[l+1][1] = flp11*gen->coef[l+1][0];
         }
       }
 
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index cc9260f..b36346a 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -41,7 +41,7 @@ static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800;
 static const double sharp_ftol=0x1p-60;
 static const double sharp_fbighalf=0x1p+400;
 
-typedef struct { double f[2]; } sharp_ylmgen_dbl2;
+typedef double sharp_ylmgen_dbl2[2];
 
 typedef struct
   {
@@ -54,16 +54,15 @@ typedef struct
   int m;
 
   double *alpha;
+  sharp_ylmgen_dbl2 *coef;
 
 /* used if s==0 */
   double *mfac, *eps;
-  sharp_ylmgen_dbl2 *ab;
 
 /* used if s!=0 */
   int sinPow, cosPow, preMinus_p, preMinus_m;
   double *prefac;
   int *fscale;
-  sharp_ylmgen_dbl2 *fx;
 
 /* internal usage only */
 /* used if s==0 */

From 5c25c300d94aebbc3b49337cf8b88c8cfdf806a7 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 09:38:10 +0100
Subject: [PATCH 75/85] make configure less smart

---
 configure.ac | 44 --------------------------------------------
 1 file changed, 44 deletions(-)

diff --git a/configure.ac b/configure.ac
index 34626bc..1264b17 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,6 @@ AC_INIT([libsharp], [1.0.0])
 AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
 AM_MAINTAINER_MODE([enable])
 
-AC_OPENMP
 
 dnl
 dnl Needed for linking on Windows.
@@ -26,51 +25,8 @@ dnl Enable silent build rules if this version of Automake supports them
 dnl
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
-AC_DEFUN([AX_CHECK_COMPILE_FLAG],
-[AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
-AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
-  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
-  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
-  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
-    [AS_VAR_SET(CACHEVAR,[yes])],
-    [AS_VAR_SET(CACHEVAR,[no])])
-  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
-AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
-  [m4_default([$2], :)],
-  [m4_default([$3], :)])
-AS_VAR_POPDEF([CACHEVAR])dnl
-])dnl AX_CHECK_COMPILE_FLAGS
-
-dnl
-dnl Introduce --enable-native-optimizations command line argument to turn on
-dnl -march=native compiler flag, disabled by default.
-dnl
-AC_ARG_ENABLE(
-    [native-optimizations],
-    [AS_HELP_STRING([--enable-native-optimizations], [Enable non-portable optimizations for your own CPU by compiling with -march=native @<:@default=no@:>@])]
-)
 
 AC_PROG_CC_C99
-AS_IF(
-    [test "x$enable_native_optimizations" = "xyes"],
-    [AX_CHECK_COMPILE_FLAG([-march=native],[CC="$CC -march=native"])],
-    dnl
-    dnl FIXME: On GCC 4.4, we hit an internal compiler error unless either
-    dnl -march=native or -fno-tree-fre is specified.
-    dnl
-    [
-        AS_IF(
-            [test "x$GCC" = "xyes" -a "x`$CC -dumpversion | cut -d. -f1,2`" = "x4.4"],
-            [AX_CHECK_COMPILE_FLAG([-fno-tree-fre], [CFLAGS="$CFLAGS -fno-tree-fre"])]
-        )
-    ]
-)
-AX_CHECK_COMPILE_FLAG([-fno-math-errno],[CFLAGS="$CFLAGS -fno-math-errno"])
-AX_CHECK_COMPILE_FLAG([-fno-trapping-math],[CFLAGS="$CFLAGS -fno-trapping-math"])
-AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"])
-AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"])
-AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"])
-CFLAGS="$CFLAGS $OPENMP_CFLAGS"
 
 # adding the lib to the files to link
 LIBS="-lm"

From 6646b05f535eba544f781de9cec6ed90825fa5a5 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 11:18:07 +0100
Subject: [PATCH 76/85] polishing

---
 COMPILE | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 COMPILE

diff --git a/COMPILE b/COMPILE
new file mode 100644
index 0000000..8823bd3
--- /dev/null
+++ b/COMPILE
@@ -0,0 +1,25 @@
+Libsharp is configured, compiled and installed using GNU autotools.
+The most complicated step for the user is selecting the appropriate compiler
+flags (and in some cases the compiler).
+
+Here are a few (hopefully helpful) examples:
+
+GCC, OpenMP, portable executable:
+CFLAGS="-std=c99 -O3 -ffast-math -flto -fopenmp" ./configure
+
+GCC, OpenMP, specific optimization for the target CPU:
+CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto -fopenmp" ./configure
+
+GCC, no OpenMP, specific optimization for the target CPU:
+CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto" ./configure
+
+Clang:
+CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto -fopenmp" ./configure
+
+MPI support:
+CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math -flto" ./configure
+
+Additional GCC flags for pedantic warnings and debugging:
+
+-Wall -Wextra -Wshadow -Wmissing-prototypes -Wfatal-errors -pedantic -g
+-fsanitize=address

From 976dfb61742357d68d942837f2b9e0002dcaacec Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 11:18:18 +0100
Subject: [PATCH 77/85] polishing

---
 libsharp/libsharp.dox | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libsharp/libsharp.dox b/libsharp/libsharp.dox
index 4c441f1..06361ee 100644
--- a/libsharp/libsharp.dox
+++ b/libsharp/libsharp.dox
@@ -70,8 +70,8 @@
   libsharp supports shared-memory parallelisation via OpenMP; this feature will
   be automatically enabled if the compiler supports it.
 
-  Libsharp will also make use of SSE2 and AVX instructions when compiled for a
-  platform known to support them.
+  Libsharp will also make use of SSE2/AVX/AVX512 instructions when compiled
+  for a platform known to support them.
 
   Support for MPI-parallel transforms is also available; in this mode,
   every MPI task must provide a unique subset of the map and a_lm coefficients.

From b4298be6ab21fbad51d8332f9bb103518e03bd35 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 14:37:27 +0100
Subject: [PATCH 78/85] compile for multiple target platforms

---
 .gitignore                    |   24 +-
 Makefile.am                   |    8 +
 libsharp/sharp_core.c         | 1254 +++------------------------------
 libsharp/sharp_core_avx.c     |   11 +
 libsharp/sharp_core_avx2.c    |   11 +
 libsharp/sharp_core_avx512f.c |   11 +
 libsharp/sharp_core_fma.c     |   11 +
 libsharp/sharp_core_fma4.c    |   11 +
 libsharp/sharp_core_inc.c     | 1175 ++++++++++++++++++++++++++++++
 libsharp/sharp_internal.h     |    1 +
 libsharp/sharp_testsuite.c    |    6 +-
 11 files changed, 1359 insertions(+), 1164 deletions(-)
 create mode 100644 libsharp/sharp_core_avx.c
 create mode 100644 libsharp/sharp_core_avx2.c
 create mode 100644 libsharp/sharp_core_avx512f.c
 create mode 100644 libsharp/sharp_core_fma.c
 create mode 100644 libsharp/sharp_core_fma4.c
 create mode 100644 libsharp/sharp_core_inc.c

diff --git a/.gitignore b/.gitignore
index 12a6531..4675273 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,14 +6,24 @@
 **~
 **.pyc
 **.pyo
+.libs
+**/.deps
+**/.dirstamp
 
 /auto
 /autom4te.cache
-/config.log
-/config.status
-/config/config.auto
+/m4
+config.log
+config.guess
+config.sub
+ltmain.sh
+compile
+missing
+/comp
 /configure
-/sharp_oracle.inc
-
-/python/libsharp/libsharp.c
-/python/libsharp/libsharp_mpi.c
+/Makefile
+/Makefile.in
+/aclocal.m4
+/ar-lib
+/depcomp
+/install-sh
diff --git a/Makefile.am b/Makefile.am
index 2649be1..a1541f8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,6 +10,11 @@ src_sharp = \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_core.c \
+  libsharp/sharp_core_avx.c \
+  libsharp/sharp_core_avx2.c \
+  libsharp/sharp_core_fma.c \
+  libsharp/sharp_core_fma4.c \
+  libsharp/sharp_core_avx512f.c \
   libsharp/sharp_geomhelpers.c \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
@@ -24,6 +29,9 @@ include_HEADERS = \
   libsharp/sharp_almhelpers.h \
   libsharp/sharp_cxx.h
 
+EXTRA_DIST = \
+  libsharp/sharp_core_inc.c
+
 libsharp_la_SOURCES = $(src_sharp)
 
 check_PROGRAMS = sharp_testsuite
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 146be76..036a8ed 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -1,1168 +1,116 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+#define ARCH default
+#include "sharp_core_inc.c"
+#undef ARCH
 
-/*! \file sharp_core.c
- *  Computational core
- *
- *  Copyright (C) 2012-2019 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <complex.h>
-#include <math.h>
-#include <string.h>
-#include "sharp_vecsupport.h"
-#include "sharp.h"
-#include "sharp_internal.h"
-#include "c_utils.h"
-
-typedef complex double dcmplx;
-
-#define nv0 (128/VLEN)
-#define nvx (64/VLEN)
-
-typedef Tv Tbv0[nv0];
-typedef double Tbs0[nv0*VLEN];
-
-typedef struct
-  {
-  Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
-  } s0data_v;
-
-typedef struct
-  {
-  Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
-  } s0data_s;
-
-typedef union
-  {
-  s0data_v v;
-  s0data_s s;
-  } s0data_u;
-
-typedef Tv Tbvx[nvx];
-typedef double Tbsx[nvx*VLEN];
-
-typedef struct
-  {
-  Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
-       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
-  } sxdata_v;
-
-typedef struct
-  {
-  Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
-       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
-  } sxdata_s;
-
-typedef union
-  {
-  sxdata_v v;
-  sxdata_s s;
-  } sxdata_u;
-
-static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
-  double maxval)
-  {
-  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
-  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
-  Tm mask = vgt(vabs(*val),vfmax);
-  while (vanyTrue(mask))
-    {
-    vmuleq_mask(mask,*val,vfsmall);
-    vaddeq_mask(mask,*scale,vone);
-    mask = vgt(vabs(*val),vfmax);
-    }
-  mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
-  while (vanyTrue(mask))
-    {
-    vmuleq_mask(mask,*val,vfbig);
-    vsubeq_mask(mask,*scale,vone);
-    mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
-    }
-  }
-
-static void mypow(Tv val, int npow, const double * restrict powlimit,
-  Tv * restrict resd, Tv * restrict ress)
-  {
-  Tv vminv=vload(powlimit[npow]);
-  Tm mask = vlt(vabs(val),vminv);
-  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
-    {
-    Tv res=vone;
-    do
-      {
-      if (npow&1)
-        res*=val;
-      val*=val;
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=vzero;
-    }
-  else
-    {
-    Tv scale=vzero, scaleint=vzero, res=vone;
-    Tvnormalize(&val,&scaleint,sharp_fbighalf);
-    do
-      {
-      if (npow&1)
-        {
-        res*=val;
-        scale+=scaleint;
-        Tvnormalize(&res,&scale,sharp_fbighalf);
-        }
-      val*=val;
-      scaleint+=scaleint;
-      Tvnormalize(&val,&scaleint,sharp_fbighalf);
-      }
-    while(npow>>=1);
-    *resd=res;
-    *ress=scale;
-    }
-  }
-
-static inline void getCorfac(Tv scale, Tv * restrict corfac,
-  const double * restrict cf)
-  {
-  typedef union
-    { Tv v; double s[VLEN]; } Tvu;
-
-  Tvu sc, corf;
-  sc.v=scale;
-  for (int i=0; i<VLEN; ++i)
-    corf.s[i] = (sc.s[i]<sharp_minscale) ?
-      0. : cf[(int)(sc.s[i])-sharp_minscale];
-  *corfac=corf.v;
-  }
-
-static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
-  {
-  Tm mask = vgt(vabs(*v2),eps);
-  if (vanyTrue(mask))
-    {
-    vmuleq_mask(mask,*v1,vload(sharp_fsmall));
-    vmuleq_mask(mask,*v2,vload(sharp_fsmall));
-    vaddeq_mask(mask,*s,vone);
-    return 1;
-    }
-  return 0;
-  }
-
-NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
-  s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
-  {
-  int l=gen->m, il=0;
-  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tv limscale=vload(sharp_limscale);
-  int below_limit = 1;
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i]=vzero;
-    mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
-    d->lam2[i] *= mfac;
-    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(d->scale[i],limscale));
-    }
-
-  while (below_limit)
-    {
-    if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
-    below_limit=1;
-    Tv a1=vload(gen->coef[il  ][0]), b1=vload(gen->coef[il  ][1]);
-    Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
-      }
-    l+=4; il+=2;
-    }
-  *l_=l; *il_=il;
-  }
-
-NOINLINE static void alm2map_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
-  int l, int il, int lmax, int nv2)
-  {
-  if (nv2==nv0)
-    {
-    for (; l<=lmax-2; il+=2, l+=4)
-      {
-      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
-      for (int i=0; i<nv0; ++i)
-        {
-        d->p1r[i] += d->lam2[i]*ar1;
-        d->p1i[i] += d->lam2[i]*ai1;
-        d->p2r[i] += d->lam2[i]*ar2;
-        d->p2i[i] += d->lam2[i]*ai2;
-        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-        d->p1r[i] += d->lam1[i]*ar3;
-        d->p1i[i] += d->lam1[i]*ai3;
-        d->p2r[i] += d->lam1[i]*ar4;
-        d->p2i[i] += d->lam1[i]*ai4;
-        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-        }
-      }
-    }
-  else
-    {
-    for (; l<=lmax-2; il+=2, l+=4)
-      {
-      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
-      for (int i=0; i<nv2; ++i)
-        {
-        d->p1r[i] += d->lam2[i]*ar1;
-        d->p1i[i] += d->lam2[i]*ai1;
-        d->p2r[i] += d->lam2[i]*ar2;
-        d->p2i[i] += d->lam2[i]*ai2;
-        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-        d->p1r[i] += d->lam1[i]*ar3;
-        d->p1i[i] += d->lam1[i]*ai3;
-        d->p2r[i] += d->lam1[i]*ar4;
-        d->p2i[i] += d->lam1[i]*ai4;
-        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-        }
-      }
-    }
-  for (; l<=lmax; ++il, l+=2)
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->p1r[i] += d->lam2[i]*ar1;
-      d->p1i[i] += d->lam2[i]*ai1;
-      d->p2r[i] += d->lam2[i]*ar2;
-      d->p2i[i] += d->lam2[i]*ai2;
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      }
-    }
-  }
-
-NOINLINE static void calc_alm2map (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
-  {
-  int l,il,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, &il, nv2);
-  job->opcnt += il * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 6*nth;
-
-  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
-      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
-      d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2;
-      d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2;
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-      }
-    l+=2; ++il;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
-  }
-
-NOINLINE static void map2alm_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
-  int il, int lmax, int nv2)
-  {
-  for (; l<=lmax-2; il+=2, l+=4)
-    {
-    Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-    Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
-    Tv atmp1[4] = {vzero, vzero, vzero, vzero};
-    Tv atmp2[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      atmp1[0] += d->lam2[i]*d->p1r[i];
-      atmp1[1] += d->lam2[i]*d->p1i[i];
-      atmp1[2] += d->lam2[i]*d->p2r[i];
-      atmp1[3] += d->lam2[i]*d->p2i[i];
-      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
-      atmp2[0] += d->lam1[i]*d->p1r[i];
-      atmp2[1] += d->lam1[i]*d->p1i[i];
-      atmp2[2] += d->lam1[i]*d->p2r[i];
-      atmp2[3] += d->lam1[i]*d->p2i[i];
-      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-      }
-    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]);
-    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
-    }
-  for (; l<=lmax; ++il, l+=2)
-    {
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      atmp[0] += d->lam2[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->p1i[i];
-      atmp[2] += d->lam2[i]*d->p2r[i];
-      atmp[3] += d->lam2[i]*d->p2i[i];
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    }
-  }
-
-NOINLINE static void calc_map2alm (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
-  {
-  int l,il,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(gen, d, &l, &il, nv2);
-  job->opcnt += il * 4*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 6*nth;
-
-  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
-      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
-      atmp[2] += d->lam2[i]*d->corfac[i]*d->p2r[i];
-      atmp[3] += d->lam2[i]*d->corfac[i]*d->p2i[i];
-      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
-      d->lam1[i] = d->lam2[i];
-      d->lam2[i] = tmp;
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2; ++il;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->lam1[i] *= d->corfac[i];
-    d->lam2[i] *= d->corfac[i];
-    }
-  map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
-  }
-
-NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
-  sxdata_v * restrict d, int * restrict l_, int nv2)
-  {
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  Tv prefac=vload(gen->prefac[gen->m]),
-     prescale=vload(gen->fscale[gen->m]);
-  Tv limscale=vload(sharp_limscale);
-  int below_limit=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
-    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
-    Tm mask=vlt(d->sth[i],vzero);
-    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
-    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
-
-    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
-    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
-    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
-    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
-
-    d->l1p[i] = vzero;
-    d->l1m[i] = vzero;
-    d->l2p[i] = prefac*ccp;
-    d->scp[i] = prescale+ccps;
-    d->l2m[i] = prefac*csp;
-    d->scm[i] = prescale+csps;
-    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
-    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
-    d->l2p[i] *= ssp;
-    d->scp[i] += ssps;
-    d->l2m[i] *= scp;
-    d->scm[i] += scps;
-    if (gen->preMinus_p)
-      d->l2p[i] = vneg(d->l2p[i]);
-    if (gen->preMinus_m)
-      d->l2m[i] = vneg(d->l2m[i]);
-    if (gen->s&1)
-      d->l2p[i] = vneg(d->l2p[i]);
-
-    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
-    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
-
-    below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
-                   vallTrue(vlt(d->scp[i],limscale));
-    }
-
-  int l=gen->mhi;
-
-  while (below_limit)
-    {
-    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    below_limit=1;
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
-          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
-                       vallTrue(vlt(d->scm[i],limscale));
-      }
-    l+=2;
-    }
-
-  *l_=l;
-  }
-
-NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  int lsave = l;
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->p1pr[i] += agr1*d->l2p[i];
-      d->p1pi[i] += agi1*d->l2p[i];
-      d->p1mr[i] += acr1*d->l2p[i];
-      d->p1mi[i] += aci1*d->l2p[i];
-
-      d->p1pr[i] += aci2*d->l1p[i];
-      d->p1pi[i] -= acr2*d->l1p[i];
-      d->p1mr[i] -= agi2*d->l1p[i];
-      d->p1mi[i] += agr2*d->l1p[i];
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      }
-    l+=2;
-    }
-  l=lsave;
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      d->p2pr[i] -= aci1*d->l2m[i];
-      d->p2pi[i] += acr1*d->l2m[i];
-      d->p2mr[i] += agi1*d->l2m[i];
-      d->p2mi[i] -= agr1*d->l2m[i];
-
-      d->p2pr[i] += agr2*d->l1m[i];
-      d->p2pi[i] += agi2*d->l1m[i];
-      d->p2mr[i] += acr2*d->l1m[i];
-      d->p2mi[i] += aci2*d->l1m[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
-  {
-  int l,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 7*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 23*nth;
-
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
-                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-
-      Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i];
-      Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
-
-      d->p1pr[i] += agr1*l2p + aci2*l1p;
-      d->p1pi[i] += agi1*l2p - acr2*l1p;
-      d->p1mr[i] += acr1*l2p - agi2*l1p;
-      d->p1mi[i] += aci1*l2p + agr2*l1p;
-
-      d->p2pr[i] += agr2*l1m - aci1*l2m;
-      d->p2pi[i] += agi2*l1m + acr1*l2m;
-      d->p2mr[i] += acr2*l1m + agi1*l2m;
-      d->p2mi[i] += aci2*l1m - agr1*l2m;
-
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
-        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
-        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-      }
-    l+=2;
-    }
-//  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->l1p[i] *= d->cfp[i];
-    d->l2p[i] *= d->cfp[i];
-    d->l1m[i] *= d->cfm[i];
-    d->l2m[i] *= d->cfm[i];
-    }
-  alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
-
-  for (int i=0; i<nv2; ++i)
-    {
-    Tv tmp;
-    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
-    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
-    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
-    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
-    }
-  }
-
-NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  int lsave=l;
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      agr1 += d->p2mi[i]*d->l2p[i];
-      agi1 -= d->p2mr[i]*d->l2p[i];
-      acr1 -= d->p2pi[i]*d->l2p[i];
-      aci1 += d->p2pr[i]*d->l2p[i];
-      agr2 += d->p2pr[i]*d->l1p[i];
-      agi2 += d->p2pi[i]*d->l1p[i];
-      acr2 += d->p2mr[i]*d->l1p[i];
-      aci2 += d->p2mi[i]*d->l1p[i];
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      }
-    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
-    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
-    l+=2;
-    }
-  l=lsave;
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      agr1 += d->p1pr[i]*d->l2m[i];
-      agi1 += d->p1pi[i]*d->l2m[i];
-      acr1 += d->p1mr[i]*d->l2m[i];
-      aci1 += d->p1mi[i]*d->l2m[i];
-      agr2 -= d->p1mi[i]*d->l1m[i];
-      agi2 += d->p1mr[i]*d->l1m[i];
-      acr2 += d->p1pi[i]*d->l1m[i];
-      aci2 -= d->p1pr[i]*d->l1m[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      }
-    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
-    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
-  {
-  int l,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 7*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 23*nth;
-
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
-                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-    }
-  for (int i=0; i<nv2; ++i)
-    {
-    Tv tmp;
-    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
-    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
-    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
-    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i];
-      Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i];
-      agr1 += d->p1pr[i]*l2m + d->p2mi[i]*l2p;
-      agi1 += d->p1pi[i]*l2m - d->p2mr[i]*l2p;
-      acr1 += d->p1mr[i]*l2m - d->p2pi[i]*l2p;
-      aci1 += d->p1mi[i]*l2m + d->p2pr[i]*l2p;
-      agr2 += d->p2pr[i]*l1p - d->p1mi[i]*l1m;
-      agi2 += d->p2pi[i]*l1p + d->p1mr[i]*l1m;
-      acr2 += d->p2mr[i]*l1p + d->p1pi[i]*l1m;
-      aci2 += d->p2mi[i]*l1p - d->p1pr[i]*l1m;
-
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
-        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
-        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-      }
-    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
-    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
-    l+=2;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->l1p[i] *= d->cfp[i];
-    d->l2p[i] *= d->cfp[i];
-    d->l1m[i] *= d->cfm[i];
-    d->l2m[i] *= d->cfm[i];
-    }
-  map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
-  }
-
-
-NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
-  int l, int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
-       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw=d->l2p[i]+d->l2m[i];
-      d->p1pr[i] += ar1*lw;
-      d->p1pi[i] += ai1*lw;
-      Tv lx=d->l2m[i]-d->l2p[i];
-      d->p2mr[i] += ai1*lx;
-      d->p2mi[i] -= ar1*lx;
-      lw=d->l1p[i]+d->l1m[i];
-      d->p2pr[i] += ar2*lw;
-      d->p2pi[i] += ai2*lw;
-      lx=d->l1m[i]-d->l1p[i];
-      d->p1mr[i] += ai2*lx;
-      d->p1mi[i] -= ar2*lx;
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      }
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
-  {
-  int l,lmax=gen->lmax;
-  int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee_spin(gen, d, &l, nv2);
-  job->opcnt += (l-gen->mhi) * 7*nth;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * 17*nth;
-
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee=1;
-  for (int i=0; i<nv2; ++i)
-    {
-    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
-                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-    }
-
-  while((!full_ieee) && (l<=lmax))
-    {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
-       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    full_ieee=1;
-    for (int i=0; i<nv2; ++i)
-      {
-      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
-      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
-      Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i];
-      d->p1pr[i] += ar1*lw;
-      d->p1pi[i] += ai1*lw;
-      Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i];
-      d->p2mr[i] += ai1*lx;
-      d->p2mi[i] -= ar1*lx;
-      lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i];
-      d->p2pr[i] += ar2*lw;
-      d->p2pi[i] += ai2*lw;
-      lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
-      d->p1mr[i] += ai2*lx;
-      d->p1mi[i] -= ar2*lx;
-      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
-      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-        }
-      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
-        {
-        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
-        }
-      }
-    l+=2;
-    }
-  if (l>lmax) return;
-
-  for (int i=0; i<nv2; ++i)
-    {
-    d->l1p[i] *= d->cfp[i];
-    d->l2p[i] *= d->cfp[i];
-    d->l1m[i] *= d->cfm[i];
-    d->l2m[i] *= d->cfm[i];
-    }
-  alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);
-  }
-
-
-#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-
-NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
+typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim);
+typedef int (*t_veclen) (void); /* signature shared by the sharp_veclen variants */
+typedef int (*t_max_nvec) (int spin); /* signature shared by the sharp_max_nvec variants */
+typedef const char *(*t_architecture) (void); /* signature shared by the sharp_architecture variants */
+/* Dispatch slots: NULL until assign_funcs() stores the selected implementation. */
+static t_inner_loop inner_loop_ = NULL;
+static t_veclen veclen_ = NULL;
+static t_max_nvec max_nvec_ = NULL;
+static t_architecture architecture_ = NULL;
+
+#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* DECL(arch): defines a probe that caches __builtin_cpu_supports(#arch) and declares the arch-suffixed entry points. */
+#define DECL(arch) \
+static int XCONCATX2(have,arch)(void) \
+  { \
+  static int res=-1; \
+  if (res<0) \
+    { \
+    __builtin_cpu_init(); \
+    res = __builtin_cpu_supports(#arch); \
+    } \
+  return res; \
+  } \
+\
+void XCONCATX2(inner_loop,arch) (sharp_job *job, const int *ispair, \
+  const double *cth_, const double *sth_, int llim, int ulim, \
+  sharp_Ylmgen_C *gen, int mi, const int *mlim); \
+int XCONCATX2(sharp_veclen,arch) (void); \
+int XCONCATX2(sharp_max_nvec,arch) (int spin); \
+const char *XCONCATX2(sharp_architecture,arch) (void);
+/* Instantiated only for instruction sets that are not already enabled at compile time. */
+#if (!defined(__AVX512F__))
+DECL(avx512f)
+#endif
+#if (!defined(__FMA4__))
+DECL(fma4)
+#endif
+#if (!defined(__FMA__))
+DECL(fma)
+#endif
+#if (!defined(__AVX2__))
+DECL(avx2)
+#endif
+#if (!defined(__AVX__))
+DECL(avx)
+#endif
+
+#endif
+
+static void assign_funcs(void)
   {
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_ALM2MAP:
-    case SHARP_ALM2MAP_DERIV1:
-      {
-      if (job->spin==0)
-        {
-        //adjust the a_lm for the new algorithm
-        dcmplx * restrict alm=job->almtmp;
-        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
-          {
-          dcmplx al = alm[l];
-          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-          dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
-          alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
-          alm[l+1] = gen->alpha[il]*al1;
-          }
-
-        const int nval=nv0*VLEN;
-        int ith=0;
-        int itgt[nval];
-        while (ith<ulim-llim)
-          {
-          s0data_u d;
-          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
-          int nth=0;
-          while ((nth<nval)&&(ith<ulim-llim))
-            {
-            if (mlim[ith]>=m)
-              {
-              itgt[nth] = ith;
-              d.s.csq[nth]=cth_[ith]*cth_[ith];
-              d.s.sth[nth]=sth_[ith];
-              ++nth;
-              }
-            else
-              {
-              int phas_idx = ith*job->s_th + mi*job->s_m;
-              job->phase[phas_idx] = job->phase[phas_idx+1] = 0;
-              }
-            ++ith;
-            }
-          if (nth>0)
-            {
-            int i2=((nth+VLEN-1)/VLEN)*VLEN;
-            for (int i=nth; i<i2; ++i)
-              {
-              d.s.csq[i]=d.s.csq[nth-1];
-              d.s.sth[i]=d.s.sth[nth-1];
-              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
-              }
-            calc_alm2map (job, gen, &d.v, nth);
-            for (int i=0; i<nth; ++i)
-              {
-              int tgt=itgt[i];
-              //adjust for new algorithm
-              d.s.p2r[i]*=cth_[tgt];
-              d.s.p2i[i]*=cth_[tgt];
-              int phas_idx = tgt*job->s_th + mi*job->s_m;
-              complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
-                             r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
-              job->phase[phas_idx] = r1+r2;
-              if (ispair[tgt])
-                job->phase[phas_idx+1] = r1-r2;
-              }
-            }
-          }
-        }
-      else
-        {
-        //adjust the a_lm for the new algorithm
-        if (job->nalm==2)
-          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
-            {
-            job->almtmp[2*l  ]*=gen->alpha[l];
-            job->almtmp[2*l+1]*=gen->alpha[l];
-            }
-        else
-          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
-            job->almtmp[l]*=gen->alpha[l];
-
-        const int nval=nvx*VLEN;
-        int ith=0;
-        int itgt[nval];
-        while (ith<ulim-llim)
-          {
-          sxdata_u d;
-          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
-          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
-          int nth=0;
-          while ((nth<nval)&&(ith<ulim-llim))
-            {
-            if (mlim[ith]>=m)
-              {
-              itgt[nth] = ith;
-              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
-              ++nth;
-              }
-            else
-              {
-              int phas_idx = ith*job->s_th + mi*job->s_m;
-              job->phase[phas_idx  ] = job->phase[phas_idx+1] = 0;
-              job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0;
-              }
-            ++ith;
-            }
-          if (nth>0)
-            {
-            int i2=((nth+VLEN-1)/VLEN)*VLEN;
-            for (int i=nth; i<i2; ++i)
-              {
-              d.s.cth[i]=d.s.cth[nth-1];
-              d.s.sth[i]=d.s.sth[nth-1];
-              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
-              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
-              }
-            (job->type==SHARP_ALM2MAP) ?
-              calc_alm2map_spin  (job, gen, &d.v, nth) :
-              calc_alm2map_deriv1(job, gen, &d.v, nth);
-            for (int i=0; i<nth; ++i)
-              {
-              int tgt=itgt[i];
-              int phas_idx = tgt*job->s_th + mi*job->s_m;
-              complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
-                             q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
-                             u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
-                             u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
-              job->phase[phas_idx  ] = q1+q2;
-              job->phase[phas_idx+2] = u1+u2;
-              if (ispair[tgt])
-                {
-                dcmplx *phQ = &(job->phase[phas_idx+1]),
-                       *phU = &(job->phase[phas_idx+3]);
-                *phQ = q1-q2;
-                *phU = u1-u2;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { *phQ=-(*phQ); *phU=-(*phU); }
-                }
-              }
-            }
-          }
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
+#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* same guard as the DECL() block */
+#define DECL2(arch) \
+  if (XCONCATX2(have,arch)()) \
+    { \
+    inner_loop_ = XCONCATX2(inner_loop,arch); \
+    veclen_ = XCONCATX2(sharp_veclen,arch); \
+    max_nvec_ = XCONCATX2(sharp_max_nvec,arch); \
+    architecture_ = XCONCATX2(sharp_architecture,arch); \
+    return; \
     }
+#if (!defined(__AVX512F__))
+DECL2(avx512f) // probes run top to bottom; the first one that succeeds fills all four slots and returns
+#endif
+#if (!defined(__FMA4__))
+DECL2(fma4)
+#endif
+#if (!defined(__FMA__))
+DECL2(fma)
+#endif
+#if (!defined(__AVX2__))
+DECL2(avx2)
+#endif
+#if (!defined(__AVX__))
+DECL2(avx)
+#endif
+#endif // end of runtime probing
+  inner_loop_ = inner_loop_default; // reached only when no DECL2 branch returned: use the *_default entry points
+  veclen_ = sharp_veclen_default;
+  max_nvec_ = sharp_max_nvec_default;
+  architecture_ = sharp_architecture_default;
   }
 
-NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+
+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim)
   {
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_MAP2ALM:
-      {
-      if (job->spin==0)
-        {
-        const int nval=nv0*VLEN;
-        int ith=0;
-        while (ith<ulim-llim)
-          {
-          s0data_u d;
-          int nth=0;
-          while ((nth<nval)&&(ith<ulim-llim))
-            {
-            if (mlim[ith]>=m)
-              {
-              d.s.csq[nth]=cth_[ith]*cth_[ith]; d.s.sth[nth]=sth_[ith];
-              int phas_idx = ith*job->s_th + mi*job->s_m;
-              dcmplx ph1=job->phase[phas_idx];
-              dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
-              d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
-              d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
-              //adjust for new algorithm
-              d.s.p2r[nth]*=cth_[ith];
-              d.s.p2i[nth]*=cth_[ith];
-              ++nth;
-              }
-            ++ith;
-            }
-          if (nth>0)
-            {
-            int i2=((nth+VLEN-1)/VLEN)*VLEN;
-            for (int i=nth; i<i2; ++i)
-              {
-              d.s.csq[i]=d.s.csq[nth-1];
-              d.s.sth[i]=d.s.sth[nth-1];
-              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
-              }
-            calc_map2alm (job, gen, &d.v, nth);
-            }
-          }
-        //adjust the a_lm for the new algorithm
-        dcmplx * restrict alm=job->almtmp;
-        dcmplx alm2 = 0.;
-        double alold=0;
-        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
-          {
-          dcmplx al = alm[l];
-          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1];
-          alm[l  ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2;
-          alm[l+1] = gen->alpha[il]*al1;
-          alm2=al;
-          alold=gen->alpha[il];
-          }
-        }
-      else
-        {
-        const int nval=nvx*VLEN;
-        int ith=0;
-        while (ith<ulim-llim)
-          {
-          sxdata_u d;
-          int nth=0;
-          while ((nth<nval)&&(ith<ulim-llim))
-            {
-            if (mlim[ith]>=m)
-              {
-              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
-              int phas_idx = ith*job->s_th + mi*job->s_m;
-              dcmplx p1Q=job->phase[phas_idx],
-                     p1U=job->phase[phas_idx+2],
-                     p2Q=ispair[ith] ? job->phase[phas_idx+1]:0.,
-                     p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
-              if ((gen->mhi-gen->m+gen->s)&1)
-                { p2Q=-p2Q; p2U=-p2U; }
-              d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
-              d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
-              d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
-              d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
-              ++nth;
-              }
-            ++ith;
-            }
-          if (nth>0)
-            {
-            int i2=((nth+VLEN-1)/VLEN)*VLEN;
-            for (int i=nth; i<i2; ++i)
-              {
-              d.s.cth[i]=d.s.cth[nth-1];
-              d.s.sth[i]=d.s.sth[nth-1];
-              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
-              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
-              }
-            calc_map2alm_spin(job, gen, &d.v, nth);
-            }
-          }
-        //adjust the a_lm for the new algorithm
-        for (int l=gen->mhi; l<=gen->lmax; ++l)
-          {
-          job->almtmp[2*l  ]*=gen->alpha[l];
-          job->almtmp[2*l+1]*=gen->alpha[l];
-          }
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
+  if (!inner_loop_) assign_funcs(); // first use: select the implementation. NOTE(review): not synchronized - confirm the first call cannot happen concurrently
+  inner_loop_(job, ispair, cth, sth, llim, ulim, gen, mi, mlim); // forward all arguments unchanged
   }
-
-void inner_loop (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim)
-  {
-  (job->type==SHARP_MAP2ALM) ?
-    inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
-    inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
-  }
-
-#undef VZERO
-
 int sharp_veclen(void)
   {
-  return VLEN;
+  if (!veclen_) assign_funcs(); // first use: select the implementation
+  return veclen_(); // result comes from the selected variant
   }
-
 int sharp_max_nvec(int spin)
   {
-  return (spin==0) ? nv0 : nvx;
+  if (!max_nvec_) assign_funcs(); // first use: select the implementation
+  return max_nvec_(spin); // result comes from the selected variant
+  }
+const char *sharp_architecture(void)
+  {
+  if (!architecture_) assign_funcs(); // first use: select the implementation
+  return architecture_(); // string supplied by the selected variant
   }
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
new file mode 100644
index 0000000..724e629
--- /dev/null
+++ b/libsharp/sharp_core_avx.c
@@ -0,0 +1,11 @@
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* Compiles sharp_core_inc.c with AVX enabled; empty when AVX is already on or the compiler guard fails. */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+/* XARCH(name) expands to name_<ARCH>; the pragma sets the target ISA for the code included below. */
+#define ARCH avx
+#pragma GCC target("avx")
+#include "sharp_core_inc.c"
+
+#endif
diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c
new file mode 100644
index 0000000..a7ab0a7
--- /dev/null
+++ b/libsharp/sharp_core_avx2.c
@@ -0,0 +1,11 @@
+#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* Compiles sharp_core_inc.c with AVX2 enabled; empty when AVX2 is already on or the compiler guard fails. */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+/* XARCH(name) expands to name_<ARCH>; the pragma sets the target ISA for the code included below. */
+#define ARCH avx2
+#pragma GCC target("avx2")
+#include "sharp_core_inc.c"
+
+#endif
diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c
new file mode 100644
index 0000000..7f17429
--- /dev/null
+++ b/libsharp/sharp_core_avx512f.c
@@ -0,0 +1,11 @@
+#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* Compiles sharp_core_inc.c with AVX512F enabled; empty when AVX512F is already on or the compiler guard fails. */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+/* XARCH(name) expands to name_<ARCH>; the pragma sets the target ISA for the code included below. */
+#define ARCH avx512f
+#pragma GCC target("avx512f")
+#include "sharp_core_inc.c"
+
+#endif
diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c
new file mode 100644
index 0000000..793151f
--- /dev/null
+++ b/libsharp/sharp_core_fma.c
@@ -0,0 +1,11 @@
+#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* Compiles sharp_core_inc.c with FMA enabled; empty when FMA is already on or the compiler guard fails. */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+/* XARCH(name) expands to name_<ARCH>; the pragma sets the target ISA for the code included below. */
+#define ARCH fma
+#pragma GCC target("fma")
+#include "sharp_core_inc.c"
+
+#endif
diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c
new file mode 100644
index 0000000..d71de74
--- /dev/null
+++ b/libsharp/sharp_core_fma4.c
@@ -0,0 +1,11 @@
+#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+/* Compiles sharp_core_inc.c with FMA4 enabled; empty when FMA4 is already on or the compiler guard fails. */
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+/* XARCH(name) expands to name_<ARCH>; the pragma sets the target ISA for the code included below. */
+#define ARCH fma4
+#pragma GCC target("fma4")
+#include "sharp_core_inc.c"
+
+#endif
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
new file mode 100644
index 0000000..ddd08df
--- /dev/null
+++ b/libsharp/sharp_core_inc.c
@@ -0,0 +1,1175 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core_inc.c
+ *  Computational core
+ *
+ *  Copyright (C) 2012-2019 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <string.h>
+#include "sharp_vecsupport.h"
+#include "sharp.h"
+#include "sharp_internal.h"
+#include "c_utils.h"
+
+typedef complex double dcmplx;
+
+#define nv0 (128/VLEN)
+#define nvx (64/VLEN)
+
+typedef Tv Tbv0[nv0];
+typedef double Tbs0[nv0*VLEN];
+
+typedef struct
+  {
+  Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
+  } s0data_v;
+
+typedef struct
+  {
+  Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
+  } s0data_s;
+
+typedef union
+  {
+  s0data_v v;
+  s0data_s s;
+  } s0data_u;
+
+typedef Tv Tbvx[nvx];
+typedef double Tbsx[nvx*VLEN];
+
+typedef struct
+  {
+  Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_v;
+
+typedef struct
+  {
+  Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_s;
+
+typedef union
+  {
+  sxdata_v v;
+  sxdata_s s;
+  } sxdata_u;
+
+static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
+  double maxval) // renormalizes *val against maxval and records the applied shifts in *scale
+  {
+  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); // lower and upper magnitude bounds
+  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); // down-scaling and up-scaling factors
+  Tm mask = vgt(vabs(*val),vfmax); // entries whose magnitude exceeds maxval
+  while (vanyTrue(mask)) // repeat until no entry is too large
+    {
+    vmuleq_mask(mask,*val,vfsmall); // presumably multiplies only the masked entries - TODO confirm in sharp_vecsupport.h
+    vaddeq_mask(mask,*scale,vone); // one more unit of scale for those entries
+    mask = vgt(vabs(*val),vfmax);
+    }
+  mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); // too small but non-zero; exact zeros are excluded
+  while (vanyTrue(mask)) // repeat until no non-zero entry is too small
+    {
+    vmuleq_mask(mask,*val,vfbig); // scale the masked entries up
+    vsubeq_mask(mask,*scale,vone); // one unit of scale less for those entries
+    mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
+    }
+  }
+
+static void mypow(Tv val, int npow, const double * restrict powlimit,
+  Tv * restrict resd, Tv * restrict ress) // val**npow by square-and-multiply; value goes to *resd, its scale to *ress
+  {
+  Tv vminv=vload(powlimit[npow]); // threshold below which the quick path is not used
+  Tm mask = vlt(vabs(val),vminv);
+  if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
+    {
+    Tv res=vone;
+    do
+      {
+      if (npow&1) // current low bit set: multiply the running power into the result
+        res*=val;
+      val*=val; // square for the next bit
+      }
+    while(npow>>=1); // consume the exponent bit by bit
+    *resd=res;
+    *ress=vzero; // quick path: result carries no extra scale
+    }
+  else
+    {
+    Tv scale=vzero, scaleint=vzero, res=vone; // scale belongs to res, scaleint to the running power val
+    Tvnormalize(&val,&scaleint,sharp_fbighalf);
+    do
+      {
+      if (npow&1)
+        {
+        res*=val;
+        scale+=scaleint; // scales add when the values are multiplied
+        Tvnormalize(&res,&scale,sharp_fbighalf);
+        }
+      val*=val;
+      scaleint+=scaleint; // squaring doubles the scale
+      Tvnormalize(&val,&scaleint,sharp_fbighalf); // renormalize after every multiplication
+      }
+    while(npow>>=1);
+    *resd=res;
+    *ress=scale;
+    }
+  }
+
+static inline void getCorfac(Tv scale, Tv * restrict corfac,
+  const double * restrict cf) // looks up a correction factor in cf for every entry of scale
+  {
+  typedef union
+    { Tv v; double s[VLEN]; } Tvu;
+  // the union gives element-wise access to the VLEN doubles of a Tv
+  Tvu sc, corf;
+  sc.v=scale;
+  for (int i=0; i<VLEN; ++i)
+    corf.s[i] = (sc.s[i]<sharp_minscale) ? // scales below sharp_minscale get factor 0
+      0. : cf[(int)(sc.s[i])-sharp_minscale]; // otherwise index cf with the scale offset by sharp_minscale
+  *corfac=corf.v;
+  }
+
+static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps) // returns 1 if anything was rescaled, else 0
+  {
+  Tm mask = vgt(vabs(*v2),eps); // only *v2 is tested against eps
+  if (vanyTrue(mask))
+    {
+    vmuleq_mask(mask,*v1,vload(sharp_fsmall)); // *v1 and *v2 are scaled with the same mask
+    vmuleq_mask(mask,*v2,vload(sharp_fsmall));
+    vaddeq_mask(mask,*s,vone); // record the shift in *s
+    return 1;
+    }
+  return 0;
+  }
+
+NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
+  s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
+  { // seeds lam1/lam2/scale for m=gen->m, then iterates until some scale is no longer below sharp_limscale
+  int l=gen->m, il=0;
+  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); // sign flipped for odd m
+  Tv limscale=vload(sharp_limscale);
+  int below_limit = 1; // stays 1 while every scale is below sharp_limscale
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i]=vzero;
+    mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); // lam2 = sth**m, kept as value plus scale
+    d->lam2[i] *= mfac;
+    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(d->scale[i],limscale));
+    }
+  // each pass performs two recurrence steps: l advances by 4, il by 2
+  while (below_limit)
+    {
+    if (l+4>gen->lmax) {*l_=gen->lmax+1; *il_=il; return;} // set *il_ too: calc_alm2map reads il right after this call
+    below_limit=1;
+    Tv a1=vload(gen->coef[il  ][0]), b1=vload(gen->coef[il  ][1]);
+    Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); // scale changes only on a rescale
+      }
+    l+=4; il+=2;
+    }
+  *l_=l; *il_=il; // report where the caller has to continue
+  }
+
+NOINLINE static void alm2map_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
+  int l, int il, int lmax, int nv2) // accumulates alm*lam into p1r/p1i/p2r/p2i for l..lmax; no corfac is applied here
+  {
+  if (nv2==nv0) // same loop body as the else branch, but with the constant trip count nv0
+    {
+    for (; l<=lmax-2; il+=2, l+=4) // two recurrence steps per pass
+      {
+      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+      for (int i=0; i<nv0; ++i)
+        {
+        d->p1r[i] += d->lam2[i]*ar1; // alm[l] feeds p1, alm[l+1] feeds p2
+        d->p1i[i] += d->lam2[i]*ai1;
+        d->p2r[i] += d->lam2[i]*ar2;
+        d->p2i[i] += d->lam2[i]*ai2;
+        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; // first step: lam1 is updated in place
+        d->p1r[i] += d->lam1[i]*ar3; // alm[l+2] feeds p1, alm[l+3] feeds p2
+        d->p1i[i] += d->lam1[i]*ai3;
+        d->p2r[i] += d->lam1[i]*ar4;
+        d->p2i[i] += d->lam1[i]*ai4;
+        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; // second step: lam2 is updated in place
+        }
+      }
+    }
+  else
+    {
+    for (; l<=lmax-2; il+=2, l+=4) // general case: only the first nv2 vectors are processed
+      {
+      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
+      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
+      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+      for (int i=0; i<nv2; ++i)
+        {
+        d->p1r[i] += d->lam2[i]*ar1;
+        d->p1i[i] += d->lam2[i]*ai1;
+        d->p2r[i] += d->lam2[i]*ar2;
+        d->p2i[i] += d->lam2[i]*ai2;
+        d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
+        d->p1r[i] += d->lam1[i]*ar3;
+        d->p1i[i] += d->lam1[i]*ai3;
+        d->p2r[i] += d->lam1[i]*ar4;
+        d->p2i[i] += d->lam1[i]*ai4;
+        d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
+        }
+      }
+    }
+  for (; l<=lmax; ++il, l+=2) // remainder: one recurrence step per pass
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); // reads alm[l+1] even when l==lmax
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->p2r[i] += d->lam2[i]*ar2;
+      d->p2i[i] += d->lam2[i]*ai2;
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; // single step, so lam1/lam2 are rotated explicitly
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      }
+    }
+  }
+
+NOINLINE static void calc_alm2map (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) // driver: start-up, scaled loop, then alm2map_kernel
+  {
+  int l,il,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN; // number of vectors needed to hold nth values (rounded up)
+  iter_to_ieee(gen, d, &l, &il, nv2);
+  job->opcnt += il * 4*nth; // operation-count bookkeeping for the start-up phase
+  if (l>lmax) return; // start-up already ran past lmax: nothing to accumulate
+  job->opcnt += (lmax+1-l) * 6*nth;
+  // coef holds the recurrence coefficients; alm is read from job->almtmp
+  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1; // becomes 1 once every scale is >= sharp_minscale
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+  // scaled phase: lam values still carry a scale, so each contribution is multiplied by corfac
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    full_ieee=1; // re-evaluated from scratch on every pass
+    for (int i=0; i<nv2; ++i)
+      {
+      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
+      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
+      d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2;
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; // one recurrence step
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf); // scale changed: refresh the correction factor
+      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+      }
+    l+=2; ++il;
+    }
+  if (l>lmax) return;
+  // every scale is now >= sharp_minscale: fold corfac into lam1/lam2 and hand over to the unscaled kernel
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
+  }
+
+NOINLINE static void map2alm_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
+  int il, int lmax, int nv2)
+  { // spin-0 analysis without rescaling: reduces lambda*p over all nv2 vectors into alm, from l up to lmax
+  for (; l<=lmax-2; il+=2, l+=4) // unrolled: two recursion steps (four l values) per pass
+    {
+    Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
+    Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+    Tv atmp1[4] = {vzero, vzero, vzero, vzero}; // accumulators for the first step
+    Tv atmp2[4] = {vzero, vzero, vzero, vzero}; // accumulators for the second step
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp1[0] += d->lam2[i]*d->p1r[i];
+      atmp1[1] += d->lam2[i]*d->p1i[i];
+      atmp1[2] += d->lam2[i]*d->p2r[i];
+      atmp1[3] += d->lam2[i]*d->p2i[i];
+      d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; // lam1 now holds the newer value
+      atmp2[0] += d->lam1[i]*d->p1r[i];
+      atmp2[1] += d->lam1[i]*d->p1i[i];
+      atmp2[2] += d->lam1[i]*d->p2r[i];
+      atmp2[3] += d->lam1[i]*d->p2i[i];
+      d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; // roles of lam1/lam2 are restored
+      }
+    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]); // presumably updates alm[l], alm[l+1] - confirm
+    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
+    }
+  for (; l<=lmax; ++il, l+=2) // remainder: one recursion step per pass
+    {
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp[0] += d->lam2[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->p1i[i];
+      atmp[2] += d->lam2[i]*d->p2r[i];
+      atmp[3] += d->lam2[i]*d->p2i[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    }
+  }
+
+NOINLINE static void calc_map2alm (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+  { // spin-0 analysis for one batch of nth rings: reduces lambda*p from d into job->almtmp
+  int l,il,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN; // number of SIMD vectors needed to hold nth values
+  iter_to_ieee(gen, d, &l, &il, nv2); // sets the starting l and il (helper defined elsewhere in this file)
+  job->opcnt += il * 4*nth;
+  if (l>lmax) return; // no l left to accumulate for this batch
+  job->opcnt += (lmax+1-l) * 6*nth;
+
+  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1; // remains 1 only if every lane has scale >= sharp_minscale
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    }
+  // scaled phase: one recursion step per pass, with rescaling
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    full_ieee=1; // recomputed over all vectors in the loop below
+    for (int i=0; i<nv2; ++i)
+      {
+      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i]; // corfac is applied on the fly; lam2 itself stays scaled
+      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
+      atmp[2] += d->lam2[i]*d->corfac[i]*d->p2r[i];
+      atmp[3] += d->lam2[i]*d->corfac[i]*d->p2i[i];
+      Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
+      d->lam1[i] = d->lam2[i];
+      d->lam2[i] = tmp;
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf); // refresh corfac after a rescale
+      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); // presumably updates alm[l], alm[l+1] - confirm
+    l+=2; ++il;
+    }
+  if (l>lmax) return;
+  // fold the correction factors into lambda, then finish with the kernel that does no rescaling
+  for (int i=0; i<nv2; ++i)
+    {
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
+    }
+  map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
+  }
+
+NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
+  sxdata_v * restrict d, int * restrict l_, int nv2)
+  { // seeds l1p/l2p/l1m/l2m with their scales, then steps the recursion while all scales are below sharp_limscale; *l_ gets the l reached
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
+  Tv prefac=vload(gen->prefac[gen->m]),
+     prescale=vload(gen->fscale[gen->m]);
+  Tv limscale=vload(sharp_limscale);
+  int below_limit=1; // 1 while every lane of every vector has scp and scm below limscale
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5))); // sqrt((1+cth)/2), clamped to >= 1e-15
+    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); // sqrt((1-cth)/2), clamped to >= 1e-15
+    Tm mask=vlt(d->sth[i],vzero); // lanes with negative sth
+    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); // sth<0 and cth<0: presumably negates cth2 in those lanes
+    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); // sth<0 and cth>0: presumably negates sth2 in those lanes
+
+    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; // mypow outputs: value plus a companion scale term
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
+
+    d->l1p[i] = vzero;
+    d->l1m[i] = vzero;
+    d->l2p[i] = prefac*ccp;
+    d->scp[i] = prescale+ccps;
+    d->l2m[i] = prefac*csp;
+    d->scm[i] = prescale+csps;
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf); // normalize before multiplying in the second power
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
+    d->l2p[i] *= ssp;
+    d->scp[i] += ssps;
+    d->l2m[i] *= scp;
+    d->scm[i] += scps;
+    if (gen->preMinus_p)
+      d->l2p[i] = vneg(d->l2p[i]);
+    if (gen->preMinus_m)
+      d->l2m[i] = vneg(d->l2m[i]);
+    if (gen->s&1) // odd spin: one more sign flip on the p branch
+      d->l2p[i] = vneg(d->l2p[i]);
+
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
+
+    below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
+                   vallTrue(vlt(d->scp[i],limscale));
+    }
+
+  int l=gen->mhi; // the recursion starts at l = gen->mhi
+
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} // report lmax+1 so callers' (l>lmax) check returns early
+    below_limit=1; // scales only change on rescale, so lanes without a rescale remain below the limit
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || // NOTE(review): || short-circuits, so the m-branch
+          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))   // rescale is skipped on passes where the p-branch rescaled
+        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                       vallTrue(vlt(d->scm[i],limscale));
+      }
+    l+=2;
+    }
+
+  *l_=l;
+  }
+
+NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  { // spin synthesis without rescaling, as two passes over l: the p branch fills p1*, then the m branch fills p2*
+  int lsave = l; // both passes start from the same l
+  while (l<=lmax) // pass 1: l1p/l2p recursion
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])), // alm holds two entries per l: index 2*l and 2*l+1
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), // the pair for l+1
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->p1pr[i] += agr1*d->l2p[i];
+      d->p1pi[i] += agi1*d->l2p[i];
+      d->p1mr[i] += acr1*d->l2p[i];
+      d->p1mi[i] += aci1*d->l2p[i];
+
+      d->p1pr[i] += aci2*d->l1p[i];
+      d->p1pi[i] -= acr2*d->l1p[i];
+      d->p1mr[i] -= agi2*d->l1p[i];
+      d->p1mi[i] += agr2*d->l1p[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      }
+    l+=2;
+    }
+  l=lsave;
+  while (l<=lmax) // pass 2: l1m/l2m recursion over the same l range
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      d->p2pr[i] -= aci1*d->l2m[i];
+      d->p2pi[i] += acr1*d->l2m[i];
+      d->p2mr[i] += agi1*d->l2m[i];
+      d->p2mi[i] -= agr1*d->l2m[i];
+
+      d->p2pr[i] += agr2*d->l1m[i];
+      d->p2pi[i] += agi2*d->l1m[i];
+      d->p2mr[i] += acr2*d->l1m[i];
+      d->p2mi[i] += aci2*d->l1m[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  { // spin synthesis for one batch of nth rings: accumulates into the eight p1*/p2* arrays of d
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN; // number of SIMD vectors needed to hold nth values
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 7*nth;
+  if (l>lmax) return; // iter_to_ieee_spin reports lmax+1 when it runs out of l
+  job->opcnt += (lmax+1-l) * 23*nth;
+
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1; // remains 1 only if every lane has scp and scm >= sharp_minscale
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+  // scaled phase: both branches advance together, correction factors applied on the fly
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    full_ieee=1; // recomputed over all vectors in the loop below
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+
+      Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i]; // corrected copies; the stored values stay scaled
+      Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
+
+      d->p1pr[i] += agr1*l2p + aci2*l1p;
+      d->p1pi[i] += agi1*l2p - acr2*l1p;
+      d->p1mr[i] += acr1*l2p - agi2*l1p;
+      d->p1mi[i] += aci1*l2p + agr2*l1p;
+
+      d->p2pr[i] += agr2*l1m - aci1*l2m;
+      d->p2pi[i] += agi2*l1m + acr1*l2m;
+      d->p2mr[i] += acr2*l1m + agi1*l2m;
+      d->p2mi[i] += aci2*l1m - agr1*l2m;
+
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); // tested on every pass, rescale or not
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      }
+    l+=2;
+    }
+//  if (l>lmax) return;  (disabled: returning here would skip the p1/p2 combination at the end of this function)
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_spin_kernel(d, fx, alm, l, lmax, nv2); // its loops run only while l<=lmax
+  // final step: combine the p-branch and m-branch accumulators in place (sum in one array, difference in the other)
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv tmp;
+    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
+    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
+    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
+    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
+    }
+  }
+
+NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  { // spin analysis without rescaling, as two passes over l: p branch against p2*, then m branch against p1*
+  int lsave=l; // both passes start from the same l
+  while (l<=lmax) // pass 1: l1p/l2p recursion
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; // accumulators reduced into alm[2*l...]
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; // accumulators reduced into alm[2*l+2...]
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      agr1 += d->p2mi[i]*d->l2p[i];
+      agi1 -= d->p2mr[i]*d->l2p[i];
+      acr1 -= d->p2pi[i]*d->l2p[i];
+      aci1 += d->p2pr[i]*d->l2p[i];
+      agr2 += d->p2pr[i]*d->l1p[i];
+      agi2 += d->p2pi[i]*d->l1p[i];
+      acr2 += d->p2mr[i]*d->l1p[i];
+      aci2 += d->p2mi[i]*d->l1p[i];
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); // presumably updates alm[2*l], alm[2*l+1] - confirm
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  l=lsave;
+  while (l<=lmax) // pass 2: l1m/l2m recursion, reducing into the same alm entries
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      agr1 += d->p1pr[i]*d->l2m[i];
+      agi1 += d->p1pi[i]*d->l2m[i];
+      acr1 += d->p1mr[i]*d->l2m[i];
+      aci1 += d->p1mi[i]*d->l2m[i];
+      agr2 -= d->p1mi[i]*d->l1m[i];
+      agi2 += d->p1mr[i]*d->l1m[i];
+      acr2 += d->p1pi[i]*d->l1m[i];
+      aci2 -= d->p1pr[i]*d->l1m[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  { // spin analysis for one batch of nth rings: reduces the p1*/p2* arrays of d into job->almtmp
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN; // number of SIMD vectors needed to hold nth values
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 7*nth;
+  if (l>lmax) return; // iter_to_ieee_spin reports lmax+1 when it runs out of l
+  job->opcnt += (lmax+1-l) * 23*nth;
+
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1; // remains 1 only if every lane has scp and scm >= sharp_minscale
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+  for (int i=0; i<nv2; ++i) // combine the input arrays in place before any accumulation (same pattern that ends calc_alm2map_spin)
+    {
+    Tv tmp;
+    tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
+    tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
+    tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
+    tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
+    }
+  // scaled phase: both branches advance together, correction factors applied on the fly
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    full_ieee=1; // recomputed over all vectors in the loop below
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i]; // corrected copies; the stored values stay scaled
+      Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i];
+      agr1 += d->p1pr[i]*l2m + d->p2mi[i]*l2p;
+      agi1 += d->p1pi[i]*l2m - d->p2mr[i]*l2p;
+      acr1 += d->p1mr[i]*l2m - d->p2pi[i]*l2p;
+      aci1 += d->p1mi[i]*l2m + d->p2pr[i]*l2p;
+      agr2 += d->p2pr[i]*l1p - d->p1mi[i]*l1m;
+      agi2 += d->p2pi[i]*l1p + d->p1mr[i]*l1m;
+      acr2 += d->p2mr[i]*l1p + d->p1pi[i]*l1m;
+      aci2 += d->p2mi[i]*l1p - d->p1pr[i]*l1m;
+
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); // tested on every pass, rescale or not
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); // presumably updates alm[2*l], alm[2*l+1] - confirm
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  if (l>lmax) return;
+  // fold the correction factors into the recursion values, then finish with the kernel that does no rescaling
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
+
+NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  { // first-derivative synthesis without rescaling: one alm entry per l, both recursion branches in a single pass
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])), // coefficient for l
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); // coefficient for l+1
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      Tv lw=d->l2p[i]+d->l2m[i]; // sum of the two branches at l
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]-d->l2p[i]; // difference of the two branches at l
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]+d->l1m[i]; // same sum and difference at l+1, feeding the other four accumulators
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]-d->l1p[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  { // first-derivative synthesis for one batch of nth rings: accumulates into the eight p1*/p2* arrays of d
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN; // number of SIMD vectors needed to hold nth values
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 7*nth;
+  if (l>lmax) return; // iter_to_ieee_spin reports lmax+1 when it runs out of l
+  job->opcnt += (lmax+1-l) * 17*nth;
+
+  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1; // remains 1 only if every lane has scp and scm >= sharp_minscale
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+  // scaled phase: both branches advance together, correction factors applied on the fly
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
+    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    full_ieee=1; // recomputed over all vectors in the loop below
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
+      d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
+      Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i]; // corrected sum of the two branches at l
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i]; // corrected difference at l
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i]; // same sum and difference at l+1
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
+      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
+      // The scale tests must run on every pass, not only after a rescale:
+      // full_ieee was reset to 1 above, so a pass without any rescale would
+      // otherwise end the scaled phase while some scales are still below
+      // sharp_minscale (calc_alm2map_spin tests unconditionally as well).
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+  // fold the correction factors into the recursion values, then finish with the kernel that does no rescaling
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
+
+#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
+
+NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  { // synthesis for one m (index mi): turns job->almtmp into job->phase entries for the ulim-llim rings given by cth_/sth_
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_ALM2MAP:
+    case SHARP_ALM2MAP_DERIV1:
+      {
+      if (job->spin==0)
+        {
+        //adjust the a_lm for the new algorithm
+        dcmplx * restrict alm=job->almtmp;
+        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) // rewrites almtmp in place, two l values per step
+          {
+          dcmplx al = alm[l];
+          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; // entries beyond lmax count as zero
+          dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2];
+          alm[l  ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2);
+          alm[l+1] = gen->alpha[il]*al1;
+          }
+
+        const int nval=nv0*VLEN; // maximum number of rings per batch
+        int ith=0;
+        int itgt[nval]; // batch slot -> ring index
+        while (ith<ulim-llim)
+          {
+          s0data_u d;
+          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim)) // fill the batch with rings that need this m
+            {
+            if (mlim[ith]>=m)
+              {
+              itgt[nth] = ith;
+              d.s.csq[nth]=cth_[ith]*cth_[ith];
+              d.s.sth[nth]=sth_[ith];
+              ++nth;
+              }
+            else // ring is skipped for this m: its phase entries are set to zero
+              {
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              job->phase[phas_idx] = job->phase[phas_idx+1] = 0;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN; // nth rounded up to a whole number of vectors
+            for (int i=nth; i<i2; ++i) // pad the last vector with copies of the last ring
+              {
+              d.s.csq[i]=d.s.csq[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
+              }
+            calc_alm2map (job, gen, &d.v, nth);
+            for (int i=0; i<nth; ++i) // padded slots (i>=nth) are never written back
+              {
+              int tgt=itgt[i];
+              //adjust for new algorithm
+              d.s.p2r[i]*=cth_[tgt];
+              d.s.p2i[i]*=cth_[tgt];
+              int phas_idx = tgt*job->s_th + mi*job->s_m;
+              complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
+                             r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
+              job->phase[phas_idx] = r1+r2;
+              if (ispair[tgt]) // paired ring: the partner entry gets the difference
+                job->phase[phas_idx+1] = r1-r2;
+              }
+            }
+          }
+        }
+      else
+        {
+        //adjust the a_lm for the new algorithm
+        if (job->nalm==2) // two almtmp entries per l
+          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
+            {
+            job->almtmp[2*l  ]*=gen->alpha[l];
+            job->almtmp[2*l+1]*=gen->alpha[l];
+            }
+        else // one almtmp entry per l
+          for (int l=gen->mhi; l<=gen->lmax+1; ++l)
+            job->almtmp[l]*=gen->alpha[l];
+
+        const int nval=nvx*VLEN; // maximum number of rings per batch
+        int ith=0;
+        int itgt[nval]; // batch slot -> ring index
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
+          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim)) // fill the batch with rings that need this m
+            {
+            if (mlim[ith]>=m)
+              {
+              itgt[nth] = ith;
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              ++nth;
+              }
+            else // ring is skipped for this m: its four phase entries are set to zero
+              {
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              job->phase[phas_idx  ] = job->phase[phas_idx+1] = 0;
+              job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN; // nth rounded up to a whole number of vectors
+            for (int i=nth; i<i2; ++i) // pad the last vector with copies of the last ring
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            (job->type==SHARP_ALM2MAP) ? // conditional expression used as a statement to select the kernel
+              calc_alm2map_spin  (job, gen, &d.v, nth) :
+              calc_alm2map_deriv1(job, gen, &d.v, nth);
+            for (int i=0; i<nth; ++i) // padded slots (i>=nth) are never written back
+              {
+              int tgt=itgt[i];
+              int phas_idx = tgt*job->s_th + mi*job->s_m;
+              complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
+                             q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
+                             u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
+                             u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
+              job->phase[phas_idx  ] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[tgt]) // paired ring: partner entries get the differences
+                {
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1) // odd mhi-m+s: negate the partner values
+                  { *phQ=-(*phQ); *phU=-(*phU); }
+                }
+              }
+            }
+          }
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  { // analysis for one m (index mi): reduces job->phase entries of the ulim-llim rings into job->almtmp
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case SHARP_MAP2ALM:
+      {
+      if (job->spin==0)
+        {
+        const int nval=nv0*VLEN; // maximum number of rings per batch
+        int ith=0;
+        while (ith<ulim-llim)
+          {
+          s0data_u d;
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim)) // fill the batch with rings that need this m
+            {
+            if (mlim[ith]>=m)
+              {
+              d.s.csq[nth]=cth_[ith]*cth_[ith]; d.s.sth[nth]=sth_[ith];
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              dcmplx ph1=job->phase[phas_idx];
+              dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; // unpaired rings have no partner value
+              d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
+              d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
+              //adjust for new algorithm
+              d.s.p2r[nth]*=cth_[ith];
+              d.s.p2i[nth]*=cth_[ith];
+              ++nth;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN; // nth rounded up to a whole number of vectors
+            for (int i=nth; i<i2; ++i) // padding slots carry zero input, so they add nothing to the sums
+              {
+              d.s.csq[i]=d.s.csq[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
+              }
+            calc_map2alm (job, gen, &d.v, nth);
+            }
+          }
+        //adjust the a_lm for the new algorithm
+        dcmplx * restrict alm=job->almtmp;
+        dcmplx alm2 = 0.; // unmodified alm[l] saved from the previous step
+        double alold=0; // alpha[il] saved from the previous step
+        for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) // rewrites almtmp in place, two l values per step
+          {
+          dcmplx al = alm[l];
+          dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; // entries beyond lmax count as zero
+          alm[l  ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2;
+          alm[l+1] = gen->alpha[il]*al1;
+          alm2=al;
+          alold=gen->alpha[il];
+          }
+        }
+      else
+        {
+        const int nval=nvx*VLEN; // maximum number of rings per batch
+        int ith=0;
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim)) // fill the batch with rings that need this m
+            {
+            if (mlim[ith]>=m)
+              {
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[ith] ? job->phase[phas_idx+1]:0., // unpaired rings have no partner values
+                     p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1) // odd mhi-m+s: negate the partner values
+                { p2Q=-p2Q; p2U=-p2U; }
+              d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
+              d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
+              d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
+              d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
+              ++nth;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN; // nth rounded up to a whole number of vectors
+            for (int i=nth; i<i2; ++i) // padding slots carry zero input, so they add nothing to the sums
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            calc_map2alm_spin(job, gen, &d.v, nth);
+            }
+          }
+        //adjust the a_lm for the new algorithm
+        for (int l=gen->mhi; l<=gen->lmax; ++l) // scale both almtmp entries of each l by alpha[l]
+          {
+          job->almtmp[2*l  ]*=gen->alpha[l];
+          job->almtmp[2*l+1]*=gen->alpha[l];
+          }
+        }
+      break;
+      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+void XARCH(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
+  { // entry point: SHARP_MAP2ALM jobs go to the analysis loop, every other type to synthesis
+  if (job->type!=SHARP_MAP2ALM)
+    inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
+  else inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
+  }
+
+#undef VZERO
+
+int XARCH(sharp_veclen)(void)
+  { // reports the compile-time VLEN used by this build of the kernels
+  const int veclen = VLEN; return veclen;
+  }
+
+int XARCH(sharp_max_nvec)(int spin)
+  { // nv0 for spin 0, nvx for any other spin (the per-batch vector counts used by the inner loops)
+  if (spin!=0) return nvx; else return nv0;
+  }
+
+#define xstr(a) str(a) // extra level so the argument is macro-expanded before being stringified
+#define str(a) #a
+const char *XARCH(sharp_architecture)(void)
+  { // returns the expansion of the ARCH macro as a string literal
+  return xstr(ARCH);
+  }
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 4c67cc8..a6b3120 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -68,5 +68,6 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
 
 int sharp_veclen(void);
 int sharp_max_nvec(int spin);
+const char *sharp_architecture(void);
 
 #endif
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 1a05bdc..f22a91e 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -71,9 +71,6 @@ static void MPI_status(void)
 #endif
   }
 
-static void vecmath_status(void)
-  { printf("Supported vector length: %d\n",sharp_veclen()); }
-
 static void sharp_announce (const char *name)
   {
   size_t m, nlen=strlen(name);
@@ -84,7 +81,8 @@ static void sharp_announce (const char *name)
   printf("+-");
   for (m=0; m<nlen; ++m) printf("-");
   printf("-+\n\n");
-  vecmath_status();
+  printf("Detected hardware architecture: %s\n", sharp_architecture());
+  printf("Supported vector length: %d\n", sharp_veclen());
   OpenMP_status();
   MPI_status();
   printf("\n");

From 18c04674f25684f639df395905ad87fccf662f61 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 16:26:00 +0100
Subject: [PATCH 79/85] fix sharp_cxx.h

---
 libsharp/sharp_cxx.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index 049d89e..b2415e0 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -165,7 +165,7 @@ template<typename T> class sharp_cxxjob: public sharp_base
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
       }
     void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
       {
@@ -173,6 +173,21 @@ template<typename T> class sharp_cxxjob: public sharp_base
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
       }
+    void alm2map_spin_adjoint (const T *map1, const T *map2, T *alm1, T *alm2,
+      int spin, bool add) const
+      { // Runs a SHARP_Yt job for the given spin on two map arrays and two a_lm arrays.
+      void *aptr[2], *mptr[2]; // pointer pairs passed to sharp_execute below
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); // SHARP_ADD is set only when add is true
+      sharp_execute (SHARP_Yt,spin,aptr,mptr,ginfo,ainfo,flags,0,0); // same job type as alm2map_adjoint, but with the caller's spin
+      }
+    void alm2map_spin_adjoint (const T *map1, const T *map2,
+      std::complex<T> *alm1, std::complex<T> *alm2, int spin, bool add) const
+      { // Convenience overload: forwards to the T* version above.
+      alm2map_spin_adjoint (map1, map2, reinterpret_cast<T *>(alm1), // each std::complex<T> array is passed on as a plain T*
+        reinterpret_cast<T *>(alm2), spin, add);
+      }
     void map2alm (const T *map, T *alm, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);

From 7b4353a5e7bd89b3b6a9dc4d9ce2c86a67387638 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 17:25:21 +0100
Subject: [PATCH 80/85] tweaks

---
 .gitignore                | 8 +++++++-
 libsharp/sharp_core_inc.c | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4675273..782109a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,15 +9,21 @@
 .libs
 **/.deps
 **/.dirstamp
-
+libsharp-uninstalled.pc
+libsharp-uninstalled.sh
+libsharp.pc
+libsharp.pc.in
+perf.data*
 /auto
 /autom4te.cache
 /m4
 config.log
 config.guess
+config.status
 config.sub
 ltmain.sh
 compile
+libtool
 missing
 /comp
 /configure
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index ddd08df..f4bd041 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -22,7 +22,7 @@
  *  (DLR).
  */
 
-/*! \file sharp_core.c
+/*! \file sharp_core_inc.c
  *  Computational core
  *
  *  Copyright (C) 2012-2019 Max-Planck-Society

From b5eda7fc0a29abcf670db2b7968ca03b430888bb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 11 Jan 2019 19:41:23 +0100
Subject: [PATCH 81/85] fixes for extra paranoia

---
 libsharp/sharp_core_inc.c | 66 +++++++++++++++++++++------------------
 libsharp/sharp_ylmgen_c.c | 12 +++----
 libsharp/sharp_ylmgen_c.h |  2 +-
 3 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index f4bd041..d229a49 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -190,8 +190,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
     {
     if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv a1=vload(gen->coef[il  ][0]), b1=vload(gen->coef[il  ][1]);
-    Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]);
+    Tv a1=vload(gen->coef[il  ].a), b1=vload(gen->coef[il  ].b);
+    Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
     for (int i=0; i<nv2; ++i)
       {
       d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
@@ -216,8 +216,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
       Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
       Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+      Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
+      Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
       for (int i=0; i<nv0; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -241,8 +241,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
       Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
       Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
       Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-      Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+      Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
+      Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
       for (int i=0; i<nv2; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -262,7 +262,7 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv a=vload(coef[il].a), b=vload(coef[il].b);
     for (int i=0; i<nv2; ++i)
       {
       d->p1r[i] += d->lam2[i]*ar1;
@@ -299,7 +299,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
     {
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
     Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv a=vload(coef[il].a), b=vload(coef[il].b);
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -332,8 +332,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
   {
   for (; l<=lmax-2; il+=2, l+=4)
     {
-    Tv a1=vload(coef[il  ][0]), b1=vload(coef[il  ][1]);
-    Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
+    Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
+    Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
     Tv atmp1[4] = {vzero, vzero, vzero, vzero};
     Tv atmp2[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
@@ -354,7 +354,7 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
     }
   for (; l<=lmax; ++il, l+=2)
     {
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv a=vload(coef[il].a), b=vload(coef[il].b);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
@@ -391,7 +391,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
+    Tv a=vload(coef[il].a), b=vload(coef[il].b);
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
@@ -474,8 +474,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@@ -500,8 +500,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   int lsave = l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -525,8 +525,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -572,8 +572,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
        acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
     Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
@@ -636,8 +636,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   int lsave=l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     for (int i=0; i<nv2; ++i)
@@ -660,8 +660,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     for (int i=0; i<nv2; ++i)
@@ -714,8 +714,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
     Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
     full_ieee=1;
@@ -766,8 +766,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
   {
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     for (int i=0; i<nv2; ++i)
@@ -816,8 +816,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
-    Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
+    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
+    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
     Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
        ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
     full_ieee=1;
@@ -1146,6 +1146,9 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
     }
   }
 
+void XARCH(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim);
 void XARCH(inner_loop) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
@@ -1157,11 +1160,13 @@ void XARCH(inner_loop) (sharp_job *job, const int *ispair,
 
 #undef VZERO
 
+int XARCH(sharp_veclen)(void); /* prototype directly ahead of the definition; NOTE(review): presumably to satisfy missing-prototype warnings - confirm */
 int XARCH(sharp_veclen)(void)
   {
   return VLEN;
   }
 
+int XARCH(sharp_max_nvec)(int spin);
 int XARCH(sharp_max_nvec)(int spin)
   {
   return (spin==0) ? nv0 : nvx;
@@ -1169,6 +1174,7 @@ int XARCH(sharp_max_nvec)(int spin)
 
 #define xstr(a) str(a)
 #define str(a) #a
+const char *XARCH(sharp_architecture)(void);
 const char *XARCH(sharp_architecture)(void)
   {
   return xstr(ARCH);
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index ffa3e0f..f408eea 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -89,7 +89,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->m=gen->mlo=gen->mhi=-1234567890;
     ALLOC(gen->coef,sharp_ylmgen_dbl2,gen->lmax+3);
     for (int m=0; m<gen->lmax+3; ++m)
-      gen->coef[m][0]=gen->coef[m][1]=0.;
+      gen->coef[m].a=gen->coef[m].b=0.;
     ALLOC(gen->alpha,double,gen->lmax+3);
     ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
@@ -172,9 +172,9 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
                        /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
     for (int il=0, l=m; l<gen->lmax+2; ++il, l+=2)
       {
-      gen->coef[il][0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+      gen->coef[il].a = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
       double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
-      gen->coef[il][1] = -gen->coef[il][0]*(t1*t1+t2*t2);
+      gen->coef[il].b = -gen->coef[il].a*(t1*t1+t2*t2);
       }
     }
   else
@@ -188,7 +188,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
     if (!ms_similar)
       {
       gen->alpha[gen->mhi] = 1.;
-      gen->coef[gen->mhi][0] = gen->coef[gen->mhi][1] = 0.;
+      gen->coef[gen->mhi].a = gen->coef[gen->mhi].b = 0.;
       for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
@@ -204,8 +204,8 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
           gen->alpha[l+1] = gen->alpha[l-1]*flp12;
         else
           gen->alpha[l+1] = 1.;
-        gen->coef[l+1][0] = flp10*gen->alpha[l]/gen->alpha[l+1];
-        gen->coef[l+1][1] = flp11*gen->coef[l+1][0];
+        gen->coef[l+1].a = flp10*gen->alpha[l]/gen->alpha[l+1];
+        gen->coef[l+1].b = flp11*gen->coef[l+1].a;
         }
       }
 
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index b36346a..130d797 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -41,7 +41,7 @@ static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800;
 static const double sharp_ftol=0x1p-60;
 static const double sharp_fbighalf=0x1p+400;
 
-typedef double sharp_ylmgen_dbl2[2];
+typedef struct { double a, b; } sharp_ylmgen_dbl2; /* pair of doubles, accessed as .a and .b */
 
 typedef struct
   {

From 3a9705e3cc0f2b246b357c185e3b2892a07867ac Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 12 Jan 2019 12:31:26 +0100
Subject: [PATCH 82/85] better documentation

---
 COMPILE | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 73 insertions(+), 13 deletions(-)

diff --git a/COMPILE b/COMPILE
index 8823bd3..8a5f3cd 100644
--- a/COMPILE
+++ b/COMPILE
@@ -1,23 +1,83 @@
 Libsharp is configured, compiled and installed using GNU autotools.
-The most complicated step for the user is selecting the appropriate compiler
-flags (and in some cases the compiler).
 
-Here are a few (hopefully helpful) examples:
+If you have cloned the libsharp repository, you have to run
+"autoreconf -i" before starting the configuration, which requires several
+GNU developer tools to be available on your system.
 
-GCC, OpenMP, portable executable:
-CFLAGS="-std=c99 -O3 -ffast-math -flto -fopenmp" ./configure
+When using a release tarball, configuration is done via
 
-GCC, OpenMP, specific optimization for the target CPU:
-CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto -fopenmp" ./configure
+[CC=...] [CFLAGS=...] ./configure
 
-GCC, no OpenMP, specific optimization for the target CPU:
-CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto" ./configure
+The following sections briefly describe possible choices for compilers and
+flags.
 
-Clang:
-CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -flto -fopenmp" ./configure
 
-MPI support:
-CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math -flto" ./configure
+Fast math
+---------
+
+Specifying "-ffast-math" is important for all compilers, since it allows the
+compiler to fuse multiplications and additions into FMA instructions, which is
+forbidden by the C99 standard. Since FMAs are a central aspect of the algorithm,
+they are needed for optimum performance.
+
+If you are calling libsharp from other code which requires strict adherence
+to the C99 standard, you should still be able to compile libsharp with
+"-ffast-math" without any problems.
+
+
+Runtime CPU selection with gcc
+------------------------------
+
+When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
+machinery will compile the time-critical functions for several different
+architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
+implementation will be selected at runtime.
+This only happens if you do _not_ explicitly specify a target architecture via
+the compiler flags. I.e., please do _not_ specify "-march=native" or
+"-mavx" or similar if you want a portable binary that will run
+efficiently on different x86_64 CPUs.
+If you are compiling libsharp for a particular target CPU only, or if you are
+using a different compiler, however, "-march=native" should be used. The
+resulting binary will most likely not run on other computers, though.
+
+
+OpenMP
+------
+
+OpenMP should be switched on for maximum performance, and at runtime
+OMP_NUM_THREADS should be set to the number of hardware threads (not physical
+cores) of the system.
+(Usually this is already the default setting when OMP_NUM_THREADS is not
+specified.)
+
+
+MPI
+---
+
+MPI support is enabled by using the MPI compiler (typically "mpicc") _and_
+adding the flag "-DUSE_MPI".
+When using MPI and OpenMP simultaneously, the product of MPI tasks per node
+and OMP_NUM_THREADS should be equal to the number of hardware threads available
+on the node. One MPI task per node should result in the best performance.
+
+
+Example configure invocations
+=============================
+
+GCC, OpenMP, portable binary:
+CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
+
+GCC, no OpenMP, portable binary:
+CFLAGS="-std=c99 -O3 -ffast-math" ./configure
+
+Clang, OpenMP, nonportable binary:
+CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+
+Intel C compiler, OpenMP, nonportable binary:
+CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+
+MPI support, nonportable binary:
+CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
 
 Additional GCC flags for pedantic warning and debugging:
 

From 78a3580901c134024b7f1042d21ea94bd4ae65fa Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 12 Jan 2019 12:56:27 +0100
Subject: [PATCH 83/85] add test

---
 Makefile.am | 5 +++--
 runtest.sh  | 4 ++++
 2 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100755 runtest.sh

diff --git a/Makefile.am b/Makefile.am
index a1541f8..bcf53ff 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -30,7 +30,8 @@ include_HEADERS = \
   libsharp/sharp_cxx.h
 
 EXTRA_DIST = \
-  libsharp/sharp_core_inc.c
+  libsharp/sharp_core_inc.c \
+  runtest.sh
 
 libsharp_la_SOURCES = $(src_sharp)
 
@@ -38,7 +39,7 @@ check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
 sharp_testsuite_LDADD = libsharp.la
 
-#TESTS = ffttest
+TESTS = runtest.sh
 
 AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
 
diff --git a/runtest.sh b/runtest.sh
new file mode 100755
index 0000000..be291ec
--- /dev/null
+++ b/runtest.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./sharp_testsuite acctest # relative path: expects sharp_testsuite in the current directory; its exit status becomes the script's
+

From da16218781e41993a93e98a88adf5ffb51de3489 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 15 Jan 2019 11:07:34 +0100
Subject: [PATCH 84/85] add README.md

---
 README.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7620953
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# Libsharp
+
+Library for efficient spherical harmonic transforms at arbitrary spins,
+supporting CPU vectorization, OpenMP and MPI.
+
+## Paper
+
+https://arxiv.org/abs/1303.4945
+
+## News
+
+### January 2019
+
+This update features significant speedups thanks to important algorithmic
+discoveries by Keiichi Ishioka
+(https://www.jstage.jst.go.jp/article/jmsj/96/2/96_2018-019/_article and
+personal communication).
+
+These improvements reduce the fraction of CPU time spent on evaluating the
+recurrences for Y_lm coefficients, which means that computing multiple
+simultaneous SHTs no longer has a big performance advantage compared to SHTs
+done one after the other.
+As a consequence, libsharp support for simultaneous SHTs was dropped, making
+its interface much simpler.
+
+With the proper compilers and flags (see the file COMPILE for details) libsharp
+is now built with support for SSE2, AVX, AVX2, FMA3, FMA4 and AVX512F, and the
+appropriate implementation is selected dynamically at runtime. This should
+provide a very significant performance boost for everyone using pre-compiled
+portable binaries.
+
+### Compilation
+
+The library uses the standard `autotools` mechanism for configuration,
+compilation and installation. See the file `COMPILE` for configuration hints.

From 7440aab6ecf697aa43b9acb5c0114fe87db9bc8a Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 15 Jan 2019 11:12:47 +0100
Subject: [PATCH 85/85] cosmetics

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7620953..c993dd1 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ appropriate implementation is selected dynamically at runtime. This should
 provide a very significant performance boost for everyone using pre-compiled
 portable binaries.
 
-### Compilation
+## Compilation
 
 The library uses the standard `autotools` mechanism for configuration,
 compilation and installation. See the file `COMPILE` for configuration hints.