From db08a08f778c3864e78a5cf17dd8785c3bf3f773 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 14 Feb 2019 17:28:28 +0100
Subject: [PATCH 01/17] revamp include mechanism; update copyright; fix for OSX

---
 Makefile.am                     |  4 ++--
 c_utils/c_utils.c               | 10 +++-------
 c_utils/c_utils.h               |  8 ++------
 c_utils/memusage.c              | 10 +++-------
 c_utils/memusage.h              |  8 ++------
 c_utils/walltime_c.c            | 10 +++-------
 c_utils/walltime_c.h            |  8 ++------
 configure.ac                    |  6 ------
 libsharp/sharp.c                | 20 ++++++++------------
 libsharp/sharp.h                |  6 +-----
 libsharp/sharp_almhelpers.c     | 12 ++++--------
 libsharp/sharp_almhelpers.h     |  8 ++------
 libsharp/sharp_core.c           | 33 ++++++++++++++++++++++++++++++++-
 libsharp/sharp_core_inc.c       | 19 ++++++++++---------
 libsharp/sharp_cxx.h            | 15 ++++-----------
 libsharp/sharp_geomhelpers.c    | 14 +++++---------
 libsharp/sharp_geomhelpers.h    |  8 ++------
 libsharp/sharp_internal.h       | 10 +++-------
 libsharp/sharp_legendre_roots.c |  4 ++--
 libsharp/sharp_legendre_roots.h |  6 +-----
 libsharp/sharp_mpi.c            |  8 ++------
 libsharp/sharp_mpi.h            |  8 ++------
 libsharp/sharp_testsuite.c      | 20 ++++++++------------
 libsharp/sharp_vecsupport.h     |  6 +-----
 libsharp/sharp_ylmgen_c.c       | 10 +++-------
 libsharp/sharp_ylmgen_c.h       |  6 +-----
 pocketfft/LICENSE.md            | 25 +++++++++++++++++++++++++
 pocketfft/pocketfft.c           |  2 +-
 pocketfft/pocketfft.h           |  2 +-
 29 files changed, 135 insertions(+), 171 deletions(-)
 create mode 100644 pocketfft/LICENSE.md

diff --git a/Makefile.am b/Makefile.am
index 0dfc6e1..9345ea5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,7 +38,7 @@ libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
 
 endif
 
-include_HEADERS = \
+nobase_include_HEADERS = \
   libsharp/sharp.h \
   libsharp/sharp_geomhelpers.h \
   libsharp/sharp_almhelpers.h \
@@ -53,7 +53,7 @@ sharp_testsuite_LDADD = libsharp.la
 
 TESTS = runtest.sh
 
-AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
+AM_CFLAGS = @AM_CFLAGS@
 
 pkgconfigdir = $(libdir)/pkgconfig
 nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c
index 9344a6d..cf4f409 100644
--- a/c_utils/c_utils.c
+++ b/c_utils/c_utils.c
@@ -16,21 +16,17 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*
  *  Convenience functions
  *
- *  Copyright (C) 2008-2017 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
 #include <stdio.h>
-#include "c_utils.h"
+#include "c_utils/c_utils.h"
 
 void util_fail_ (const char *file, int line, const char *func, const char *msg)
   {
diff --git a/c_utils/c_utils.h b/c_utils/c_utils.h
index 01c64ad..5aed9a1 100644
--- a/c_utils/c_utils.h
+++ b/c_utils/c_utils.h
@@ -16,16 +16,12 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file c_utils.h
  *  Convenience functions
  *
- *  Copyright (C) 2008-2017 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  \author Martin Reinecke
  *  \note This file should only be included from .c files, NOT from .h files.
  */
diff --git a/c_utils/memusage.c b/c_utils/memusage.c
index a1c25c0..fd0c331 100644
--- a/c_utils/memusage.c
+++ b/c_utils/memusage.c
@@ -16,22 +16,18 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*
  *  Functionality for measuring memory consumption
  *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
 #include <stdio.h>
 #include <string.h>
-#include "memusage.h"
+#include "c_utils/memusage.h"
 
 double residentSetSize(void)
   {
diff --git a/c_utils/memusage.h b/c_utils/memusage.h
index fa0ac43..73c55c3 100644
--- a/c_utils/memusage.h
+++ b/c_utils/memusage.h
@@ -16,16 +16,12 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file memusage.h
  *  Functionality for measuring memory consumption
  *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/c_utils/walltime_c.c b/c_utils/walltime_c.c
index 8f4ac0c..f6b8a08 100644
--- a/c_utils/walltime_c.c
+++ b/c_utils/walltime_c.c
@@ -16,16 +16,12 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*
  *  Functionality for reading wall clock time
  *
- *  Copyright (C) 2010-2016 Max-Planck-Society
+ *  Copyright (C) 2010-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -40,7 +36,7 @@
 #include <stdlib.h>
 #endif
 
-#include "walltime_c.h"
+#include "c_utils/walltime_c.h"
 
 double wallTime(void)
   {
diff --git a/c_utils/walltime_c.h b/c_utils/walltime_c.h
index c291f17..d74dc6c 100644
--- a/c_utils/walltime_c.h
+++ b/c_utils/walltime_c.h
@@ -16,16 +16,12 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file walltime_c.h
  *  Functionality for reading wall clock time
  *
- *  Copyright (C) 2010 Max-Planck-Society
+ *  Copyright (C) 2010-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/configure.ac b/configure.ac
index a619d3f..7f6b5d8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -14,12 +14,6 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 LT_INIT
 AC_CONFIG_MACRO_DIR([m4])
 
-dnl
-dnl By default, install the headers into a subdirectory of
-dnl ${prefix}/include to avoid possible header filename collisions.
-dnl
-includedir="${includedir}/${PACKAGE_NAME}"
-
 dnl
 dnl Enable silent build rules if this version of Automake supports them
 dnl
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 456b69c..59e4638 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -16,28 +16,24 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2016 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
 #include <math.h>
 #include <string.h>
 #include "pocketfft/pocketfft.h"
-#include "sharp_ylmgen_c.h"
-#include "sharp_internal.h"
-#include "c_utils.h"
-#include "walltime_c.h"
-#include "sharp_almhelpers.h"
-#include "sharp_geomhelpers.h"
+#include "libsharp/sharp_ylmgen_c.h"
+#include "libsharp/sharp_internal.h"
+#include "c_utils/c_utils.h"
+#include "c_utils/walltime_c.h"
+#include "libsharp/sharp_almhelpers.h"
+#include "libsharp/sharp_geomhelpers.h"
 
 typedef complex double dcmplx;
 typedef complex float  fcmplx;
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 0e43ba9..1171372 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp.h
  *  Portable interface for the spherical transform library.
diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c
index 12ce600..c744b47 100644
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@@ -16,21 +16,17 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_almhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2008-2016 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
+#include "libsharp/sharp_almhelpers.h"
+#include "c_utils/c_utils.h"
 
 void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
   sharp_alm_info **alm_info)
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 06bee8f..1d40d01 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_almhelpers.h
  *  SHARP helper function for the creation of a_lm data structures
@@ -32,7 +28,7 @@
 #ifndef PLANCK_SHARP_ALMHELPERS_H
 #define PLANCK_SHARP_ALMHELPERS_H
 
-#include "sharp.h"
+#include "libsharp/sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f54a058..2c78f2b 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -1,6 +1,33 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
+
+/*! \file sharp_core.c
+ *  Spherical transform library
+ *
+ *  Copyright (C) 2019 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
 #define ARCH default
 #define GENERIC_ARCH
-#include "sharp_core_inc.c"
+#include "libsharp/sharp_core_inc.c"
 #undef GENERIC_ARCH
 #undef ARCH
 
@@ -42,7 +69,9 @@ int XCONCATX2(sharp_veclen,arch) (void); \
 int XCONCATX2(sharp_max_nvec,arch) (int spin); \
 const char *XCONCATX2(sharp_architecture,arch) (void);
 
+#if (!defined(__APPLE__))
 DECL(avx512f)
+#endif
 DECL(fma4)
 DECL(fma)
 DECL(avx2)
@@ -62,7 +91,9 @@ static void assign_funcs(void)
     architecture_ = XCONCATX2(sharp_architecture,arch); \
     return; \
     }
+#if (!defined(__APPLE__))
 DECL2(avx512f)
+#endif
 DECL2(fma4)
 DECL2(fma)
 DECL2(avx2)
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index b875877..331f8f7 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_core_inc.c
  *  Computational core
@@ -29,6 +25,9 @@
  *  \author Martin Reinecke
  */
 
+// FIXME: special ugly workaround for problems on OSX
+#if (!defined(__APPLE__)) || (!defined(__AVX512F__))
+
 #if (defined(MULTIARCH) || defined(GENERIC_ARCH))
 
 #define XCONCATX(a,b) a##_##b
@@ -38,10 +37,10 @@
 #include <complex.h>
 #include <math.h>
 #include <string.h>
-#include "sharp_vecsupport.h"
-#include "sharp.h"
-#include "sharp_internal.h"
-#include "c_utils.h"
+#include "libsharp/sharp_vecsupport.h"
+#include "libsharp/sharp.h"
+#include "libsharp/sharp_internal.h"
+#include "c_utils/c_utils.h"
 
 typedef complex double dcmplx;
 
@@ -1205,3 +1204,5 @@ const char *XARCH(sharp_architecture)(void)
   }
 
 #endif
+
+#endif
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index b2415e0..e37d491 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_cxx.h
  *  Spherical transform library
@@ -33,9 +29,9 @@
 #define PLANCK_SHARP_CXX_H
 
 #include <complex>
-#include "sharp.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
+#include "libsharp/sharp.h"
+#include "libsharp/sharp_geomhelpers.h"
+#include "libsharp/sharp_almhelpers.h"
 
 class sharp_base
   {
@@ -89,9 +85,6 @@ class sharp_base
       if (ainfo) sharp_destroy_alm_info(ainfo);
       sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo);
       }
-
-    const sharp_geom_info* get_geom_info() const { return ginfo; }
-    const sharp_alm_info* get_alm_info() const { return ainfo; }
   };
 
 template<typename T> struct cxxjobhelper__ {};
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index 0f6af39..8a7030b 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -16,23 +16,19 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_geomhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2018 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
 #include <math.h>
-#include "sharp_geomhelpers.h"
-#include "sharp_legendre_roots.h"
-#include "c_utils.h"
+#include "libsharp/sharp_geomhelpers.h"
+#include "libsharp/sharp_legendre_roots.h"
+#include "c_utils/c_utils.h"
 #include "pocketfft/pocketfft.h"
 
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
index a59af9b..27b1246 100644
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_geomhelpers.h
  *  SHARP helper function for the creation of grid geometries
@@ -32,7 +28,7 @@
 #ifndef PLANCK_SHARP_GEOMHELPERS_H
 #define PLANCK_SHARP_GEOMHELPERS_H
 
-#include "sharp.h"
+#include "libsharp/sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 979ae58..78ff666 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_internal.h
  *  Internally used functionality for the spherical transform library.
@@ -37,8 +33,8 @@
 #endif
 
 #include <complex.h>
-#include "sharp.h"
-#include "sharp_ylmgen_c.h"
+#include "libsharp/sharp.h"
+#include "libsharp/sharp_ylmgen_c.h"
 
 typedef struct
   {
diff --git a/libsharp/sharp_legendre_roots.c b/libsharp/sharp_legendre_roots.c
index 44d8507..ec81ecb 100644
--- a/libsharp/sharp_legendre_roots.c
+++ b/libsharp/sharp_legendre_roots.c
@@ -7,8 +7,8 @@
     - tweaked Newton iteration to obtain higher accuracy */
 
 #include <math.h>
-#include "sharp_legendre_roots.h"
-#include "c_utils.h"
+#include "libsharp/sharp_legendre_roots.h"
+#include "c_utils/c_utils.h"
 
 static inline double one_minus_x2 (double x)
   { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
diff --git a/libsharp/sharp_legendre_roots.h b/libsharp/sharp_legendre_roots.h
index 19f8ec8..4220fce 100644
--- a/libsharp/sharp_legendre_roots.h
+++ b/libsharp/sharp_legendre_roots.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_legendre_roots.h
  *
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
index a90ec69..ed3063a 100644
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_mpi.c
  *  Functionality only needed for MPI-parallel transforms
@@ -31,7 +27,7 @@
 
 #ifdef USE_MPI
 
-#include "sharp_mpi.h"
+#include "libsharp/sharp_mpi.h"
 
 typedef struct
   {
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index 5de37bf..bb26ff0 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_mpi.h
  *  Interface for the spherical transform library with MPI support.
@@ -33,7 +29,7 @@
 #define PLANCK_SHARP_MPI_H
 
 #include <mpi.h>
-#include "sharp.h"
+#include "libsharp/sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index f22a91e..4392a61 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*  \file sharp_testsuite.c
  *
@@ -32,17 +28,17 @@
 #include <string.h>
 #ifdef USE_MPI
 #include "mpi.h"
-#include "sharp_mpi.h"
+#include "libsharp/sharp_mpi.h"
 #endif
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#include "sharp.h"
-#include "sharp_internal.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "memusage.h"
+#include "libsharp/sharp.h"
+#include "libsharp/sharp_internal.h"
+#include "libsharp/sharp_geomhelpers.h"
+#include "libsharp/sharp_almhelpers.h"
+#include "c_utils/c_utils.h"
+#include "c_utils/memusage.h"
 
 static void OpenMP_status(void)
   {
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index e4bfc4f..7894ef6 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*  \file sharp_vecsupport.h
  *  Convenience functions for vector arithmetics
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index f408eea..35f3397 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*
  *  Helper code for efficient calculation of Y_lm(theta,phi=0)
@@ -31,8 +27,8 @@
 
 #include <math.h>
 #include <stdlib.h>
-#include "sharp_ylmgen_c.h"
-#include "c_utils.h"
+#include "libsharp/sharp_ylmgen_c.h"
+#include "c_utils/c_utils.h"
 
 static inline void normalize (double *val, int *scale, double xfmax)
   {
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 130d797..1fa1cfa 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -16,11 +16,7 @@
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*! \file sharp_ylmgen_c.h
  *  Code for efficient calculation of Y_lm(phi=0,theta)
diff --git a/pocketfft/LICENSE.md b/pocketfft/LICENSE.md
new file mode 100644
index 0000000..c6d4a9f
--- /dev/null
+++ b/pocketfft/LICENSE.md
@@ -0,0 +1,25 @@
+Copyright (C) 2010-2019 Max-Planck-Society
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the Astropy Team nor the names of its contributors may be
+  used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c
index de1af3e..f7771c9 100644
--- a/pocketfft/pocketfft.c
+++ b/pocketfft/pocketfft.c
@@ -6,7 +6,7 @@
 /*
  *  Main implementation file.
  *
- *  Copyright (C) 2004-2018 Max-Planck-Society
+ *  Copyright (C) 2004-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h
index 9eb3985..af86eba 100644
--- a/pocketfft/pocketfft.h
+++ b/pocketfft/pocketfft.h
@@ -6,7 +6,7 @@
 /*! \file pocketfft.h
  *  Public interface of the pocketfft library
  *
- *  Copyright (C) 2008-2018 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 

From 86bb6fa93a5c4af17d4c4aadfe01bd68b3d1aa33 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 15 Feb 2019 16:31:05 +0100
Subject: [PATCH 02/17] make walltimer_c part of the library

---
 Makefile.am | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index 9345ea5..7900c0d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,6 +5,8 @@ lib_LTLIBRARIES = libsharp.la
 libsharp_la_SOURCES = \
   c_utils/c_utils.c \
   c_utils/c_utils.h \
+  c_utils/walltime_c.c \
+  c_utils/walltime_c.h \
   pocketfft/pocketfft.c \
   pocketfft/pocketfft.h \
   libsharp/sharp.c \
@@ -48,7 +50,7 @@ EXTRA_DIST = \
   runtest.sh
 
 check_PROGRAMS = sharp_testsuite
-sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
+sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h
 sharp_testsuite_LDADD = libsharp.la
 
 TESTS = runtest.sh

From 04aa8245d04ccd1a1418edfb9680d47bb4e42f45 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 20 Feb 2019 10:06:06 +0100
Subject: [PATCH 03/17] cosmetics and small tweaks to pkg-config

---
 c_utils/memusage.c | 2 +-
 configure.ac       | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/c_utils/memusage.c b/c_utils/memusage.c
index fd0c331..7fe5fa3 100644
--- a/c_utils/memusage.c
+++ b/c_utils/memusage.c
@@ -53,7 +53,7 @@ double VmHWM(void)
     if (!strncmp(word, "VmHWM:", 6))
       {
       if (fscanf(f,"%lf%2s",&res,word)<0)
-	{ fclose(f); return -1.0; }
+        { fclose(f); return -1.0; }
       if (strncmp(word, "kB", 2))
         { fclose(f); return -1.0; }
       res *=1024;
diff --git a/configure.ac b/configure.ac
index 7f6b5d8..3871ffc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -30,6 +30,8 @@ AC_PROG_LIBTOOL
 tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
+PACKAGE_LIBS="-lsharp"
+
 dnl
 dnl Create pkgconfig .pc file.
 dnl

From 5a010d39704cb49ad789cc7b39ae845d29ff6521 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 22 Feb 2019 14:53:04 +0100
Subject: [PATCH 04/17] remove SHARP_NO_OPENMP flag

---
 libsharp/sharp.c     | 6 +++---
 libsharp/sharp.h     | 1 -
 libsharp/sharp_mpi.c | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 59e4638..f811ab4 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -761,7 +761,7 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
     }
   else
     {
-#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
+#pragma omp parallel
 {
     ringhelper helper;
     ringhelper_init(&helper);
@@ -806,7 +806,7 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
     }
   else
     {
-#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
+#pragma omp parallel
 {
     ringhelper helper;
     ringhelper_init(&helper);
@@ -872,7 +872,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
 /* map->phase where necessary */
     map2phase (job, mmax, llim, ulim);
 
-#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
+#pragma omp parallel
 {
     sharp_job ljob = *job;
     ljob.opcnt=0;
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 1171372..657e369 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -195,7 +195,6 @@ typedef enum { SHARP_DP              = 1<<4,
                SHARP_NO_FFT          = 1<<7,
 
                SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
-               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
              } sharp_jobflags;
 
 /*! Performs a libsharp SHT job. The interface deliberately does not use
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
index ed3063a..02c76c3 100644
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@@ -266,7 +266,7 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
 
     map2alm_comm (job, &minfo);
 
-#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
+#pragma omp parallel
 {
     sharp_job ljob = *job;
     sharp_Ylmgen_C generator;

From 540e7e44f8dfc61d64debf5ae42aebb011444b5d Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 27 Feb 2019 10:44:38 +0100
Subject: [PATCH 05/17] doc improvements and a pragma, which probably does
 nothing

---
 COMPILE                   | 6 +++---
 libsharp/sharp_core_inc.c | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/COMPILE b/COMPILE
index 5b1c5b2..b63ae7f 100644
--- a/COMPILE
+++ b/COMPILE
@@ -15,7 +15,7 @@ flags.
 Fast math
 ---------
 
-Specifying "-ffast-math" is important for all compilers, since it allows the
+Specifying "-ffast-math" or "-ffp-contract=fast" is important for all compilers, since it allows the
 compiler to fuse multiplications and additions into FMA instructions, which is
 forbidden by the C99 standard. Since FMAs are a central aspect of the algorithm,
 they are needed for optimum performance.
@@ -25,8 +25,8 @@ to the C99 standard, you should still be able to compile libsharp with
 "-ffast-math" without any problems.
 
 
-Runtime CPU selection with gcc
-------------------------------
+Runtime CPU selection with gcc and clang
+----------------------------------------
 
 When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
 with versions 6 and 7) on an x86_64 platform, the build machinery can compile
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 331f8f7..682558c 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -42,6 +42,11 @@
 #include "libsharp/sharp_internal.h"
 #include "c_utils/c_utils.h"
 
+// In the following, we explicitly allow the compiler to contract floating
+// point operations, like multiply-and-add.
+// Unfortunately, most compilers don't act on this pragma yet.
+#pragma STDC FP_CONTRACT ON
+
 typedef complex double dcmplx;
 
 #define nv0 (128/VLEN)

From bf43082182d7c20bf41f9a4165932ddf48aff003 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 27 Feb 2019 11:12:44 +0100
Subject: [PATCH 06/17] separate out pocketfft

---
 Makefile.am                  |    6 +-
 configure.ac                 |    7 +-
 libsharp/sharp.c             |   14 +-
 libsharp/sharp_geomhelpers.c |   18 +-
 pocketfft/LICENSE.md         |   25 -
 pocketfft/pocketfft.c        | 2190 ----------------------------------
 pocketfft/pocketfft.h        |   34 -
 7 files changed, 22 insertions(+), 2272 deletions(-)
 delete mode 100644 pocketfft/LICENSE.md
 delete mode 100644 pocketfft/pocketfft.c
 delete mode 100644 pocketfft/pocketfft.h

diff --git a/Makefile.am b/Makefile.am
index 7900c0d..f946f3a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,8 +7,6 @@ libsharp_la_SOURCES = \
   c_utils/c_utils.h \
   c_utils/walltime_c.c \
   c_utils/walltime_c.h \
-  pocketfft/pocketfft.c \
-  pocketfft/pocketfft.h \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_core.c \
@@ -51,11 +49,11 @@ EXTRA_DIST = \
 
 check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h
-sharp_testsuite_LDADD = libsharp.la
+sharp_testsuite_LDADD = libsharp.la @POCKETFFT_LIBS@
 
 TESTS = runtest.sh
 
-AM_CFLAGS = @AM_CFLAGS@
+AM_CFLAGS = @AM_CFLAGS@ @POCKETFFT_CFLAGS@
 
 pkgconfigdir = $(libdir)/pkgconfig
 nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
diff --git a/configure.ac b/configure.ac
index 3871ffc..95cb187 100644
--- a/configure.ac
+++ b/configure.ac
@@ -22,11 +22,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 AC_PROG_CC_C99
 
-# adding the lib to the files to link
-LIBS="-lm"
-
 AC_PROG_LIBTOOL
 
+PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"])
+
 tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
@@ -39,6 +38,8 @@ AX_CREATE_PKGCONFIG_INFO(,,,,[])
 AC_SUBST([LIBS])
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_LDFLAGS])
+AC_SUBST([POCKETFFT_CFLAGS])
+AC_SUBST([POCKETFFT_LIBS])
 
 AC_CONFIG_FILES([Makefile])
 AC_OUTPUT
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index f811ab4..58d2565 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -77,7 +77,7 @@ typedef struct
   double phi0_;
   dcmplx *shiftarr;
   int s_shift;
-  rfft_plan plan;
+  pocketfft_plan_r plan;
   int length;
   int norot;
   } ringhelper;
@@ -90,7 +90,7 @@ static void ringhelper_init (ringhelper *self)
 
 static void ringhelper_destroy (ringhelper *self)
   {
-  if (self->plan) destroy_rfft_plan(self->plan);
+  if (self->plan) pocketfft_delete_plan_r(self->plan);
   DEALLOC(self->shiftarr);
   ringhelper_init(self);
   }
@@ -110,11 +110,11 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
 //      double *tmp=(double *) self->shiftarr;
 //      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
       }
-//  if (!self->plan) self->plan=make_rfft_plan(nph);
+//  if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
   if (nph!=(int)self->length)
     {
-    if (self->plan) destroy_rfft_plan(self->plan);
-    self->plan=make_rfft_plan(nph);
+    if (self->plan) pocketfft_delete_plan_r(self->plan);
+    self->plan=pocketfft_make_plan_r(nph);
     self->length=nph;
     }
   }
@@ -331,7 +331,7 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
       }
     }
   data[1]=data[0];
-  rfft_backward (self->plan, &(data[1]), 1.);
+  pocketfft_backward_r (self->plan, &(data[1]), 1.);
   }
 
 NOINLINE static void ringhelper_ring2phase (ringhelper *self,
@@ -350,7 +350,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
   if (flags&SHARP_REAL_HARMONICS)
     wgt *= sqrt_two;
 
-  rfft_forward (self->plan, &(data[1]), 1.);
+  pocketfft_forward_r (self->plan, &(data[1]), 1.);
   data[0]=data[1];
   data[1]=data[nph+1]=0.;
 
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index 8a7030b..f8f5152 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -155,9 +155,9 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
     weight[2*k  ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
     }
   if ((nrings&1)==0) weight[nrings-1]=0.;
-  rfft_plan plan = make_rfft_plan(nrings);
-  rfft_backward(plan,weight,1.);
-  destroy_rfft_plan(plan);
+  pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
+  pocketfft_backward_r(plan,weight,1.);
+  pocketfft_delete_plan_r(plan);
 
   for (int m=0; m<(nrings+1)/2; ++m)
     {
@@ -202,9 +202,9 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
   for (int k=1; k<=(n/2-1); ++k)
     weight[2*k-1]=2./(1.-4.*k*k) + dw;
   weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
-  rfft_plan plan = make_rfft_plan(n);
-  rfft_backward(plan,weight,1.);
-  destroy_rfft_plan(plan);
+  pocketfft_plan_r plan = pocketfft_make_plan_r(n);
+  pocketfft_backward_r(plan,weight,1.);
+  pocketfft_delete_plan_r(plan);
   weight[n]=weight[0];
 
   for (int m=0; m<(nrings+1)/2; ++m)
@@ -250,9 +250,9 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
   for (int k=1; k<=(n/2-1); ++k)
     weight[2*k-1]=2./(1.-4.*k*k);
   weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
-  rfft_plan plan = make_rfft_plan(n);
-  rfft_backward(plan,weight,1.);
-  destroy_rfft_plan(plan);
+  pocketfft_plan_r plan = pocketfft_make_plan_r(n);
+  pocketfft_backward_r(plan,weight,1.);
+  pocketfft_delete_plan_r(plan);
   for (int m=0; m<nrings; ++m)
     weight[m]=weight[m+1];
 
diff --git a/pocketfft/LICENSE.md b/pocketfft/LICENSE.md
deleted file mode 100644
index c6d4a9f..0000000
--- a/pocketfft/LICENSE.md
+++ /dev/null
@@ -1,25 +0,0 @@
-Copyright (C) 2010-2019 Max-Planck-Society
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice, this
-  list of conditions and the following disclaimer in the documentation and/or
-  other materials provided with the distribution.
-* Neither the name of the Astropy Team nor the names of its contributors may be
-  used to endorse or promote products derived from this software without
-  specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c
deleted file mode 100644
index f7771c9..0000000
--- a/pocketfft/pocketfft.c
+++ /dev/null
@@ -1,2190 +0,0 @@
-/*
- * This file is part of pocketfft.
- * Licensed under a 3-clause BSD style license - see LICENSE.md
- */
-
-/*
- *  Main implementation file.
- *
- *  Copyright (C) 2004-2019 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "pocketfft.h"
-
-#define RALLOC(type,num) \
-  ((type *)malloc((num)*sizeof(type)))
-#define DEALLOC(ptr) \
-  do { free(ptr); (ptr)=NULL; } while(0)
-
-#define SWAP(a,b,type) \
-  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
-
-#ifdef __GNUC__
-#define NOINLINE __attribute__((noinline))
-#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
-#else
-#define NOINLINE
-#define WARN_UNUSED_RESULT
-#endif
-
-// adapted from https://stackoverflow.com/questions/42792939/
-// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
-static void my_sincosm1pi (double a, double *restrict res)
-  {
-  double s = a * a;
-  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
-  double r =     -1.0369917389758117e-4;
-  r = fma (r, s,  1.9294935641298806e-3);
-  r = fma (r, s, -2.5806887942825395e-2);
-  r = fma (r, s,  2.3533063028328211e-1);
-  r = fma (r, s, -1.3352627688538006e+0);
-  r = fma (r, s,  4.0587121264167623e+0);
-  r = fma (r, s, -4.9348022005446790e+0);
-  double c = r*s;
-  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
-  r =             4.6151442520157035e-4;
-  r = fma (r, s, -7.3700183130883555e-3);
-  r = fma (r, s,  8.2145868949323936e-2);
-  r = fma (r, s, -5.9926452893214921e-1);
-  r = fma (r, s,  2.5501640398732688e+0);
-  r = fma (r, s, -5.1677127800499516e+0);
-  s = s * a;
-  r = r * s;
-  s = fma (a, 3.1415926535897931e+0, r);
-  res[0] = c;
-  res[1] = s;
-  }
-
-NOINLINE static void calc_first_octant(size_t den, double * restrict res)
-  {
-  size_t n = (den+4)>>3;
-  if (n==0) return;
-  res[0]=1.; res[1]=0.;
-  if (n==1) return;
-  size_t l1=(size_t)sqrt(n);
-  for (size_t i=1; i<l1; ++i)
-    my_sincosm1pi((2.*i)/den,&res[2*i]);
-  size_t start=l1;
-  while(start<n)
-    {
-    double cs[2];
-    my_sincosm1pi((2.*start)/den,cs);
-    res[2*start] = cs[0]+1.;
-    res[2*start+1] = cs[1];
-    size_t end = l1;
-    if (start+end>n) end = n-start;
-    for (size_t i=1; i<end; ++i)
-      {
-      double csx[2]={res[2*i], res[2*i+1]};
-      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
-      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
-      }
-    start += l1;
-    }
-  for (size_t i=1; i<l1; ++i)
-    res[2*i] += 1.;
-  }
-
-NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
-  {
-  double * restrict p = res+n;
-  calc_first_octant(n<<1, p);
-  size_t ndone=(n+2)>>2;
-  size_t i=0, idx1=0, idx2=2*ndone-2;
-  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
-    {
-    res[idx1]   = p[2*i];
-    res[idx1+1] = p[2*i+1];
-    res[idx2]   = p[2*i+3];
-    res[idx2+1] = p[2*i+2];
-    }
-  if (i!=ndone)
-    {
-    res[idx1  ] = p[2*i];
-    res[idx1+1] = p[2*i+1];
-    }
-  }
-
-NOINLINE static void calc_first_half(size_t n, double * restrict res)
-  {
-  int ndone=(n+1)>>1;
-  double * p = res+n-1;
-  calc_first_octant(n<<2, p);
-  int i4=0, in=n, i=0;
-  for (; i4<=in-i4; ++i, i4+=4) // octant 0
-    {
-    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
-    }
-  for (; i4-in <= 0; ++i, i4+=4) // octant 1
-    {
-    int xm = in-i4;
-    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
-    }
-  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
-    {
-    int xm = i4-in;
-    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
-    }
-  for (; i<ndone; ++i, i4+=4) // octant 3
-    {
-    int xm = 2*in-i4;
-    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
-    }
-  }
-
-NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
-  {
-  const double hsqt2 = 0.707106781186547524400844362104849;
-  size_t quart = n>>2;
-  if ((n&7)==0)
-    res[quart] = res[quart+1] = hsqt2;
-  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
-    {
-    res[j  ] = res[i+1];
-    res[j+1] = res[i  ];
-    }
-  }
-
-NOINLINE static void fill_first_half(size_t n, double * restrict res)
-  {
-  size_t half = n>>1;
-  if ((n&3)==0)
-    for (size_t i=0; i<half; i+=2)
-      {
-      res[i+half]   = -res[i+1];
-      res[i+half+1] =  res[i  ];
-      }
-  else
-    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
-      {
-      res[j  ] = -res[i  ];
-      res[j+1] =  res[i+1];
-      }
-  }
-
-NOINLINE static void fill_second_half(size_t n, double * restrict res)
-  {
-  if ((n&1)==0)
-    for (size_t i=0; i<n; ++i)
-      res[i+n] = -res[i];
-  else
-    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
-      {
-      res[j  ] =  res[i  ];
-      res[j+1] = -res[i+1];
-      }
-  }
-
-NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
-  {
-  if ((n&3)==0)
-    {
-    calc_first_octant(n, res);
-    fill_first_quadrant(n, res);
-    fill_first_half(n, res);
-    }
-  else if ((n&1)==0)
-    {
-    calc_first_quadrant(n, res);
-    fill_first_half(n, res);
-    }
-  else
-    calc_first_half(n, res);
-  }
-
-NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
-  {
-  sincos_2pibyn_half(n, res);
-  fill_second_half(n, res);
-  }
-
-NOINLINE static size_t largest_prime_factor (size_t n)
-  {
-  size_t res=1;
-  size_t tmp;
-  while (((tmp=(n>>1))<<1)==n)
-    { res=2; n=tmp; }
-
-  size_t limit=(size_t)sqrt(n+0.01);
-  for (size_t x=3; x<=limit; x+=2)
-  while (((tmp=(n/x))*x)==n)
-    {
-    res=x;
-    n=tmp;
-    limit=(size_t)sqrt(n+0.01);
-    }
-  if (n>1) res=n;
-
-  return res;
-  }
-
-NOINLINE static double cost_guess (size_t n)
-  {
-  const double lfp=1.1; // penalty for non-hardcoded larger factors
-  size_t ni=n;
-  double result=0.;
-  size_t tmp;
-  while (((tmp=(n>>1))<<1)==n)
-    { result+=2; n=tmp; }
-
-  size_t limit=(size_t)sqrt(n+0.01);
-  for (size_t x=3; x<=limit; x+=2)
-  while ((tmp=(n/x))*x==n)
-    {
-    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
-    n=tmp;
-    limit=(size_t)sqrt(n+0.01);
-    }
-  if (n>1) result+=(n<=5) ? n : lfp*n;
-
-  return result*ni;
-  }
-
-/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
-NOINLINE static size_t good_size(size_t n)
-  {
-  if (n<=6) return n;
-
-  size_t bestfac=2*n;
-  for (size_t f2=1; f2<bestfac; f2*=2)
-    for (size_t f23=f2; f23<bestfac; f23*=3)
-      for (size_t f235=f23; f235<bestfac; f235*=5)
-        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
-          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
-            if (f235711>=n) bestfac=f235711;
-  return bestfac;
-  }
-
-typedef struct cmplx {
-  double r,i;
-} cmplx;
-
-#define NFCT 25
-typedef struct cfftp_fctdata
-  {
-  size_t fct;
-  cmplx *tw, *tws;
-  } cfftp_fctdata;
-
-typedef struct cfftp_plan_i
-  {
-  size_t length, nfct;
-  cmplx *mem;
-  cfftp_fctdata fct[NFCT];
-  } cfftp_plan_i;
-typedef struct cfftp_plan_i * cfftp_plan;
-
-#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
-#define SCALEC(a,b) { a.r*=b; a.i*=b; }
-#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
-#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
-/* a = b*c */
-#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
-/* a = conj(b)*c*/
-#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
-
-#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
-/* a = b*c */
-#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
-/* a *= b */
-#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
-
-NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=2;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-      for (size_t i=1; i<ido; ++i)
-        {
-        cmplx t;
-        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
-        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
-        }
-      }
-  }
-
-NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=2;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-      for (size_t i=1; i<ido; ++i)
-        {
-        cmplx t;
-        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
-        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
-        }
-      }
-  }
-
-#define PREP3(idx) \
-        cmplx t0 = CC(idx,0,k), t1, t2; \
-        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
-        CH(idx,k,0).r=t0.r+t1.r; \
-        CH(idx,k,0).i=t0.i+t1.i;
-#define PARTSTEP3a(u1,u2,twr,twi) \
-        { \
-        cmplx ca,cb; \
-        ca.r=t0.r+twr*t1.r; \
-        ca.i=t0.i+twr*t1.i; \
-        cb.i=twi*t2.r; \
-        cb.r=-(twi*t2.i); \
-        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
-        }
-
-#define PARTSTEP3b(u1,u2,twr,twi) \
-        { \
-        cmplx ca,cb,da,db; \
-        ca.r=t0.r+twr*t1.r; \
-        ca.i=t0.i+twr*t1.i; \
-        cb.i=twi*t2.r; \
-        cb.r=-(twi*t2.i); \
-        PMC(da,db,ca,cb) \
-        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
-        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=3;
-  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP3(0)
-      PARTSTEP3a(1,2,tw1r,tw1i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP3(0)
-      PARTSTEP3a(1,2,tw1r,tw1i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP3(i)
-        PARTSTEP3b(1,2,tw1r,tw1i)
-        }
-      }
-  }
-#define PARTSTEP3f(u1,u2,twr,twi) \
-        { \
-        cmplx ca,cb,da,db; \
-        ca.r=t0.r+twr*t1.r; \
-        ca.i=t0.i+twr*t1.i; \
-        cb.i=twi*t2.r; \
-        cb.r=-(twi*t2.i); \
-        PMC(da,db,ca,cb) \
-        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
-        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=3;
-  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP3(0)
-      PARTSTEP3a(1,2,tw1r,tw1i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP3(0)
-      PARTSTEP3a(1,2,tw1r,tw1i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP3(i)
-        PARTSTEP3f(1,2,tw1r,tw1i)
-        }
-      }
-  }
-
-NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=4;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      cmplx t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      ROT90(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      cmplx t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      ROT90(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        cmplx c2, c3, c4, t1, t2, t3, t4;
-        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
-        PMC(t2,t1,cc0,cc2)
-        PMC(t3,t4,cc1,cc3)
-        ROT90(t4)
-        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
-        PMC(CH(i,k,0),c3,t2,t3)
-        PMC(c2,c4,t1,t4)
-        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
-        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
-        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
-        }
-      }
-  }
-NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=4;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      cmplx t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      ROTM90(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      cmplx t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      ROTM90(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        cmplx c2, c3, c4, t1, t2, t3, t4;
-        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
-        PMC(t2,t1,cc0,cc2)
-        PMC(t3,t4,cc1,cc3)
-        ROTM90(t4)
-        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
-        PMC(CH(i,k,0),c3,t2,t3)
-        PMC(c2,c4,t1,t4)
-        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
-        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
-        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
-        }
-      }
-  }
-
-#define PREP5(idx) \
-        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
-        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
-        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
-        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
-        CH(idx,k,0).i=t0.i+t1.i+t2.i;
-
-#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
-        { \
-        cmplx ca,cb; \
-        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
-        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
-        cb.i=twai*t4.r twbi*t3.r; \
-        cb.r=-(twai*t4.i twbi*t3.i); \
-        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
-        }
-
-#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
-        { \
-        cmplx ca,cb,da,db; \
-        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
-        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
-        cb.i=twai*t4.r twbi*t3.r; \
-        cb.r=-(twai*t4.i twbi*t3.i); \
-        PMC(da,db,ca,cb) \
-        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
-        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=5;
-  const double tw1r= 0.3090169943749474241,
-               tw1i= 0.95105651629515357212,
-               tw2r= -0.8090169943749474241,
-               tw2i= 0.58778525229247312917;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP5(0)
-      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
-      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP5(0)
-      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
-      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP5(i)
-        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
-        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
-        }
-      }
-  }
-#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
-        { \
-        cmplx ca,cb,da,db; \
-        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
-        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
-        cb.i=twai*t4.r twbi*t3.r; \
-        cb.r=-(twai*t4.i twbi*t3.i); \
-        PMC(da,db,ca,cb) \
-        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
-        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa)
-  {
-  const size_t cdim=5;
-  const double tw1r= 0.3090169943749474241,
-               tw1i= -0.95105651629515357212,
-               tw2r= -0.8090169943749474241,
-               tw2i= -0.58778525229247312917;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP5(0)
-      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
-      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP5(0)
-      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
-      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP5(i)
-        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
-        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
-        }
-      }
-  }
-
-#define PREP7(idx) \
-        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
-        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
-        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
-        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
-        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
-        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
-
-#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
-        { \
-        cmplx ca,cb; \
-        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
-        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
-        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
-        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
-        PMC(out1,out2,ca,cb) \
-        }
-#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
-        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
-#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
-        { \
-        cmplx da,db; \
-        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
-        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
-        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-
-NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
-  {
-  const size_t cdim=7;
-  const double tw1r= 0.623489801858733530525,
-               tw1i= sign * 0.7818314824680298087084,
-               tw2r= -0.222520933956314404289,
-               tw2i= sign * 0.9749279121818236070181,
-               tw3r= -0.9009688679024191262361,
-               tw3i= sign * 0.4338837391175581204758;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP7(0)
-      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
-      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
-      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP7(0)
-      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
-      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
-      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP7(i)
-        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
-        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
-        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
-        }
-      }
-  }
-
-#define PREP11(idx) \
-        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
-        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
-        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
-        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
-        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
-        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
-        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
-        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
-
-#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
-        { \
-        cmplx ca,cb; \
-        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
-        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
-        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
-        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
-        PMC(out1,out2,ca,cb) \
-        }
-#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
-        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
-#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
-        { \
-        cmplx da,db; \
-        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
-        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
-        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
-        }
-
-NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
-  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
-  {
-  const size_t cdim=11;
-  const double tw1r =        0.8412535328311811688618,
-               tw1i = sign * 0.5406408174555975821076,
-               tw2r =        0.4154150130018864255293,
-               tw2i = sign * 0.9096319953545183714117,
-               tw3r =       -0.1423148382732851404438,
-               tw3i = sign * 0.9898214418809327323761,
-               tw4r =       -0.6548607339452850640569,
-               tw4i = sign * 0.755749574354258283774,
-               tw5r =       -0.9594929736144973898904,
-               tw5i = sign * 0.2817325568414296977114;
-
-  if (ido==1)
-    for (size_t k=0; k<l1; ++k)
-      {
-      PREP11(0)
-      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
-      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
-      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
-      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
-      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
-      }
-  else
-    for (size_t k=0; k<l1; ++k)
-      {
-      {
-      PREP11(0)
-      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
-      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
-      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
-      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
-      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
-      }
-      for (size_t i=1; i<ido; ++i)
-        {
-        PREP11(i)
-        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
-        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
-        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
-        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
-        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
-        }
-      }
-  }
-
-#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define CX2(a,b) cc[(a)+idl1*(b)]
-#define CH2(a,b) ch[(a)+idl1*(b)]
-
-NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
-  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
-  const cmplx * restrict csarr, const int sign)
-  {
-  const size_t cdim=ip;
-  size_t ipph = (ip+1)/2;
-  size_t idl1 = ido*l1;
-
-  cmplx * restrict wal=RALLOC(cmplx,ip);
-  if (!wal) return -1;
-  wal[0]=(cmplx){1.,0.};
-  for (size_t i=1; i<ip; ++i)
-    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
-
-  for (size_t k=0; k<l1; ++k)
-    for (size_t i=0; i<ido; ++i)
-      CH(i,k,0) = CC(i,0,k);
-  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
-    for (size_t k=0; k<l1; ++k)
-      for (size_t i=0; i<ido; ++i)
-        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
-  for (size_t k=0; k<l1; ++k)
-    for (size_t i=0; i<ido; ++i)
-      {
-      cmplx tmp = CH(i,k,0);
-      for (size_t j=1; j<ipph; ++j)
-        ADDC(tmp,tmp,CH(i,k,j))
-      CX(i,k,0) = tmp;
-      }
-  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
-    {
-    // j=0
-    for (size_t ik=0; ik<idl1; ++ik)
-      {
-      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
-      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
-      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
-      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
-      }
-
-    size_t iwal=2*l;
-    size_t j=3, jc=ip-3;
-    for (; j<ipph-1; j+=2, jc-=2)
-      {
-      iwal+=l; if (iwal>ip) iwal-=ip;
-      cmplx xwal=wal[iwal];
-      iwal+=l; if (iwal>ip) iwal-=ip;
-      cmplx xwal2=wal[iwal];
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
-        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
-        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
-        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
-        }
-      }
-    for (; j<ipph; ++j, --jc)
-      {
-      iwal+=l; if (iwal>ip) iwal-=ip;
-      cmplx xwal=wal[iwal];
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
-        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
-        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
-        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
-        }
-      }
-    }
-  DEALLOC(wal);
-
-  // shuffling and twiddling
-  if (ido==1)
-    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
-        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
-        }
-  else
-    {
-    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
-      for (size_t k=0; k<l1; ++k)
-        {
-        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
-        PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
-        for (size_t i=1; i<ido; ++i)
-          {
-          cmplx x1, x2;
-          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
-          size_t idij=(j-1)*(ido-1)+i-1;
-          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
-          idij=(jc-1)*(ido-1)+i-1;
-          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
-          }
-        }
-    }
-  return 0;
-  }
-
-#undef CH2
-#undef CX2
-#undef CX
-
-NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
-  const int sign)
-  {
-  if (plan->length==1) return 0;
-  size_t len=plan->length;
-  size_t l1=1, nf=plan->nfct;
-  cmplx *ch = RALLOC(cmplx, len);
-  if (!ch) return -1;
-  cmplx *p1=c, *p2=ch;
-
-  for(size_t k1=0; k1<nf; k1++)
-    {
-    size_t ip=plan->fct[k1].fct;
-    size_t l2=ip*l1;
-    size_t ido = len/l2;
-    if     (ip==4)
-      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
-             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
-    else if(ip==2)
-      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
-             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
-    else if(ip==3)
-      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
-             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
-    else if(ip==5)
-      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
-             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
-    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
-    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
-    else
-      {
-      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
-        { DEALLOC(ch); return -1; }
-      SWAP(p1,p2,cmplx *);
-      }
-    SWAP(p1,p2,cmplx *);
-    l1=l2;
-    }
-  if (p1!=c)
-    {
-    if (fct!=1.)
-      for (size_t i=0; i<len; ++i)
-        {
-        c[i].r = ch[i].r*fct;
-        c[i].i = ch[i].i*fct;
-        }
-    else
-      memcpy (c,p1,len*sizeof(cmplx));
-    }
-  else
-    if (fct!=1.)
-      for (size_t i=0; i<len; ++i)
-        {
-        c[i].r *= fct;
-        c[i].i *= fct;
-        }
-  DEALLOC(ch);
-  return 0;
-  }
-
-#undef PMSIGNC
-#undef A_EQ_B_MUL_C
-#undef A_EQ_CB_MUL_C
-#undef MULPMSIGNC
-#undef MULPMSIGNCEQ
-
-#undef WA
-#undef CC
-#undef CH
-#undef ROT90
-#undef SCALEC
-#undef ADDC
-#undef PMC
-
-NOINLINE WARN_UNUSED_RESULT
-static int cfftp_forward(cfftp_plan plan, double c[], double fct)
-  { return pass_all(plan,(cmplx *)c, fct, -1); }
-
-NOINLINE WARN_UNUSED_RESULT
-static int cfftp_backward(cfftp_plan plan, double c[], double fct)
-  { return pass_all(plan,(cmplx *)c, fct, 1); }
-
-NOINLINE WARN_UNUSED_RESULT
-static int cfftp_factorize (cfftp_plan plan)
-  {
-  size_t length=plan->length;
-  size_t nfct=0;
-  while ((length%4)==0)
-    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
-  if ((length%2)==0)
-    {
-    length>>=1;
-    // factor 2 should be at the front of the factor list
-    if (nfct>=NFCT) return -1;
-    plan->fct[nfct++].fct=2;
-    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
-    }
-  size_t maxl=(size_t)(sqrt((double)length))+1;
-  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
-    if ((length%divisor)==0)
-      {
-      while ((length%divisor)==0)
-        {
-        if (nfct>=NFCT) return -1;
-        plan->fct[nfct++].fct=divisor;
-        length/=divisor;
-        }
-      maxl=(size_t)(sqrt((double)length))+1;
-      }
-  if (length>1) plan->fct[nfct++].fct=length;
-  plan->nfct=nfct;
-  return 0;
-  }
-
-NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
-  {
-  size_t twsize=0, l1=1;
-  for (size_t k=0; k<plan->nfct; ++k)
-    {
-    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
-    twsize+=(ip-1)*(ido-1);
-    if (ip>11)
-      twsize+=ip;
-    l1*=ip;
-    }
-  return twsize;
-  }
-
-NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
-  {
-  size_t length=plan->length;
-  double *twid = RALLOC(double, 2*length);
-  if (!twid) return -1;
-  sincos_2pibyn(length, twid);
-  size_t l1=1;
-  size_t memofs=0;
-  for (size_t k=0; k<plan->nfct; ++k)
-    {
-    size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
-    plan->fct[k].tw=plan->mem+memofs;
-    memofs+=(ip-1)*(ido-1);
-    for (size_t j=1; j<ip; ++j)
-      for (size_t i=1; i<ido; ++i)
-        {
-        plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
-        plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
-        }
-    if (ip>11)
-      {
-      plan->fct[k].tws=plan->mem+memofs;
-      memofs+=ip;
-      for (size_t j=0; j<ip; ++j)
-        {
-        plan->fct[k].tws[j].r = twid[2*j*l1*ido];
-        plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
-        }
-      }
-    l1*=ip;
-    }
-  DEALLOC(twid);
-  return 0;
-  }
-
-static cfftp_plan make_cfftp_plan (size_t length)
-  {
-  if (length==0) return NULL;
-  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
-  if (!plan) return NULL;
-  plan->length=length;
-  plan->nfct=0;
-  for (size_t i=0; i<NFCT; ++i)
-    plan->fct[i]=(cfftp_fctdata){0,0,0};
-  plan->mem=0;
-  if (length==1) return plan;
-  if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
-  size_t tws=cfftp_twsize(plan);
-  plan->mem=RALLOC(cmplx,tws);
-  if (!plan->mem) { DEALLOC(plan); return NULL; }
-  if (cfftp_comp_twiddle(plan)!=0)
-    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
-  return plan;
-  }
-
-static void destroy_cfftp_plan (cfftp_plan plan)
-  {
-  DEALLOC(plan->mem);
-  DEALLOC(plan);
-  }
-
-typedef struct rfftp_fctdata
-  {
-  size_t fct;
-  double *tw, *tws;
-  } rfftp_fctdata;
-
-typedef struct rfftp_plan_i
-  {
-  size_t length, nfct;
-  double *mem;
-  rfftp_fctdata fct[NFCT];
-  } rfftp_plan_i;
-typedef struct rfftp_plan_i * rfftp_plan;
-
-#define WA(x,i) wa[(i)+(x)*(ido-1)]
-#define PM(a,b,c,d) { a=c+d; b=c-d; }
-/* (a+ib) = conj(c+id) * (e+if) */
-#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
-
-#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
-
-NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=2;
-
-  for (size_t k=0; k<l1; k++)
-    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
-  if ((ido&1)==0)
-    for (size_t k=0; k<l1; k++)
-      {
-      CH(    0,1,k) = -CC(ido-1,k,1);
-      CH(ido-1,0,k) =  CC(ido-1,k,0);
-      }
-  if (ido<=2) return;
-  for (size_t k=0; k<l1; k++)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double tr2, ti2;
-      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
-      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
-      }
-  }
-
-NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double cr2=CC(0,k,1)+CC(0,k,2);
-    CH(0,0,k) = CC(0,k,0)+cr2;
-    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
-    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
-    }
-  if (ido==1) return;
-  for (size_t k=0; k<l1; k++)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double di2, di3, dr2, dr3;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
-      double cr2=dr2+dr3; // c add
-      double ci2=di2+di3;
-      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
-      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
-      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
-      double ti2 = CC(i  ,k,0)+taur*ci2;
-      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
-      double ti3 = taui*(dr3-dr2);
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
-      }
-  }
-
-NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=4;
-  static const double hsqt2=0.70710678118654752440;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double tr1,tr2;
-    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
-    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
-    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
-    }
-  if ((ido&1)==0)
-    for (size_t k=0; k<l1; k++)
-      {
-      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
-      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
-      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
-      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
-      }
-  if (ido<=2) return;
-  for (size_t k=0; k<l1; k++)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      PM(tr1,tr4,cr4,cr2)
-      PM(ti1,ti4,ci2,ci4)
-      PM(tr2,tr3,CC(i-1,k,0),cr3)
-      PM(ti2,ti3,CC(i  ,k,0),ci3)
-      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
-      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
-      }
-  }
-
-NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double cr2, cr3, ci4, ci5;
-    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
-    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
-    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
-    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
-    CH(0,2,k)=ti11*ci5+ti12*ci4;
-    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
-    CH(0,4,k)=ti12*ci5-ti11*ci4;
-    }
-  if (ido==1) return;
-  for (size_t k=0; k<l1;++k)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
-         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-      size_t ic=ido-i;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
-      PM(cr2,ci5,dr5,dr2)
-      PM(ci2,cr5,di2,di5)
-      PM(cr3,ci4,dr4,dr3)
-      PM(ci3,cr4,di3,di4)
-      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
-      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
-      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
-      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
-      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
-      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
-      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
-      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
-      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
-      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
-      }
-  }
-
-#undef CC
-#undef CH
-#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define C2(a,b) cc[(a)+idl1*(b)]
-#define CH2(a,b) ch[(a)+idl1*(b)]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
-  double * restrict cc, double * restrict ch, const double * restrict wa,
-  const double * restrict csarr)
-  {
-  const size_t cdim=ip;
-  size_t ipph=(ip+1)/2;
-  size_t idl1 = ido*l1;
-
-  if (ido>1)
-    {
-    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
-      {
-      size_t is=(j-1)*(ido-1),
-             is2=(jc-1)*(ido-1);
-      for (size_t k=0; k<l1; ++k)                            // 113
-        {
-        size_t idij=is;
-        size_t idij2=is2;
-        for (size_t i=1; i<=ido-2; i+=2)                      // 112
-          {
-          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
-                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
-          double x1=wa[idij]*t1 + wa[idij+1]*t2,
-                 x2=wa[idij]*t2 - wa[idij+1]*t1,
-                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
-                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
-          C1(i  ,k,j ) = x1+x3;
-          C1(i  ,k,jc) = x2-x4;
-          C1(i+1,k,j ) = x2+x4;
-          C1(i+1,k,jc) = x3-x1;
-          idij+=2;
-          idij2+=2;
-          }
-        }
-      }
-    }
-
-  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
-    for (size_t k=0; k<l1; ++k)                              // 122
-      {
-      double t1=C1(0,k,j), t2=C1(0,k,jc);
-      C1(0,k,j ) = t1+t2;
-      C1(0,k,jc) = t2-t1;
-      }
-
-//everything in C
-//memset(ch,0,ip*l1*ido*sizeof(double));
-
-  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
-    {
-    for (size_t ik=0; ik<idl1; ++ik)                         // 124
-      {
-      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
-      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
-      }
-    size_t iang = 2*l;
-    size_t j=3, jc=ip-3;
-    for (; j<ipph-3; j+=4,jc-=4)              // 126
-      {
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)                       // 125
-        {
-        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
-                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
-        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
-                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
-        }
-      }
-    for (; j<ipph-1; j+=2,jc-=2)              // 126
-      {
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)                       // 125
-        {
-        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
-        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
-        }
-      }
-    for (; j<ipph; ++j,--jc)              // 126
-      {
-      iang+=l; if (iang>=ip) iang-=ip;
-      double ar=csarr[2*iang], ai=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)                       // 125
-        {
-        CH2(ik,l ) += ar*C2(ik,j );
-        CH2(ik,lc) += ai*C2(ik,jc);
-        }
-      }
-    }
-  for (size_t ik=0; ik<idl1; ++ik)                         // 101
-    CH2(ik,0) = C2(ik,0);
-  for (size_t j=1; j<ipph; ++j)                              // 129
-    for (size_t ik=0; ik<idl1; ++ik)                         // 128
-      CH2(ik,0) += C2(ik,j);
-
-// everything in CH at this point!
-//memset(cc,0,ip*l1*ido*sizeof(double));
-
-  for (size_t k=0; k<l1; ++k)                                // 131
-    for (size_t i=0; i<ido; ++i)                             // 130
-      CC(i,0,k) = CH(i,k,0);
-
-  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
-    {
-    size_t j2=2*j-1;
-    for (size_t k=0; k<l1; ++k)                              // 136
-      {
-      CC(ido-1,j2,k) = CH(0,k,j);
-      CC(0,j2+1,k) = CH(0,k,jc);
-      }
-    }
-
-  if (ido==1) return;
-
-  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
-    {
-    size_t j2=2*j-1;
-    for(size_t k=0; k<l1; ++k)                               // 139
-      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
-        {
-        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
-        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
-        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
-        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
-        }
-    }
-  }
-#undef C1
-#undef C2
-#undef CH2
-
-#undef CH
-#undef CC
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-
-NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=2;
-
-  for (size_t k=0; k<l1; k++)
-    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
-  if ((ido&1)==0)
-    for (size_t k=0; k<l1; k++)
-      {
-      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
-      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
-      }
-  if (ido<=2) return;
-  for (size_t k=0; k<l1;++k)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double ti2, tr2;
-      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
-      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
-      }
-  }
-
-NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double tr2=2.*CC(ido-1,1,k);
-    double cr2=CC(0,0,k)+taur*tr2;
-    CH(0,k,0)=CC(0,0,k)+tr2;
-    double ci3=2.*taui*CC(0,2,k);
-    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
-    }
-  if (ido==1) return;
-  for (size_t k=0; k<l1; k++)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
-      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
-      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
-      double ci2=CC(i  ,0,k)+taur*ti2;
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
-      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
-      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
-      double di2, di3, dr2, dr3;
-      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
-      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      }
-  }
-
-NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=4;
-  static const double sqrt2=1.41421356237309504880;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double tr1, tr2;
-    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
-    double tr3=2.*CC(ido-1,1,k);
-    double tr4=2.*CC(0,2,k);
-    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
-    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
-    }
-  if ((ido&1)==0)
-    for (size_t k=0; k<l1; k++)
-      {
-      double tr1,tr2,ti1,ti2;
-      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
-      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
-      CH(ido-1,k,0)=tr2+tr2;
-      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
-      CH(ido-1,k,2)=ti2+ti2;
-      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
-      }
-  if (ido<=2) return;
-  for (size_t k=0; k<l1;++k)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-      size_t ic=ido-i;
-      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
-      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
-      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
-      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
-      PM (CH(i-1,k,0),cr3,tr2,tr3)
-      PM (CH(i  ,k,0),ci3,ti2,ti3)
-      PM (cr4,cr2,tr1,tr4)
-      PM (ci2,ci4,ti1,ti4)
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
-      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
-      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
-      }
-  }
-
-NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
-  double * restrict ch, const double * restrict wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-
-  for (size_t k=0; k<l1; k++)
-    {
-    double ti5=CC(0,2,k)+CC(0,2,k);
-    double ti4=CC(0,4,k)+CC(0,4,k);
-    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
-    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
-    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
-    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
-    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
-    double ci4, ci5;
-    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
-    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
-    }
-  if (ido==1) return;
-  for (size_t k=0; k<l1;++k)
-    for (size_t i=2; i<ido; i+=2)
-      {
-      size_t ic=ido-i;
-      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
-      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
-      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
-      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
-      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
-      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
-      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
-      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
-      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
-      double ci4, ci5, cr5, cr4;
-      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
-      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
-      PM(dr4,dr3,cr3,ci4)
-      PM(di3,di4,ci3,cr4)
-      PM(dr5,dr2,cr2,ci5)
-      PM(di2,di5,ci2,cr5)
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
-      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
-      }
-  }
-
-#undef CC
-#undef CH
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define C2(a,b) cc[(a)+idl1*(b)]
-#define CH2(a,b) ch[(a)+idl1*(b)]
-
-NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
-  double * restrict cc, double * restrict ch, const double * restrict wa,
-  const double * restrict csarr)
-  {
-  const size_t cdim=ip;
-  size_t ipph=(ip+1)/ 2;
-  size_t idl1 = ido*l1;
-
-  for (size_t k=0; k<l1; ++k)        // 102
-    for (size_t i=0; i<ido; ++i)     // 101
-      CH(i,k,0) = CC(i,0,k);
-  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
-    {
-    size_t j2=2*j-1;
-    for (size_t k=0; k<l1; ++k)
-      {
-      CH(0,k,j ) = 2*CC(ido-1,j2,k);
-      CH(0,k,jc) = 2*CC(0,j2+1,k);
-      }
-    }
-
-  if (ido!=1)
-    {
-    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
-      {
-      size_t j2=2*j-1;
-      for (size_t k=0; k<l1; ++k)
-        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
-          {
-          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
-          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
-          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
-          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
-          }
-      }
-    }
-  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
-    {
-    for (size_t ik=0; ik<idl1; ++ik)
-      {
-      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
-      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
-      }
-    size_t iang=2*l;
-    size_t j=3,jc=ip-3;
-    for(; j<ipph-3; j+=4,jc-=4)
-      {
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
-                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
-        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
-                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
-        }
-      }
-    for(; j<ipph-1; j+=2,jc-=2)
-      {
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
-      iang+=l; if(iang>ip) iang-=ip;
-      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
-        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
-        }
-      }
-    for(; j<ipph; ++j,--jc)
-      {
-      iang+=l; if(iang>ip) iang-=ip;
-      double war=csarr[2*iang], wai=csarr[2*iang+1];
-      for (size_t ik=0; ik<idl1; ++ik)
-        {
-        C2(ik,l ) += war*CH2(ik,j );
-        C2(ik,lc) += wai*CH2(ik,jc);
-        }
-      }
-    }
-  for (size_t j=1; j<ipph; ++j)
-    for (size_t ik=0; ik<idl1; ++ik)
-      CH2(ik,0) += CH2(ik,j);
-  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
-    for (size_t k=0; k<l1; ++k)
-      {
-      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
-      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
-      }
-
-  if (ido==1) return;
-
-  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
-    for (size_t k=0; k<l1; ++k)
-      for (size_t i=1; i<=ido-2; i+=2)
-        {
-        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
-        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
-        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
-        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
-        }
-
-// All in CH
-
-  for (size_t j=1; j<ip; ++j)
-    {
-    size_t is = (j-1)*(ido-1);
-    for (size_t k=0; k<l1; ++k)
-      {
-      size_t idij = is;
-      for (size_t i=1; i<=ido-2; i+=2)
-        {
-        double t1=CH(i,k,j), t2=CH(i+1,k,j);
-        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
-        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
-        idij+=2;
-        }
-      }
-    }
-  }
-#undef C1
-#undef C2
-#undef CH2
-
-#undef CC
-#undef CH
-#undef PM
-#undef MULPM
-#undef WA
-
-static void copy_and_norm(double *c, double *p1, size_t n, double fct)
-  {
-  if (p1!=c)
-    {
-    if (fct!=1.)
-      for (size_t i=0; i<n; ++i)
-        c[i] = fct*p1[i];
-    else
-      memcpy (c,p1,n*sizeof(double));
-    }
-  else
-    if (fct!=1.)
-      for (size_t i=0; i<n; ++i)
-        c[i] *= fct;
-  }
-
-WARN_UNUSED_RESULT
-static int rfftp_forward(rfftp_plan plan, double c[], double fct)
-  {
-  if (plan->length==1) return 0;
-  size_t n=plan->length;
-  size_t l1=n, nf=plan->nfct;
-  double *ch = RALLOC(double, n);
-  if (!ch) return -1;
-  double *p1=c, *p2=ch;
-
-  for(size_t k1=0; k1<nf;++k1)
-    {
-    size_t k=nf-k1-1;
-    size_t ip=plan->fct[k].fct;
-    size_t ido=n / l1;
-    l1 /= ip;
-    if(ip==4)
-      radf4(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==2)
-      radf2(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==3)
-      radf3(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==5)
-      radf5(ido, l1, p1, p2, plan->fct[k].tw);
-    else
-      {
-      radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
-      SWAP (p1,p2,double *);
-      }
-    SWAP (p1,p2,double *);
-    }
-  copy_and_norm(c,p1,n,fct);
-  DEALLOC(ch);
-  return 0;
-  }
-
-WARN_UNUSED_RESULT
-static int rfftp_backward(rfftp_plan plan, double c[], double fct)
-  {
-  if (plan->length==1) return 0;
-  size_t n=plan->length;
-  size_t l1=1, nf=plan->nfct;
-  double *ch = RALLOC(double, n);
-  if (!ch) return -1;
-  double *p1=c, *p2=ch;
-
-  for(size_t k=0; k<nf; k++)
-    {
-    size_t ip = plan->fct[k].fct,
-           ido= n/(ip*l1);
-    if(ip==4)
-      radb4(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==2)
-      radb2(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==3)
-      radb3(ido, l1, p1, p2, plan->fct[k].tw);
-    else if(ip==5)
-      radb5(ido, l1, p1, p2, plan->fct[k].tw);
-    else
-      radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
-    SWAP (p1,p2,double *);
-    l1*=ip;
-    }
-  copy_and_norm(c,p1,n,fct);
-  DEALLOC(ch);
-  return 0;
-  }
-
-WARN_UNUSED_RESULT
-static int rfftp_factorize (rfftp_plan plan)
-  {
-  size_t length=plan->length;
-  size_t nfct=0;
-  while ((length%4)==0)
-    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
-  if ((length%2)==0)
-    {
-    length>>=1;
-    // factor 2 should be at the front of the factor list
-    if (nfct>=NFCT) return -1;
-    plan->fct[nfct++].fct=2;
-    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
-    }
-  size_t maxl=(size_t)(sqrt((double)length))+1;
-  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
-    if ((length%divisor)==0)
-      {
-      while ((length%divisor)==0)
-        {
-        if (nfct>=NFCT) return -1;
-        plan->fct[nfct++].fct=divisor;
-        length/=divisor;
-        }
-      maxl=(size_t)(sqrt((double)length))+1;
-      }
-  if (length>1) plan->fct[nfct++].fct=length;
-  plan->nfct=nfct;
-  return 0;
-  }
-
-static size_t rfftp_twsize(rfftp_plan plan)
-  {
-  size_t twsize=0, l1=1;
-  for (size_t k=0; k<plan->nfct; ++k)
-    {
-    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
-    twsize+=(ip-1)*(ido-1);
-    if (ip>5) twsize+=2*ip;
-    l1*=ip;
-    }
-  return twsize;
-  return 0;
-  }
-
-WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
-  {
-  size_t length=plan->length;
-  double *twid = RALLOC(double, 2*length);
-  if (!twid) return -1;
-  sincos_2pibyn_half(length, twid);
-  size_t l1=1;
-  double *ptr=plan->mem;
-  for (size_t k=0; k<plan->nfct; ++k)
-    {
-    size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
-    if (k<plan->nfct-1) // last factor doesn't need twiddles
-      {
-      plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
-      for (size_t j=1; j<ip; ++j)
-        for (size_t i=1; i<=(ido-1)/2; ++i)
-          {
-          plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
-          plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
-          }
-      }
-    if (ip>5) // special factors required by *g functions
-      {
-      plan->fct[k].tws=ptr; ptr+=2*ip;
-      plan->fct[k].tws[0] = 1.;
-      plan->fct[k].tws[1] = 0.;
-      for (size_t i=1; i<=(ip>>1); ++i)
-        {
-        plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
-        plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
-        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
-        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
-        }
-      }
-    l1*=ip;
-    }
-  DEALLOC(twid);
-  return 0;
-  }
-
-NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
-  {
-  if (length==0) return NULL;
-  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
-  if (!plan) return NULL;
-  plan->length=length;
-  plan->nfct=0;
-  plan->mem=NULL;
-  for (size_t i=0; i<NFCT; ++i)
-    plan->fct[i]=(rfftp_fctdata){0,0,0};
-  if (length==1) return plan;
-  if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
-  size_t tws=rfftp_twsize(plan);
-  plan->mem=RALLOC(double,tws);
-  if (!plan->mem) { DEALLOC(plan); return NULL; }
-  if (rfftp_comp_twiddle(plan)!=0)
-    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
-  return plan;
-  }
-
-NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
-  {
-  DEALLOC(plan->mem);
-  DEALLOC(plan);
-  }
-
-typedef struct fftblue_plan_i
-  {
-  size_t n, n2;
-  cfftp_plan plan;
-  double *mem;
-  double *bk, *bkf;
-  } fftblue_plan_i;
-typedef struct fftblue_plan_i * fftblue_plan;
-
-NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
-  {
-  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
-  if (!plan) return NULL;
-  plan->n = length;
-  plan->n2 = good_size(plan->n*2-1);
-  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
-  if (!plan->mem) { DEALLOC(plan); return NULL; }
-  plan->bk  = plan->mem;
-  plan->bkf = plan->bk+2*plan->n;
-
-/* initialize b_k */
-  double *tmp = RALLOC(double,4*plan->n);
-  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
-  sincos_2pibyn(2*plan->n,tmp);
-  plan->bk[0] = 1;
-  plan->bk[1] = 0;
-
-  size_t coeff=0;
-  for (size_t m=1; m<plan->n; ++m)
-    {
-    coeff+=2*m-1;
-    if (coeff>=2*plan->n) coeff-=2*plan->n;
-    plan->bk[2*m  ] = tmp[2*coeff  ];
-    plan->bk[2*m+1] = tmp[2*coeff+1];
-    }
-
-  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
-  double xn2 = 1./plan->n2;
-  plan->bkf[0] = plan->bk[0]*xn2;
-  plan->bkf[1] = plan->bk[1]*xn2;
-  for (size_t m=2; m<2*plan->n; m+=2)
-    {
-    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
-    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
-    }
-  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
-    plan->bkf[m]=0.;
-  plan->plan=make_cfftp_plan(plan->n2);
-  if (!plan->plan)
-    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
-  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
-    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
-  DEALLOC(tmp);
-
-  return plan;
-  }
-
-NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
-  {
-  DEALLOC(plan->mem);
-  destroy_cfftp_plan(plan->plan);
-  DEALLOC(plan);
-  }
-
-NOINLINE WARN_UNUSED_RESULT
-static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
-  {
-  size_t n=plan->n;
-  size_t n2=plan->n2;
-  double *bk  = plan->bk;
-  double *bkf = plan->bkf;
-  double *akf = RALLOC(double, 2*n2);
-  if (!akf) return -1;
-
-/* initialize a_k and FFT it */
-  if (isign>0)
-    for (size_t m=0; m<2*n; m+=2)
-      {
-      akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
-      akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
-      }
-  else
-    for (size_t m=0; m<2*n; m+=2)
-      {
-      akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
-      akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
-      }
-  for (size_t m=2*n; m<2*n2; ++m)
-    akf[m]=0;
-
-  if (cfftp_forward (plan->plan,akf,fct)!=0)
-    { DEALLOC(akf); return -1; }
-
-/* do the convolution */
-  if (isign>0)
-    for (size_t m=0; m<2*n2; m+=2)
-      {
-      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-  else
-    for (size_t m=0; m<2*n2; m+=2)
-      {
-      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-
-/* inverse FFT */
-  if (cfftp_backward (plan->plan,akf,1.)!=0)
-    { DEALLOC(akf); return -1; }
-
-/* multiply by b_k */
-  if (isign>0)
-    for (size_t m=0; m<2*n; m+=2)
-      {
-      c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
-      c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  else
-    for (size_t m=0; m<2*n; m+=2)
-      {
-      c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
-      c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  DEALLOC(akf);
-  return 0;
-  }
-
-WARN_UNUSED_RESULT
-static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
-  { return fftblue_fft(plan,c,1,fct); }
-
-WARN_UNUSED_RESULT
-static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
-  { return fftblue_fft(plan,c,-1,fct); }
-
-WARN_UNUSED_RESULT
-static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
-  {
-  size_t n=plan->n;
-  double *tmp = RALLOC(double,2*n);
-  if (!tmp) return -1;
-  tmp[0]=c[0];
-  tmp[1]=0.;
-  memcpy (tmp+2,c+1, (n-1)*sizeof(double));
-  if ((n&1)==0) tmp[n+1]=0.;
-  for (size_t m=2; m<n; m+=2)
-    {
-    tmp[2*n-m]=tmp[m];
-    tmp[2*n-m+1]=-tmp[m+1];
-    }
-  if (fftblue_fft(plan,tmp,1,fct)!=0)
-    { DEALLOC(tmp); return -1; }
-  for (size_t m=0; m<n; ++m)
-    c[m] = tmp[2*m];
-  DEALLOC(tmp);
-  return 0;
-  }
-
-WARN_UNUSED_RESULT
-static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
-  {
-  size_t n=plan->n;
-  double *tmp = RALLOC(double,2*n);
-  if (!tmp) return -1;
-  for (size_t m=0; m<n; ++m)
-    {
-    tmp[2*m] = c[m];
-    tmp[2*m+1] = 0.;
-    }
-  if (fftblue_fft(plan,tmp,-1,fct)!=0)
-    { DEALLOC(tmp); return -1; }
-  c[0] = tmp[0];
-  memcpy (c+1, tmp+2, (n-1)*sizeof(double));
-  DEALLOC(tmp);
-  return 0;
-  }
-
-typedef struct cfft_plan_i
-  {
-  cfftp_plan packplan;
-  fftblue_plan blueplan;
-  } cfft_plan_i;
-
-cfft_plan make_cfft_plan (size_t length)
-  {
-  if (length==0) return NULL;
-  cfft_plan plan = RALLOC(cfft_plan_i,1);
-  if (!plan) return NULL;
-  plan->blueplan=0;
-  plan->packplan=0;
-  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
-    {
-    plan->packplan=make_cfftp_plan(length);
-    if (!plan->packplan) { DEALLOC(plan); return NULL; }
-    return plan;
-    }
-  double comp1 = cost_guess(length);
-  double comp2 = 2*cost_guess(good_size(2*length-1));
-  comp2*=1.5; /* fudge factor that appears to give good overall performance */
-  if (comp2<comp1) // use Bluestein
-    {
-    plan->blueplan=make_fftblue_plan(length);
-    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
-    }
-  else
-    {
-    plan->packplan=make_cfftp_plan(length);
-    if (!plan->packplan) { DEALLOC(plan); return NULL; }
-    }
-  return plan;
-  }
-
-void destroy_cfft_plan (cfft_plan plan)
-  {
-  if (plan->blueplan)
-    destroy_fftblue_plan(plan->blueplan);
-  if (plan->packplan)
-    destroy_cfftp_plan(plan->packplan);
-  DEALLOC(plan);
-  }
-
-WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct)
-  {
-  if (plan->packplan)
-    return cfftp_backward(plan->packplan,c,fct);
-  // if (plan->blueplan)
-  return cfftblue_backward(plan->blueplan,c,fct);
-  }
-
-WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct)
-  {
-  if (plan->packplan)
-    return cfftp_forward(plan->packplan,c,fct);
-  // if (plan->blueplan)
-  return cfftblue_forward(plan->blueplan,c,fct);
-  }
-
-typedef struct rfft_plan_i
-  {
-  rfftp_plan packplan;
-  fftblue_plan blueplan;
-  } rfft_plan_i;
-
-rfft_plan make_rfft_plan (size_t length)
-  {
-  if (length==0) return NULL;
-  rfft_plan plan = RALLOC(rfft_plan_i,1);
-  if (!plan) return NULL;
-  plan->blueplan=0;
-  plan->packplan=0;
-  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
-    {
-    plan->packplan=make_rfftp_plan(length);
-    if (!plan->packplan) { DEALLOC(plan); return NULL; }
-    return plan;
-    }
-  double comp1 = 0.5*cost_guess(length);
-  double comp2 = 2*cost_guess(good_size(2*length-1));
-  comp2*=1.5; /* fudge factor that appears to give good overall performance */
-  if (comp2<comp1) // use Bluestein
-    {
-    plan->blueplan=make_fftblue_plan(length);
-    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
-    }
-  else
-    {
-    plan->packplan=make_rfftp_plan(length);
-    if (!plan->packplan) { DEALLOC(plan); return NULL; }
-    }
-  return plan;
-  }
-
-void destroy_rfft_plan (rfft_plan plan)
-  {
-  if (plan->blueplan)
-    destroy_fftblue_plan(plan->blueplan);
-  if (plan->packplan)
-    destroy_rfftp_plan(plan->packplan);
-  DEALLOC(plan);
-  }
-
-size_t rfft_length(rfft_plan plan)
-  {
-  if (plan->packplan) return plan->packplan->length;
-  return plan->blueplan->n;
-  }
-
-size_t cfft_length(cfft_plan plan)
-  {
-  if (plan->packplan) return plan->packplan->length;
-  return plan->blueplan->n;
-  }
-
-WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct)
-  {
-  if (plan->packplan)
-    return rfftp_backward(plan->packplan,c,fct);
-  else // if (plan->blueplan)
-    return rfftblue_backward(plan->blueplan,c,fct);
-  }
-
-WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct)
-  {
-  if (plan->packplan)
-    return rfftp_forward(plan->packplan,c,fct);
-  else // if (plan->blueplan)
-    return rfftblue_forward(plan->blueplan,c,fct);
-  }
diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h
deleted file mode 100644
index af86eba..0000000
--- a/pocketfft/pocketfft.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * This file is part of pocketfft.
- * Licensed under a 3-clause BSD style license - see LICENSE.md
- */
-
-/*! \file pocketfft.h
- *  Public interface of the pocketfft library
- *
- *  Copyright (C) 2008-2019 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef POCKETFFT_H
-#define POCKETFFT_H
-
-#include <stdlib.h>
-
-struct cfft_plan_i;
-typedef struct cfft_plan_i * cfft_plan;
-cfft_plan make_cfft_plan (size_t length);
-void destroy_cfft_plan (cfft_plan plan);
-int cfft_backward(cfft_plan plan, double c[], double fct);
-int cfft_forward(cfft_plan plan, double c[], double fct);
-size_t cfft_length(cfft_plan plan);
-
-struct rfft_plan_i;
-typedef struct rfft_plan_i * rfft_plan;
-rfft_plan make_rfft_plan (size_t length);
-void destroy_rfft_plan (rfft_plan plan);
-int rfft_backward(rfft_plan plan, double c[], double fct);
-int rfft_forward(rfft_plan plan, double c[], double fct);
-size_t rfft_length(rfft_plan plan);
-
-#endif

From ef4a9512e205f23989f0dcf4150f22f57dee824e Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 27 Feb 2019 11:17:39 +0100
Subject: [PATCH 07/17] fix

---
 configure.ac | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configure.ac b/configure.ac
index 95cb187..5763c4d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -29,6 +29,8 @@ PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"]
 tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
+
+LIBS=$POCKETFFT_LIBS
 PACKAGE_LIBS="-lsharp"
 
 dnl

From e4b490b34fae892e0cac3295f0bda5d7169a8560 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 28 Feb 2019 10:29:56 +0100
Subject: [PATCH 08/17] reorganization

---
 Makefile.am                                 |  8 +--
 c_utils/walltime_c.c                        | 63 ---------------------
 c_utils/walltime_c.h                        | 50 ----------------
 libsharp/sharp.c                            |  7 +--
 libsharp/sharp.h                            |  8 ++-
 libsharp/sharp_almhelpers.c                 |  2 +-
 libsharp/sharp_almhelpers.h                 |  4 +-
 libsharp/sharp_core.c                       | 15 +++--
 libsharp/sharp_core_inc.c                   |  6 +-
 libsharp/sharp_cxx.h                        |  4 +-
 libsharp/sharp_geomhelpers.c                |  2 +-
 libsharp/sharp_geomhelpers.h                |  4 +-
 libsharp/sharp_internal.h                   |  8 +--
 libsharp/sharp_legendre_roots.c             |  2 +-
 libsharp/sharp_mpi.c                        |  4 +-
 libsharp/sharp_mpi.h                        |  4 +-
 c_utils/c_utils.c => libsharp/sharp_utils.c | 61 +++++++++++++++-----
 c_utils/c_utils.h => libsharp/sharp_utils.h | 60 +++++++-------------
 libsharp/sharp_ylmgen_c.c                   |  6 +-
 {c_utils => test}/memusage.c                |  2 +-
 {c_utils => test}/memusage.h                |  4 +-
 {libsharp => test}/sharp_testsuite.c        |  6 +-
 22 files changed, 120 insertions(+), 210 deletions(-)
 delete mode 100644 c_utils/walltime_c.c
 delete mode 100644 c_utils/walltime_c.h
 rename c_utils/c_utils.c => libsharp/sharp_utils.c (58%)
 rename c_utils/c_utils.h => libsharp/sharp_utils.h (68%)
 rename {c_utils => test}/memusage.c (98%)
 rename {c_utils => test}/memusage.h (96%)
 rename {libsharp => test}/sharp_testsuite.c (99%)

diff --git a/Makefile.am b/Makefile.am
index f946f3a..44f5b35 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,10 +3,8 @@ ACLOCAL_AMFLAGS = -I m4
 lib_LTLIBRARIES = libsharp.la
 
 libsharp_la_SOURCES = \
-  c_utils/c_utils.c \
-  c_utils/c_utils.h \
-  c_utils/walltime_c.c \
-  c_utils/walltime_c.h \
+  libsharp/sharp_utils.c \
+  libsharp/sharp_utils.h \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_core.c \
@@ -48,7 +46,7 @@ EXTRA_DIST = \
   runtest.sh
 
 check_PROGRAMS = sharp_testsuite
-sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h
+sharp_testsuite_SOURCES = test/sharp_testsuite.c test/memusage.c test/memusage.h
 sharp_testsuite_LDADD = libsharp.la @POCKETFFT_LIBS@
 
 TESTS = runtest.sh
diff --git a/c_utils/walltime_c.c b/c_utils/walltime_c.c
deleted file mode 100644
index f6b8a08..0000000
--- a/c_utils/walltime_c.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
-
-/*
- *  Functionality for reading wall clock time
- *
- *  Copyright (C) 2010-2019 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#if defined (_OPENMP)
-#include <omp.h>
-#elif defined (USE_MPI)
-#include "mpi.h"
-#elif defined (_WIN32)
-#include <Windows.h>
-#else
-#include <sys/time.h>
-#include <stdlib.h>
-#endif
-
-#include "c_utils/walltime_c.h"
-
-double wallTime(void)
-  {
-#if defined (_OPENMP)
-  return omp_get_wtime();
-#elif defined (USE_MPI)
-  return MPI_Wtime();
-#elif defined (_WIN32)
-  static double inv_freq = -1.;
-  if (inv_freq<0)
-    {
-    LARGE_INTEGER freq;
-    QueryPerformanceFrequency(&freq);
-    inv_freq = 1. / double(freq.QuadPart);
-    }
-  LARGE_INTEGER count;
-  QueryPerformanceCounter(&count);
-  return count.QuadPart*inv_freq;
-#else
-  struct timeval t;
-  gettimeofday(&t, NULL);
-  return t.tv_sec + 1e-6*t.tv_usec;
-#endif
-  }
diff --git a/c_utils/walltime_c.h b/c_utils/walltime_c.h
deleted file mode 100644
index d74dc6c..0000000
--- a/c_utils/walltime_c.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
-
-/*! \file walltime_c.h
- *  Functionality for reading wall clock time
- *
- *  Copyright (C) 2010-2019 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_WALLTIME_C_H
-#define PLANCK_WALLTIME_C_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! Returns an approximation of the current wall time (in seconds).
-    The first available of the following timers will be used:
-    <ul>
-    <li> \a omp_get_wtime(), if OpenMP is available
-    <li> \a MPI_Wtime(), if MPI is available
-    <li> \a gettimeofday() otherwise
-    </ul>
-    \note Only useful for measuring time differences.
-    \note This function has an execution time between 10 and 100 nanoseconds. */
-double wallTime(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index 58d2565..a4345f2 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -30,8 +30,7 @@
 #include "pocketfft/pocketfft.h"
 #include "libsharp/sharp_ylmgen_c.h"
 #include "libsharp/sharp_internal.h"
-#include "c_utils/c_utils.h"
-#include "c_utils/walltime_c.h"
+#include "libsharp/sharp_utils.h"
 #include "libsharp/sharp_almhelpers.h"
 #include "libsharp/sharp_geomhelpers.h"
 
@@ -836,7 +835,7 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
 
 NOINLINE static void sharp_execute_job (sharp_job *job)
   {
-  double timer=wallTime();
+  double timer=sharp_wallTime();
   job->opcnt=0;
   int lmax = job->ainfo->lmax,
       mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
@@ -910,7 +909,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
 
   DEALLOC(job->norm_l);
   dealloc_phase (job);
-  job->time=wallTime()-timer;
+  job->time=sharp_wallTime()-timer;
   }
 
 static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 657e369..1fe1566 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_H
-#define PLANCK_SHARP_H
+#ifndef SHARP_SHARP_H
+#define SHARP_SHARP_H
 
 #include <stddef.h>
 
@@ -253,6 +253,10 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
 
 /*! \} */
 
+int sharp_get_mlim (int lmax, int spin, double sth, double cth);
+int sharp_veclen(void);
+const char *sharp_architecture(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c
index c744b47..59b564c 100644
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@@ -26,7 +26,7 @@
  */
 
 #include "libsharp/sharp_almhelpers.h"
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
 
 void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
   sharp_alm_info **alm_info)
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 1d40d01..3dc85f5 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke
  */
 
-#ifndef PLANCK_SHARP_ALMHELPERS_H
-#define PLANCK_SHARP_ALMHELPERS_H
+#ifndef SHARP_ALMHELPERS_H
+#define SHARP_ALMHELPERS_H
 
 #include "libsharp/sharp.h"
 
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2c78f2b..1dec17a 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -105,6 +105,7 @@ DECL2(avx)
   architecture_ = sharp_architecture_default;
   }
 
+#pragma GCC visibility push(hidden)
 
 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
@@ -113,16 +114,20 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   if (!inner_loop_) assign_funcs();
   inner_loop_(job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
   }
-int sharp_veclen(void)
-  {
-  if (!veclen_) assign_funcs();
-  return veclen_();
-  }
+
 int sharp_max_nvec(int spin)
   {
   if (!max_nvec_) assign_funcs();
   return max_nvec_(spin);
   }
+
+#pragma GCC visibility pop
+
+int sharp_veclen(void)
+  {
+  if (!veclen_) assign_funcs();
+  return veclen_();
+  }
 const char *sharp_architecture(void)
   {
   if (!architecture_) assign_funcs();
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 682558c..612a778 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -40,7 +40,9 @@
 #include "libsharp/sharp_vecsupport.h"
 #include "libsharp/sharp.h"
 #include "libsharp/sharp_internal.h"
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
+
+#pragma GCC visibility push(hidden)
 
 // In the following, we explicitly allow the compiler to contract floating
 // point operations, like multiply-and-add.
@@ -1208,6 +1210,8 @@ const char *XARCH(sharp_architecture)(void)
   return xstr(ARCH);
   }
 
+#pragma GCC visibility pop
+
 #endif
 
 #endif
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index e37d491..60995ed 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke
  */
 
-#ifndef PLANCK_SHARP_CXX_H
-#define PLANCK_SHARP_CXX_H
+#ifndef SHARP_CXX_H
+#define SHARP_CXX_H
 
 #include <complex>
 #include "libsharp/sharp.h"
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index f8f5152..cd7c6fa 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -28,7 +28,7 @@
 #include <math.h>
 #include "libsharp/sharp_geomhelpers.h"
 #include "libsharp/sharp_legendre_roots.h"
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
 #include "pocketfft/pocketfft.h"
 
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
index 27b1246..d0a4f33 100644
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke
  */
 
-#ifndef PLANCK_SHARP_GEOMHELPERS_H
-#define PLANCK_SHARP_GEOMHELPERS_H
+#ifndef SHARP_GEOMHELPERS_H
+#define SHARP_GEOMHELPERS_H
 
 #include "libsharp/sharp.h"
 
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 78ff666..89c4429 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_INTERNAL_H
-#define PLANCK_SHARP_INTERNAL_H
+#ifndef SHARP_INTERNAL_H
+#define SHARP_INTERNAL_H
 
 #ifdef __cplusplus
 #error This header file cannot be included from C++, only from C
@@ -54,14 +54,10 @@ typedef struct
   unsigned long long opcnt;
   } sharp_job;
 
-int sharp_get_mlim (int lmax, int spin, double sth, double cth);
-
 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
   const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
   const int *mlim);
 
-int sharp_veclen(void);
 int sharp_max_nvec(int spin);
-const char *sharp_architecture(void);
 
 #endif
diff --git a/libsharp/sharp_legendre_roots.c b/libsharp/sharp_legendre_roots.c
index ec81ecb..648cfca 100644
--- a/libsharp/sharp_legendre_roots.c
+++ b/libsharp/sharp_legendre_roots.c
@@ -8,7 +8,7 @@
 
 #include <math.h>
 #include "libsharp/sharp_legendre_roots.h"
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
 
 static inline double one_minus_x2 (double x)
   { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
index 02c76c3..2be016a 100644
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@@ -211,7 +211,7 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
     { sharp_execute_job (job); return; }
 
   MPI_Barrier(comm);
-  double timer=wallTime();
+  double timer=sharp_wallTime();
   job->opcnt=0;
   sharp_mpi_info minfo;
   sharp_make_mpi_info(comm, job, &minfo);
@@ -306,7 +306,7 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
     dealloc_phase (job);
     }
   sharp_destroy_mpi_info(&minfo);
-  job->time=wallTime()-timer;
+  job->time=sharp_wallTime()-timer;
   }
 
 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index bb26ff0..0195032 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_MPI_H
-#define PLANCK_SHARP_MPI_H
+#ifndef SHARP_MPI_H
+#define SHARP_MPI_H
 
 #include <mpi.h>
 #include "libsharp/sharp.h"
diff --git a/c_utils/c_utils.c b/libsharp/sharp_utils.c
similarity index 58%
rename from c_utils/c_utils.c
rename to libsharp/sharp_utils.c
index cf4f409..c73edf8 100644
--- a/c_utils/c_utils.c
+++ b/libsharp/sharp_utils.c
@@ -1,22 +1,22 @@
 /*
- *  This file is part of libc_utils.
+ *  This file is part of libsharp.
  *
- *  libc_utils is free software; you can redistribute it and/or modify
+ *  libsharp is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
- *  libc_utils is distributed in the hope that it will be useful,
+ *  libsharp is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
+ *  along with libsharp; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik */
+/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */
 
 /*
  *  Convenience functions
@@ -26,17 +26,13 @@
  */
 
 #include <stdio.h>
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
 
-void util_fail_ (const char *file, int line, const char *func, const char *msg)
+void sharp_fail_ (const char *file, int line, const char *func, const char *msg)
   {
   fprintf(stderr,"%s, %i (%s):\n%s\n",file,line,func,msg);
   exit(1);
   }
-void util_warn_ (const char *file, int line, const char *func, const char *msg)
-  {
-  fprintf(stderr,"%s, %i (%s):\n%s\n",file,line,func,msg);
-  }
 
 /* This function tries to avoid allocations with a total size close to a high
    power of two (called the "critical stride" here), by adding a few more bytes
@@ -53,7 +49,7 @@ static size_t manipsize(size_t sz)
 
 #ifdef __SSE__
 #include <xmmintrin.h>
-void *util_malloc_ (size_t sz)
+void *sharp_malloc_ (size_t sz)
   {
   void *res;
   if (sz==0) return NULL;
@@ -61,10 +57,10 @@ void *util_malloc_ (size_t sz)
   UTIL_ASSERT(res,"_mm_malloc() failed");
   return res;
   }
-void util_free_ (void *ptr)
+void sharp_free_ (void *ptr)
   { if ((ptr)!=NULL) _mm_free(ptr); }
 #else
-void *util_malloc_ (size_t sz)
+void *sharp_malloc_ (size_t sz)
   {
   void *res;
   if (sz==0) return NULL;
@@ -72,6 +68,41 @@ void *util_malloc_ (size_t sz)
   UTIL_ASSERT(res,"malloc() failed");
   return res;
   }
-void util_free_ (void *ptr)
+void sharp_free_ (void *ptr)
   { if ((ptr)!=NULL) free(ptr); }
 #endif
+
+#if defined (_OPENMP)
+#include <omp.h>
+#elif defined (USE_MPI)
+#include "mpi.h"
+#elif defined (_WIN32)
+#include <Windows.h>
+#else
+#include <sys/time.h>
+#include <stdlib.h>
+#endif
+
+double sharp_wallTime(void)
+  {
+#if defined (_OPENMP)
+  return omp_get_wtime();
+#elif defined (USE_MPI)
+  return MPI_Wtime();
+#elif defined (_WIN32)
+  static double inv_freq = -1.;
+  if (inv_freq<0)
+    {
+    LARGE_INTEGER freq;
+    QueryPerformanceFrequency(&freq);
+    inv_freq = 1. / double(freq.QuadPart);
+    }
+  LARGE_INTEGER count;
+  QueryPerformanceCounter(&count);
+  return count.QuadPart*inv_freq;
+#else
+  struct timeval t;
+  gettimeofday(&t, NULL);
+  return t.tv_sec + 1e-6*t.tv_usec;
+#endif
+  }
diff --git a/c_utils/c_utils.h b/libsharp/sharp_utils.h
similarity index 68%
rename from c_utils/c_utils.h
rename to libsharp/sharp_utils.h
index 5aed9a1..d2a1c74 100644
--- a/c_utils/c_utils.h
+++ b/libsharp/sharp_utils.h
@@ -26,8 +26,8 @@
  *  \note This file should only be included from .c files, NOT from .h files.
  */
 
-#ifndef PLANCK_C_UTILS_H
-#define PLANCK_C_UTILS_H
+#ifndef SHARP_UTILS_H
+#define SHARP_UTILS_H
 
 #include <math.h>
 #include <stdlib.h>
@@ -37,10 +37,9 @@
 extern "C" {
 #endif
 
-void util_fail_ (const char *file, int line, const char *func, const char *msg);
-void util_warn_ (const char *file, int line, const char *func, const char *msg);
-void *util_malloc_ (size_t sz);
-void util_free_ (void *ptr);
+void sharp_fail_ (const char *file, int line, const char *func, const char *msg);
+void *sharp_malloc_ (size_t sz);
+void sharp_free_ (void *ptr);
 
 #if defined (__GNUC__)
 #define UTIL_FUNC_NAME__ __func__
@@ -53,43 +52,32 @@ void util_free_ (void *ptr);
     source file name and line number of the call, as well as \a msg;
     then exit the program with an error status. */
 #define UTIL_ASSERT(cond,msg) \
-  if(!(cond)) util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
+  if(!(cond)) sharp_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
 /*! \def UTIL_WARN(cond,msg)
     If \a cond is false, print an warning containing function name,
     source file name and line number of the call, as well as \a msg. */
-#define UTIL_WARN(cond,msg) \
-  if(!(cond)) util_warn_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
-/*! \def UTIL_FAIL(msg)
-    Print an error message containing function name,
-    source file name and line number of the call, as well as \a msg;
-    then exit the program with an error status. */
 #define UTIL_FAIL(msg) \
-  util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
+  sharp_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
 
 /*! \def ALLOC(ptr,type,num)
     Allocate space for \a num objects of type \a type. Make sure that the
     allocation succeeded, else stop the program with an error. Return the
     resulting pointer in \a ptr. */
 #define ALLOC(ptr,type,num) \
-  do { (ptr)=(type *)util_malloc_((num)*sizeof(type)); } while (0)
+  do { (ptr)=(type *)sharp_malloc_((num)*sizeof(type)); } while (0)
 /*! \def RALLOC(type,num)
     Allocate space for \a num objects of type \a type. Make sure that the
     allocation succeeded, else stop the program with an error. Cast the
     resulting pointer to \a (type*). */
 #define RALLOC(type,num) \
-  ((type *)util_malloc_((num)*sizeof(type)))
+  ((type *)sharp_malloc_((num)*sizeof(type)))
 /*! \def DEALLOC(ptr)
     Deallocate \a ptr. It must have been allocated using \a ALLOC or
     \a RALLOC. */
 #define DEALLOC(ptr) \
-  do { util_free_(ptr); (ptr)=NULL; } while(0)
+  do { sharp_free_(ptr); (ptr)=NULL; } while(0)
 #define RESIZE(ptr,type,num) \
-  do { util_free_(ptr); ALLOC(ptr,type,num); } while(0)
-#define GROW(ptr,type,sz_old,sz_new) \
-  do { \
-    if ((sz_new)>(sz_old)) \
-      { RESIZE(ptr,type,2*(sz_new));sz_old=2*(sz_new); } \
-  } while(0)
+  do { sharp_free_(ptr); ALLOC(ptr,type,num); } while(0)
 /*! \def SET_ARRAY(ptr,i1,i2,val)
     Set the entries \a ptr[i1] ... \a ptr[i2-1] to \a val. */
 #define SET_ARRAY(ptr,i1,i2,val) \
@@ -97,14 +85,6 @@ void util_free_ (void *ptr);
     ptrdiff_t cnt_; \
     for (cnt_=(i1);cnt_<(i2);++cnt_) (ptr)[cnt_]=(val); \
     } while(0)
-/*! \def COPY_ARRAY(src,dest,i1,i2)
-    Copy the entries \a src[i1] ... \a src[i2-1] to
-    \a dest[i1] ... \a dest[i2-1]. */
-#define COPY_ARRAY(src,dest,i1,i2) \
-  do { \
-    ptrdiff_t cnt_; \
-    for (cnt_=(i1);cnt_<(i2);++cnt_) (dest)[cnt_]=(src)[cnt_]; \
-    } while(0)
 
 #define ALLOC2D(ptr,type,num1,num2) \
   do { \
@@ -119,8 +99,6 @@ void util_free_ (void *ptr);
 
 #define FAPPROX(a,b,eps) \
   (fabs((a)-(b))<((eps)*fabs(b)))
-#define ABSAPPROX(a,b,eps) \
-  (fabs((a)-(b))<(eps))
 #define IMAX(a,b) \
   (((a)>(b)) ? (a) : (b))
 #define IMIN(a,b) \
@@ -129,12 +107,16 @@ void util_free_ (void *ptr);
 #define SWAP(a,b,type) \
   do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
 
-#define CHECK_STACK_ALIGN(align) \
-  do { \
-    double foo; \
-    UTIL_WARN((((size_t)(&foo))&(align-1))==0, \
-      "WARNING: stack not sufficiently aligned!"); \
-    } while(0)
+/*! Returns an approximation of the current wall time (in seconds).
+    The first available of the following timers will be used:
+    <ul>
+    <li> \a omp_get_wtime(), if OpenMP is available
+    <li> \a MPI_Wtime(), if MPI is available
+    <li> \a gettimeofday() otherwise
+    </ul>
+    \note Only useful for measuring time differences.
+    \note This function has an execution time between 10 and 100 nanoseconds. */
+double sharp_wallTime(void);
 
 #ifdef __cplusplus
 }
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 35f3397..791908c 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -28,7 +28,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include "libsharp/sharp_ylmgen_c.h"
-#include "c_utils/c_utils.h"
+#include "libsharp/sharp_utils.h"
+
+#pragma GCC visibility push(hidden)
 
 static inline void normalize (double *val, int *scale, double xfmax)
   {
@@ -252,3 +254,5 @@ double *sharp_Ylmgen_get_d1norm (int lmax)
     res[l] = (l<1) ? 0. : 0.5*sqrt(l*(l+1.)*(2*l+1.)/(4*pi));
   return res;
   }
+
+#pragma GCC visibility pop
diff --git a/c_utils/memusage.c b/test/memusage.c
similarity index 98%
rename from c_utils/memusage.c
rename to test/memusage.c
index 7fe5fa3..3db8e13 100644
--- a/c_utils/memusage.c
+++ b/test/memusage.c
@@ -27,7 +27,7 @@
 
 #include <stdio.h>
 #include <string.h>
-#include "c_utils/memusage.h"
+#include "test/memusage.h"
 
 double residentSetSize(void)
   {
diff --git a/c_utils/memusage.h b/test/memusage.h
similarity index 96%
rename from c_utils/memusage.h
rename to test/memusage.h
index 73c55c3..023821a 100644
--- a/c_utils/memusage.h
+++ b/test/memusage.h
@@ -25,8 +25,8 @@
  *  \author Martin Reinecke
  */
 
-#ifndef PLANCK_MEMUSAGE_H
-#define PLANCK_MEMUSAGE_H
+#ifndef SHARP_MEMUSAGE_H
+#define SHARP_MEMUSAGE_H
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_testsuite.c b/test/sharp_testsuite.c
similarity index 99%
rename from libsharp/sharp_testsuite.c
rename to test/sharp_testsuite.c
index 4392a61..c9986be 100644
--- a/libsharp/sharp_testsuite.c
+++ b/test/sharp_testsuite.c
@@ -26,6 +26,7 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <complex.h>
 #ifdef USE_MPI
 #include "mpi.h"
 #include "libsharp/sharp_mpi.h"
@@ -34,11 +35,10 @@
 #include <omp.h>
 #endif
 #include "libsharp/sharp.h"
-#include "libsharp/sharp_internal.h"
 #include "libsharp/sharp_geomhelpers.h"
 #include "libsharp/sharp_almhelpers.h"
-#include "c_utils/c_utils.h"
-#include "c_utils/memusage.h"
+#include "libsharp/sharp_utils.h"
+#include "test/memusage.h"
 
 static void OpenMP_status(void)
   {

From 0944aadfdec1dcbe1e69336ae579c06a0ea25242 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 1 Mar 2019 10:17:16 +0100
Subject: [PATCH 09/17] more cleanups

---
 Makefile.am  | 4 ++--
 configure.ac | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 44f5b35..94ded11 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,6 +16,8 @@ libsharp_la_SOURCES = \
   libsharp/sharp_vecsupport.h \
   libsharp/sharp_ylmgen_c.h
 
+AM_CFLAGS = @AM_CFLAGS@
+
 if HAVE_MULTIARCH
 
 libavx_la_SOURCES = libsharp/sharp_core_inc.c
@@ -51,8 +53,6 @@ sharp_testsuite_LDADD = libsharp.la @POCKETFFT_LIBS@
 
 TESTS = runtest.sh
 
-AM_CFLAGS = @AM_CFLAGS@ @POCKETFFT_CFLAGS@
-
 pkgconfigdir = $(libdir)/pkgconfig
 nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
 
diff --git a/configure.ac b/configure.ac
index 5763c4d..f636f83 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,7 +21,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 
 AC_PROG_CC_C99
-
+AC_OPENMP
 AC_PROG_LIBTOOL
 
 PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"])
@@ -29,19 +29,18 @@ PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"]
 tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
+AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS $POCKETFFT_CFLAGS"
 
 LIBS=$POCKETFFT_LIBS
 PACKAGE_LIBS="-lsharp"
+PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
+PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
 
 dnl
 dnl Create pkgconfig .pc file.
 dnl
 AX_CREATE_PKGCONFIG_INFO(,,,,[])
-AC_SUBST([LIBS])
 AC_SUBST([AM_CFLAGS])
-AC_SUBST([AM_LDFLAGS])
-AC_SUBST([POCKETFFT_CFLAGS])
-AC_SUBST([POCKETFFT_LIBS])
 
 AC_CONFIG_FILES([Makefile])
 AC_OUTPUT

From 9435d29596b00e0b085170c5fbf8706008b60c00 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Mon, 4 Mar 2019 09:54:38 +0100
Subject: [PATCH 10/17] add library versioning

---
 Makefile.am | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile.am b/Makefile.am
index 94ded11..4eb08c6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,6 +16,14 @@ libsharp_la_SOURCES = \
   libsharp/sharp_vecsupport.h \
   libsharp/sharp_ylmgen_c.h
 
+# format is "current:revision:age"
+# any change: increase revision
+# any interface change: increase current, revision=0
+# any backward-compatible change: increase age
+# any backward-incompatible change: age=0
+# ==> age <= current
+libsharp_la_LDFLAGS = -version-info 0:0:0
+
 AM_CFLAGS = @AM_CFLAGS@
 
 if HAVE_MULTIARCH

From 8c854b705e191343b4096407140846f36a70499b Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 7 May 2019 13:56:08 +0200
Subject: [PATCH 11/17] prepare simple Fortran interface

---
 Makefile.am            |    4 +-
 configure.ac           |    5 +-
 fortran/sharp.f90      |  242 +++++
 fortran/test_sharp.f90 |   56 +
 pocketfft/pocketfft.c  | 2194 ++++++++++++++++++++++++++++++++++++++++
 pocketfft/pocketfft.h  |   50 +
 6 files changed, 2546 insertions(+), 5 deletions(-)
 create mode 100644 fortran/sharp.f90
 create mode 100644 fortran/test_sharp.f90
 create mode 100644 pocketfft/pocketfft.c
 create mode 100644 pocketfft/pocketfft.h

diff --git a/Makefile.am b/Makefile.am
index 4eb08c6..b172233 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,6 +3,8 @@ ACLOCAL_AMFLAGS = -I m4
 lib_LTLIBRARIES = libsharp.la
 
 libsharp_la_SOURCES = \
+  pocketfft/pocketfft.c \
+  pocketfft/pocketfft.h \
   libsharp/sharp_utils.c \
   libsharp/sharp_utils.h \
   libsharp/sharp.c \
@@ -57,7 +59,7 @@ EXTRA_DIST = \
 
 check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = test/sharp_testsuite.c test/memusage.c test/memusage.h
-sharp_testsuite_LDADD = libsharp.la @POCKETFFT_LIBS@
+sharp_testsuite_LDADD = libsharp.la -lm
 
 TESTS = runtest.sh
 
diff --git a/configure.ac b/configure.ac
index f636f83..b020181 100644
--- a/configure.ac
+++ b/configure.ac
@@ -24,14 +24,11 @@ AC_PROG_CC_C99
 AC_OPENMP
 AC_PROG_LIBTOOL
 
-PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"])
-
 tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
-AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS $POCKETFFT_CFLAGS"
+AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
 
-LIBS=$POCKETFFT_LIBS
 PACKAGE_LIBS="-lsharp"
 PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
 PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
diff --git a/fortran/sharp.f90 b/fortran/sharp.f90
new file mode 100644
index 0000000..d6b5eee
--- /dev/null
+++ b/fortran/sharp.f90
@@ -0,0 +1,242 @@
+module sharp
+  use iso_c_binding
+  implicit none
+  ! alm_info flags
+  integer, parameter :: SHARP_PACKED = 1
+
+  ! sharp job types
+  enum, bind(c)
+      enumerator :: SHARP_YtW = 0
+      enumerator :: SHARP_Y = 1
+      enumerator :: SHARP_Yt = 2
+      enumerator :: SHARP_WY = 3
+      enumerator :: SHARP_ALM2MAP_DERIV1 = 4
+   end enum
+
+  ! sharp job flags
+  integer, parameter :: SHARP_DP             = ISHFT(1, 4)
+  integer, parameter :: SHARP_ADD            = ISHFT(1, 5)
+  integer, parameter :: SHARP_REAL_HARMONICS = ISHFT(1, 6)
+  integer, parameter :: SHARP_NO_FFT         = ISHFT(1, 7)
+
+  type sharp_geom_info
+     type(c_ptr) :: handle
+     integer(c_intptr_t) :: n_local
+  end type sharp_geom_info
+
+  type sharp_alm_info
+     type(c_ptr) :: handle
+     integer(c_intptr_t) :: n_local
+  end type sharp_alm_info
+
+  interface
+
+     ! alm_info
+     subroutine sharp_make_general_alm_info( &
+         lmax, nm, stride, mval, mvstart, flags, alm_info) bind(c)
+       use iso_c_binding
+       integer(c_int), value, intent(in)    :: lmax, nm, stride, flags
+       integer(c_int), intent(in)           :: mval(nm)
+       integer(c_intptr_t), intent(in)     :: mvstart(nm)
+       type(c_ptr), intent(out)             :: alm_info
+     end subroutine sharp_make_general_alm_info
+
+     subroutine c_sharp_make_mmajor_real_packed_alm_info( &
+         lmax, stride, nm, ms, alm_info) bind(c, name='sharp_make_mmajor_real_packed_alm_info')
+       use iso_c_binding
+       integer(c_int), value, intent(in)    :: lmax, nm, stride
+       integer(c_int), intent(in), optional :: ms(nm)
+       type(c_ptr), intent(out)             :: alm_info
+     end subroutine c_sharp_make_mmajor_real_packed_alm_info
+
+     function c_sharp_alm_count(alm_info) bind(c, name='sharp_alm_count')
+       use iso_c_binding
+       integer(c_intptr_t)           :: c_sharp_alm_count
+       type(c_ptr), value, intent(in) :: alm_info
+     end function c_sharp_alm_count
+
+     subroutine c_sharp_destroy_alm_info(alm_info) bind(c, name='sharp_destroy_alm_info')
+       use iso_c_binding
+       type(c_ptr), value                   :: alm_info
+     end subroutine c_sharp_destroy_alm_info
+
+     ! geom_info
+     subroutine sharp_make_subset_healpix_geom_info ( &
+          nside, stride, nrings, rings, weight, geom_info) bind(c)
+       use iso_c_binding
+       integer(c_int), value, intent(in)    :: nside, stride, nrings
+       integer(c_int), intent(in), optional :: rings(nrings)
+       real(c_double), intent(in), optional :: weight(2 * nside)
+       type(c_ptr), intent(out)             :: geom_info
+     end subroutine sharp_make_subset_healpix_geom_info
+
+     subroutine c_sharp_destroy_geom_info(geom_info) bind(c, name='sharp_destroy_geom_info')
+       use iso_c_binding
+       type(c_ptr), value                   :: geom_info
+     end subroutine c_sharp_destroy_geom_info
+
+     function c_sharp_map_size(info) bind(c, name='sharp_map_size')
+       use iso_c_binding
+       integer(c_intptr_t) :: c_sharp_map_size
+       type(c_ptr), value   :: info
+     end function c_sharp_map_size
+
+
+     ! execute
+     subroutine c_sharp_execute(type, spin, alm, map, geom_info, alm_info, &
+                                flags, time, opcnt) bind(c, name='sharp_execute')
+       use iso_c_binding
+       integer(c_int), value                        :: type, spin, flags
+       type(c_ptr), value                           :: alm_info, geom_info
+       real(c_double), intent(out), optional        :: time
+       integer(c_long_long), intent(out), optional  :: opcnt
+       type(c_ptr), intent(in)                      :: alm(*), map(*)
+     end subroutine c_sharp_execute
+
+     subroutine c_sharp_execute_mpi(comm, type, spin, alm, map, geom_info, alm_info, &
+                                    flags, time, opcnt) bind(c, name='sharp_execute_mpi_fortran')
+       use iso_c_binding
+       integer(c_int), value                        :: comm, type, spin, flags
+       type(c_ptr), value                           :: alm_info, geom_info
+       real(c_double), intent(out), optional        :: time
+       integer(c_long_long), intent(out), optional  :: opcnt
+       type(c_ptr), intent(in)                      :: alm(*), map(*)
+     end subroutine c_sharp_execute_mpi
+  end interface
+
+  interface sharp_execute
+     module procedure sharp_execute_d
+  end interface
+
+contains
+  ! alm info
+
+  ! if ms is not passed, we default to using m=0..lmax.
+  subroutine sharp_make_mmajor_real_packed_alm_info(lmax, ms, alm_info)
+    use iso_c_binding
+    integer(c_int), value, intent(in)    :: lmax
+    integer(c_int), intent(in), optional :: ms(:)
+    type(sharp_alm_info), intent(out)    :: alm_info
+    !--
+    integer(c_int), allocatable          :: ms_copy(:)
+    integer(c_int)                       :: nm
+
+    if (present(ms)) then
+       nm = size(ms)
+       allocate(ms_copy(nm))
+       ms_copy = ms
+       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, nm, ms_copy, alm_info=alm_info%handle)
+       deallocate(ms_copy)
+    else
+       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, lmax + 1, alm_info=alm_info%handle)
+    end if
+    alm_info%n_local = c_sharp_alm_count(alm_info%handle)
+  end subroutine sharp_make_mmajor_real_packed_alm_info
+
+  subroutine sharp_destroy_alm_info(alm_info)
+    use iso_c_binding
+    type(sharp_alm_info), intent(inout) :: alm_info
+    call c_sharp_destroy_alm_info(alm_info%handle)
+    alm_info%handle = c_null_ptr
+  end subroutine sharp_destroy_alm_info
+
+
+  ! geom info
+  subroutine sharp_make_healpix_geom_info(nside, rings, weight, geom_info)
+    integer(c_int), value                :: nside
+    integer(c_int), optional             :: rings(:)
+    real(c_double), intent(in), optional :: weight(2 * nside)
+    type(sharp_geom_info), intent(out)   :: geom_info
+    !--
+    integer(c_int) :: nrings
+    integer(c_int), allocatable :: rings_copy(:)
+
+    if (present(rings)) then
+       nrings = size(rings)
+       allocate(rings_copy(nrings))
+       rings_copy = rings
+       call sharp_make_subset_healpix_geom_info(nside, 1, nrings, rings_copy, &
+                                                weight, geom_info%handle)
+       deallocate(rings_copy)
+    else
+       call sharp_make_subset_healpix_geom_info(nside, 1, nrings=4 * nside - 1, &
+                                                weight=weight, geom_info=geom_info%handle)
+    end if
+    geom_info%n_local = c_sharp_map_size(geom_info%handle)
+  end subroutine sharp_make_healpix_geom_info
+
+  subroutine sharp_destroy_geom_info(geom_info)
+    use iso_c_binding
+    type(sharp_geom_info), intent(inout) :: geom_info
+    call c_sharp_destroy_geom_info(geom_info%handle)
+    geom_info%handle = c_null_ptr
+  end subroutine sharp_destroy_geom_info
+
+
+  ! Currently the only mode supported is stacked (not interleaved) maps.
+  !
+  ! Note that passing the exact dimension of alm/map is necessary, it
+  ! prevents the caller from doing too crazy slicing prior to pass array
+  ! in...
+  !
+  ! Usage:
+  !
+  ! The alm array must have shape exactly alm(alm_info%n_local, nmaps)
+  ! The maps array must have shape exactly map(map_info%n_local, nmaps).
+  subroutine sharp_execute_d(type, spin, nmaps, alm, alm_info, map, geom_info, &
+                             add, time, opcnt, comm)
+    use iso_c_binding
+    use mpi
+    implicit none
+    integer(c_int), value                        :: type, spin, nmaps
+    integer(c_int), optional                     :: comm
+    logical, value, optional                     :: add  ! should add instead of replace out
+
+    type(sharp_alm_info)                         :: alm_info
+    type(sharp_geom_info)                        :: geom_info
+    real(c_double), intent(out), optional        :: time
+    integer(c_long_long), intent(out), optional  :: opcnt
+    real(c_double), target, intent(inout)        :: alm(0:alm_info%n_local - 1, 1:nmaps)
+    real(c_double), target, intent(inout)        :: map(0:geom_info%n_local - 1, 1:nmaps)
+    !--
+    integer(c_int)         :: mod_flags, ntrans, k
+    type(c_ptr), target    :: alm_ptr(nmaps)
+    type(c_ptr), target    :: map_ptr(nmaps)
+
+    mod_flags = SHARP_DP
+    if (present(add) .and. add) then
+       mod_flags = or(mod_flags, SHARP_ADD)
+    end if
+
+    if (spin == 0) then
+       ntrans = nmaps
+    else
+       ntrans = nmaps / 2
+    end if
+    if (ntrans/=1) print *, "ERROR: ntrans /= 1"
+
+    ! Set up pointer table to access maps
+    alm_ptr(:) = c_null_ptr
+    map_ptr(:) = c_null_ptr
+    do k = 1, nmaps
+       if (alm_info%n_local > 0) alm_ptr(k) = c_loc(alm(0, k))
+       if (geom_info%n_local > 0) map_ptr(k) = c_loc(map(0, k))
+    end do
+
+    if (present(comm)) then
+      call c_sharp_execute_mpi(comm, type, spin, alm_ptr, map_ptr, &
+          geom_info=geom_info%handle, &
+          alm_info=alm_info%handle, &
+          flags=mod_flags, &
+          time=time, &
+          opcnt=opcnt)
+    else
+      call c_sharp_execute(type, spin, alm_ptr, map_ptr, &
+          geom_info=geom_info%handle, &
+          alm_info=alm_info%handle, &
+          flags=mod_flags, &
+          time=time, &
+          opcnt=opcnt)
+   end if
+  end subroutine sharp_execute_d
+end module
diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90
new file mode 100644
index 0000000..78e6c16
--- /dev/null
+++ b/fortran/test_sharp.f90
@@ -0,0 +1,56 @@
+program test_sharp
+  use mpi
+  use sharp
+  use iso_c_binding, only : c_ptr, c_double
+  implicit none
+
+  integer, parameter :: lmax = 2, nside = 2
+  type(sharp_alm_info) :: alm_info
+  type(sharp_geom_info) :: geom_info
+
+  real(c_double), dimension(0:(lmax + 1)**2 - 1, 1:1) :: alm
+  real(c_double), dimension(0:12*nside**2 - 1, 1:1) :: map
+
+  integer(c_int), dimension(1:lmax + 1) :: ms
+  integer(c_int), dimension(1:4 * nside - 1) :: rings
+  integer(c_int) :: nm, m, nrings, iring
+  integer :: nodecount, rank, ierr
+
+  call MPI_Init(ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, nodecount, ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
+
+  nm = 0
+  do m = rank, lmax, nodecount
+     nm = nm + 1
+     ms(nm) = m
+  end do
+
+  nrings = 0
+  do iring = rank + 1, 4 * nside - 1, nodecount
+     nrings = nrings + 1
+     rings(nrings) = iring
+  end do
+
+  alm = 0
+  map = 0
+  if (rank == 0) then
+    alm(0, 1) = 1
+  end if
+
+  print *, ms(1:nm)
+  call sharp_make_mmajor_real_packed_alm_info(lmax, ms=ms(1:nm), alm_info=alm_info)
+  print *, 'alm_info%n_local', alm_info%n_local
+  call sharp_make_healpix_geom_info(nside, rings=rings(1:nrings), geom_info=geom_info)
+  print *, 'geom_info%n_local', geom_info%n_local
+  print *, 'execute'
+  call sharp_execute(SHARP_Y, 0, 1, alm, alm_info, map, geom_info, comm=MPI_COMM_WORLD)
+
+  print *, alm
+  print *, map
+
+  call sharp_destroy_alm_info(alm_info)
+  call sharp_destroy_geom_info(geom_info)
+  print *, 'DONE'
+  call MPI_Finalize(ierr)
+end program test_sharp
diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c
new file mode 100644
index 0000000..56c9dc5
--- /dev/null
+++ b/pocketfft/pocketfft.c
@@ -0,0 +1,2194 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*
+ *  Main implementation file.
+ *
+ *  Copyright (C) 2004-2019 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "pocketfft/pocketfft.h"
+
+#define RALLOC(type,num) \
+  ((type *)malloc((num)*sizeof(type)))
+#define DEALLOC(ptr) \
+  do { free(ptr); (ptr)=NULL; } while(0)
+
+#define SWAP(a,b,type) \
+  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
+
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
+#else
+#define NOINLINE
+#define WARN_UNUSED_RESULT
+#endif
+
+// adapted from https://stackoverflow.com/questions/42792939/
+// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
+static void my_sincosm1pi (double a, double *restrict res)
+  {
+  double s = a * a;
+  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
+  double r =     -1.0369917389758117e-4;
+  r = fma (r, s,  1.9294935641298806e-3);
+  r = fma (r, s, -2.5806887942825395e-2);
+  r = fma (r, s,  2.3533063028328211e-1);
+  r = fma (r, s, -1.3352627688538006e+0);
+  r = fma (r, s,  4.0587121264167623e+0);
+  r = fma (r, s, -4.9348022005446790e+0);
+  double c = r*s;
+  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
+  r =             4.6151442520157035e-4;
+  r = fma (r, s, -7.3700183130883555e-3);
+  r = fma (r, s,  8.2145868949323936e-2);
+  r = fma (r, s, -5.9926452893214921e-1);
+  r = fma (r, s,  2.5501640398732688e+0);
+  r = fma (r, s, -5.1677127800499516e+0);
+  s = s * a;
+  r = r * s;
+  s = fma (a, 3.1415926535897931e+0, r);
+  res[0] = c;
+  res[1] = s;
+  }
+
+NOINLINE static void calc_first_octant(size_t den, double * restrict res)
+  {
+  size_t n = (den+4)>>3;
+  if (n==0) return;
+  res[0]=1.; res[1]=0.;
+  if (n==1) return;
+  size_t l1=(size_t)sqrt(n);
+  for (size_t i=1; i<l1; ++i)
+    my_sincosm1pi((2.*i)/den,&res[2*i]);
+  size_t start=l1;
+  while(start<n)
+    {
+    double cs[2];
+    my_sincosm1pi((2.*start)/den,cs);
+    res[2*start] = cs[0]+1.;
+    res[2*start+1] = cs[1];
+    size_t end = l1;
+    if (start+end>n) end = n-start;
+    for (size_t i=1; i<end; ++i)
+      {
+      double csx[2]={res[2*i], res[2*i+1]};
+      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
+      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
+      }
+    start += l1;
+    }
+  for (size_t i=1; i<l1; ++i)
+    res[2*i] += 1.;
+  }
+
+NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
+  {
+  double * restrict p = res+n;
+  calc_first_octant(n<<1, p);
+  size_t ndone=(n+2)>>2;
+  size_t i=0, idx1=0, idx2=2*ndone-2;
+  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+    {
+    res[idx1]   = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    res[idx2]   = p[2*i+3];
+    res[idx2+1] = p[2*i+2];
+    }
+  if (i!=ndone)
+    {
+    res[idx1  ] = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    }
+  }
+
+NOINLINE static void calc_first_half(size_t n, double * restrict res)
+  {
+  int ndone=(n+1)>>1;
+  double * p = res+n-1;
+  calc_first_octant(n<<2, p);
+  int i4=0, in=n, i=0;
+  for (; i4<=in-i4; ++i, i4+=4) // octant 0
+    {
+    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
+    }
+  for (; i4-in <= 0; ++i, i4+=4) // octant 1
+    {
+    int xm = in-i4;
+    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
+    {
+    int xm = i4-in;
+    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i<ndone; ++i, i4+=4) // octant 3
+    {
+    int xm = 2*in-i4;
+    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
+    }
+  }
+
+NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
+  {
+  const double hsqt2 = 0.707106781186547524400844362104849;
+  size_t quart = n>>2;
+  if ((n&7)==0)
+    res[quart] = res[quart+1] = hsqt2;
+  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
+    {
+    res[j  ] = res[i+1];
+    res[j+1] = res[i  ];
+    }
+  }
+
+NOINLINE static void fill_first_half(size_t n, double * restrict res)
+  {
+  size_t half = n>>1;
+  if ((n&3)==0)
+    for (size_t i=0; i<half; i+=2)
+      {
+      res[i+half]   = -res[i+1];
+      res[i+half+1] =  res[i  ];
+      }
+  else
+    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
+      {
+      res[j  ] = -res[i  ];
+      res[j+1] =  res[i+1];
+      }
+  }
+
+NOINLINE static void fill_second_half(size_t n, double * restrict res)
+  {
+  if ((n&1)==0)
+    for (size_t i=0; i<n; ++i)
+      res[i+n] = -res[i];
+  else
+    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
+      {
+      res[j  ] =  res[i  ];
+      res[j+1] = -res[i+1];
+      }
+  }
+
+NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
+  {
+  if ((n&3)==0)
+    {
+    calc_first_octant(n, res);
+    fill_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else if ((n&1)==0)
+    {
+    calc_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else
+    calc_first_half(n, res);
+  }
+
+NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+  {
+  sincos_2pibyn_half(n, res);
+  fill_second_half(n, res);
+  }
+
+NOINLINE static size_t largest_prime_factor (size_t n)
+  {
+  size_t res=1;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { res=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while (((tmp=(n/x))*x)==n)
+    {
+    res=x;
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) res=n;
+
+  return res;
+  }
+
+NOINLINE static double cost_guess (size_t n)
+  {
+  const double lfp=1.1; // penalty for non-hardcoded larger factors
+  size_t ni=n;
+  double result=0.;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { result+=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while ((tmp=(n/x))*x==n)
+    {
+    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) result+=(n<=5) ? n : lfp*n;
+
+  return result*ni;
+  }
+
+/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
+NOINLINE static size_t good_size(size_t n)
+  {
+  if (n<=6) return n;
+
+  size_t bestfac=2*n;
+  for (size_t f2=1; f2<bestfac; f2*=2)
+    for (size_t f23=f2; f23<bestfac; f23*=3)
+      for (size_t f235=f23; f235<bestfac; f235*=5)
+        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
+          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
+            if (f235711>=n) bestfac=f235711;
+  return bestfac;
+  }
+
+typedef struct cmplx {
+  double r,i;
+} cmplx;
+
+#define NFCT 25
+typedef struct cfftp_fctdata
+  {
+  size_t fct;
+  cmplx *tw, *tws;
+  } cfftp_fctdata;
+
+typedef struct cfftp_plan_i
+  {
+  size_t length, nfct;
+  cmplx *mem;
+  cfftp_fctdata fct[NFCT];
+  } cfftp_plan_i;
+typedef struct cfftp_plan_i * cfftp_plan;
+
+#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
+#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
+#define SCALEC(a,b) { a.r*=b; a.i*=b; }
+#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
+/* a = b*c */
+#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+/* a = conj(b)*c*/
+#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+
+#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
+/* a = b*c */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
+/* a *= b */
+#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
+
+NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+#define PREP3(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2; \
+        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
+        CH(idx,k,0).r=t0.r+t1.r; \
+        CH(idx,k,0).i=t0.i+t1.i;
+#define PARTSTEP3a(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP3b(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3b(1,2,tw1r,tw1i)
+        }
+      }
+  }
+#define PARTSTEP3f(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3f(1,2,tw1r,tw1i)
+        }
+      }
+  }
+
+NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROT90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROTM90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+
+#define PREP5(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
+        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
+        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
+        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
+        CH(idx,k,0).i=t0.i+t1.i+t2.i;
+
+#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= 0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= 0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= -0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= -0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+
+#define PREP7(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
+        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
+        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
+        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
+
+#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
+        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
+        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
+        { \
+        cmplx da,db; \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=7;
+  const double tw1r= 0.623489801858733530525,
+               tw1i= sign * 0.7818314824680298087084,
+               tw2r= -0.222520933956314404289,
+               tw2i= sign * 0.9749279121818236070181,
+               tw3r= -0.9009688679024191262361,
+               tw3i= sign * 0.4338837391175581204758;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP7(i)
+        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+        }
+      }
+  }
+
+#define PREP11(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
+        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
+        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
+        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
+        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
+        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
+
+#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
+        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
+        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        { \
+        cmplx da,db; \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=11;
+  const double tw1r =        0.8412535328311811688618,
+               tw1i = sign * 0.5406408174555975821076,
+               tw2r =        0.4154150130018864255293,
+               tw2i = sign * 0.9096319953545183714117,
+               tw3r =       -0.1423148382732851404438,
+               tw3i = sign * 0.9898214418809327323761,
+               tw4r =       -0.6548607339452850640569,
+               tw4i = sign * 0.755749574354258283774,
+               tw5r =       -0.9594929736144973898904,
+               tw5i = sign * 0.2817325568414296977114;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP11(i)
+        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+        }
+      }
+  }
+
+#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CX2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
+  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
+  const cmplx * restrict csarr, const int sign)
+  {
+  const size_t cdim=ip;
+  size_t ipph = (ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  cmplx * restrict wal=RALLOC(cmplx,ip);
+  if (!wal) return -1;
+  wal[0]=(cmplx){1.,0.};
+  for (size_t i=1; i<ip; ++i)
+    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
+
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=0; i<ido; ++i)
+        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      {
+      cmplx tmp = CH(i,k,0);
+      for (size_t j=1; j<ipph; ++j)
+        ADDC(tmp,tmp,CH(i,k,j))
+      CX(i,k,0) = tmp;
+      }
+  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
+    {
+    // j=0
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
+      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
+      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
+      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
+      }
+
+    size_t iwal=2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-1; j+=2, jc-=2)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal2=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
+        }
+      }
+    for (; j<ipph; ++j, --jc)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
+        }
+      }
+    }
+  DEALLOC(wal);
+
+  // shuffling and twiddling
+  if (ido==1)
+    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
+        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
+        }
+  else
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
+        PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx x1, x2;
+          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
+          size_t idij=(j-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
+          idij=(jc-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
+          }
+        }
+    }
+  return 0;
+  }
+
+#undef CH2
+#undef CX2
+#undef CX
+
+NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
+  const int sign)
+  {
+  if (plan->length==1) return 0;
+  size_t len=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  cmplx *ch = RALLOC(cmplx, len);
+  if (!ch) return -1;
+  cmplx *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf; k1++)
+    {
+    size_t ip=plan->fct[k1].fct;
+    size_t l2=ip*l1;
+    size_t ido = len/l2;
+    if     (ip==4)
+      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==2)
+      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==3)
+      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==5)
+      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else
+      {
+      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
+        { DEALLOC(ch); return -1; }
+      SWAP(p1,p2,cmplx *);
+      }
+    SWAP(p1,p2,cmplx *);
+    l1=l2;
+    }
+  if (p1!=c)
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r = ch[i].r*fct;
+        c[i].i = ch[i].i*fct;
+        }
+    else
+      memcpy (c,p1,len*sizeof(cmplx));
+    }
+  else
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r *= fct;
+        c[i].i *= fct;
+        }
+  DEALLOC(ch);
+  return 0;
+  }
+
+#undef PMSIGNC
+#undef A_EQ_B_MUL_C
+#undef A_EQ_CB_MUL_C
+#undef MULPMSIGNC
+#undef MULPMSIGNCEQ
+
+#undef WA
+#undef CC
+#undef CH
+#undef ROT90
+#undef SCALEC
+#undef ADDC
+#undef PMC
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_forward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, -1); }
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_backward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, 1); }
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_factorize (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1) plan->fct[nfct++].fct=length;
+  plan->nfct=nfct;
+  return 0;
+  }
+
+NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>11)
+      twsize+=ip;
+    l1*=ip;
+    }
+  return twsize;
+  }
+
+NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn(length, twid);
+  size_t l1=1;
+  size_t memofs=0;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
+    plan->fct[k].tw=plan->mem+memofs;
+    memofs+=(ip-1)*(ido-1);
+    for (size_t j=1; j<ip; ++j)
+      for (size_t i=1; i<ido; ++i)
+        {
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
+        }
+    if (ip>11)
+      {
+      plan->fct[k].tws=plan->mem+memofs;
+      memofs+=ip;
+      for (size_t j=0; j<ip; ++j)
+        {
+        plan->fct[k].tws[j].r = twid[2*j*l1*ido];
+        plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+static cfftp_plan make_cfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(cfftp_fctdata){0,0,0};
+  plan->mem=0;
+  if (length==1) return plan;
+  if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=cfftp_twsize(plan);
+  plan->mem=RALLOC(cmplx,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (cfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+static void destroy_cfftp_plan (cfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+typedef struct rfftp_fctdata
+  {
+  size_t fct;
+  double *tw, *tws;
+  } rfftp_fctdata;
+
+typedef struct rfftp_plan_i
+  {
+  size_t length, nfct;
+  double *mem;
+  rfftp_fctdata fct[NFCT];
+  } rfftp_plan_i;
+typedef struct rfftp_plan_i * rfftp_plan;
+
+#define WA(x,i) wa[(i)+(x)*(ido-1)]
+#define PM(a,b,c,d) { a=c+d; b=c-d; }
+/* (a+ib) = conj(c+id) * (e+if) */
+#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
+
+#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
+
+NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(    0,1,k) = -CC(ido-1,k,1);
+      CH(ido-1,0,k) =  CC(ido-1,k,0);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, ti2;
+      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
+      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
+      }
+  }
+
+NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2=CC(0,k,1)+CC(0,k,2);
+    CH(0,0,k) = CC(0,k,0)+cr2;
+    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
+    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double di2, di3, dr2, dr3;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
+      double cr2=dr2+dr3; // c add
+      double ci2=di2+di3;
+      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
+      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
+      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
+      double ti2 = CC(i  ,k,0)+taur*ci2;
+      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
+      double ti3 = taui*(dr3-dr2);
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
+      }
+  }
+
+NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double hsqt2=0.70710678118654752440;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1,tr2;
+    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
+    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
+    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
+      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
+      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
+      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      PM(tr1,tr4,cr4,cr2)
+      PM(ti1,ti4,ci2,ci4)
+      PM(tr2,tr3,CC(i-1,k,0),cr3)
+      PM(ti2,ti3,CC(i  ,k,0),ci3)
+      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
+      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
+      }
+  }
+
+NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2, cr3, ci4, ci5;
+    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
+    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
+    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
+    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
+    CH(0,2,k)=ti11*ci5+ti12*ci4;
+    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
+    CH(0,4,k)=ti12*ci5-ti11*ci4;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
+         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+      size_t ic=ido-i;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
+      PM(cr2,ci5,dr5,dr2)
+      PM(ci2,cr5,di2,di5)
+      PM(cr3,ci4,dr4,dr3)
+      PM(ci3,cr4,di3,di4)
+      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
+      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
+      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
+      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
+      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
+      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
+      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
+      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
+      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
+      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
+      }
+  }
+
+#undef CC
+#undef CH
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  if (ido>1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
+      {
+      size_t is=(j-1)*(ido-1),
+             is2=(jc-1)*(ido-1);
+      for (size_t k=0; k<l1; ++k)                            // 113
+        {
+        size_t idij=is;
+        size_t idij2=is2;
+        for (size_t i=1; i<=ido-2; i+=2)                      // 112
+          {
+          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
+                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
+          double x1=wa[idij]*t1 + wa[idij+1]*t2,
+                 x2=wa[idij]*t2 - wa[idij+1]*t1,
+                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
+                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
+          C1(i  ,k,j ) = x1+x3;
+          C1(i  ,k,jc) = x2-x4;
+          C1(i+1,k,j ) = x2+x4;
+          C1(i+1,k,jc) = x3-x1;
+          idij+=2;
+          idij2+=2;
+          }
+        }
+      }
+    }
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
+    for (size_t k=0; k<l1; ++k)                              // 122
+      {
+      double t1=C1(0,k,j), t2=C1(0,k,jc);
+      C1(0,k,j ) = t1+t2;
+      C1(0,k,jc) = t2-t1;
+      }
+
+//everything in C
+//memset(ch,0,ip*l1*ido*sizeof(double));
+
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
+    {
+    for (size_t ik=0; ik<idl1; ++ik)                         // 124
+      {
+      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
+      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
+      }
+    size_t iang = 2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-3; j+=4,jc-=4)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
+                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
+                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
+        }
+      }
+    for (; j<ipph-1; j+=2,jc-=2)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
+        }
+      }
+    for (; j<ipph; ++j,--jc)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar=csarr[2*iang], ai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar*C2(ik,j );
+        CH2(ik,lc) += ai*C2(ik,jc);
+        }
+      }
+    }
+  for (size_t ik=0; ik<idl1; ++ik)                         // 101
+    CH2(ik,0) = C2(ik,0);
+  for (size_t j=1; j<ipph; ++j)                              // 129
+    for (size_t ik=0; ik<idl1; ++ik)                         // 128
+      CH2(ik,0) += C2(ik,j);
+
+// everything in CH at this point!
+//memset(cc,0,ip*l1*ido*sizeof(double));
+
+  for (size_t k=0; k<l1; ++k)                                // 131
+    for (size_t i=0; i<ido; ++i)                             // 130
+      CC(i,0,k) = CH(i,k,0);
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)                              // 136
+      {
+      CC(ido-1,j2,k) = CH(0,k,j);
+      CC(0,j2+1,k) = CH(0,k,jc);
+      }
+    }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
+    {
+    size_t j2=2*j-1;
+    for(size_t k=0; k<l1; ++k)                               // 139
+      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
+        {
+        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
+        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
+        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
+        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
+        }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CH
+#undef CC
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+
+NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
+      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ti2, tr2;
+      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
+      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
+      }
+  }
+
+NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr2=2.*CC(ido-1,1,k);
+    double cr2=CC(0,0,k)+taur*tr2;
+    CH(0,k,0)=CC(0,0,k)+tr2;
+    double ci3=2.*taui*CC(0,2,k);
+    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
+      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
+      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
+      double ci2=CC(i  ,0,k)+taur*ti2;
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
+      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
+      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
+      double di2, di3, dr2, dr3;
+      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
+      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      }
+  }
+
+NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double sqrt2=1.41421356237309504880;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1, tr2;
+    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
+    double tr3=2.*CC(ido-1,1,k);
+    double tr4=2.*CC(0,2,k);
+    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
+    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double tr1,tr2,ti1,ti2;
+      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
+      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
+      CH(ido-1,k,0)=tr2+tr2;
+      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
+      CH(ido-1,k,2)=ti2+ti2;
+      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      size_t ic=ido-i;
+      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
+      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
+      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
+      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
+      PM (CH(i-1,k,0),cr3,tr2,tr3)
+      PM (CH(i  ,k,0),ci3,ti2,ti3)
+      PM (cr4,cr2,tr1,tr4)
+      PM (ci2,ci4,ti1,ti4)
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
+      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
+      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
+      }
+  }
+
+NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double ti5=CC(0,2,k)+CC(0,2,k);
+    double ti4=CC(0,4,k)+CC(0,4,k);
+    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
+    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
+    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
+    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
+    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
+    double ci4, ci5;
+    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
+    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
+      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
+      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
+      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
+      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
+      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
+      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
+      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
+      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
+      double ci4, ci5, cr5, cr4;
+      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
+      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
+      PM(dr4,dr3,cr3,ci4)
+      PM(di3,di4,ci3,cr4)
+      PM(dr5,dr2,cr2,ci5)
+      PM(di2,di5,ci2,cr5)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
+      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
+      }
+  }
+
+#undef CC
+#undef CH
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/ 2;
+  size_t idl1 = ido*l1;
+
+  for (size_t k=0; k<l1; ++k)        // 102
+    for (size_t i=0; i<ido; ++i)     // 101
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = 2*CC(ido-1,j2,k);
+      CH(0,k,jc) = 2*CC(0,j2+1,k);
+      }
+    }
+
+  if (ido!=1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
+      {
+      size_t j2=2*j-1;
+      for (size_t k=0; k<l1; ++k)
+        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
+          {
+          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
+          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
+          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
+          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
+          }
+      }
+    }
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
+    {
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
+      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
+      }
+    size_t iang=2*l;
+    size_t j=3,jc=ip-3;
+    for(; j<ipph-3; j+=4,jc-=4)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
+                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
+                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
+        }
+      }
+    for(; j<ipph-1; j+=2,jc-=2)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
+        }
+      }
+    for(; j<ipph; ++j,--jc)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double war=csarr[2*iang], wai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += war*CH2(ik,j );
+        C2(ik,lc) += wai*CH2(ik,jc);
+        }
+      }
+    }
+  for (size_t j=1; j<ipph; ++j)
+    for (size_t ik=0; ik<idl1; ++ik)
+      CH2(ik,0) += CH2(ik,j);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
+      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
+      }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
+        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
+        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
+        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
+        }
+
+// All in CH
+
+  for (size_t j=1; j<ip; ++j)
+    {
+    size_t is = (j-1)*(ido-1);
+    for (size_t k=0; k<l1; ++k)
+      {
+      size_t idij = is;
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        double t1=CH(i,k,j), t2=CH(i+1,k,j);
+        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
+        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
+        idij+=2;
+        }
+      }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CC
+#undef CH
+#undef PM
+#undef MULPM
+#undef WA
+
+static void copy_and_norm(double *c, double *p1, size_t n, double fct)
+  {
+  if (p1!=c)
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<n; ++i)
+        c[i] = fct*p1[i];
+    else
+      memcpy (c,p1,n*sizeof(double));
+    }
+  else
+    if (fct!=1.)
+      for (size_t i=0; i<n; ++i)
+        c[i] *= fct;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_forward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=n, nf=plan->nfct;
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf;++k1)
+    {
+    size_t k=nf-k1-1;
+    size_t ip=plan->fct[k].fct;
+    size_t ido=n / l1;
+    l1 /= ip;
+    if(ip==4)
+      radf4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radf2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radf3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radf5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      {
+      radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+      SWAP (p1,p2,double *);
+      }
+    SWAP (p1,p2,double *);
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_backward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k=0; k<nf; k++)
+    {
+    size_t ip = plan->fct[k].fct,
+           ido= n/(ip*l1);
+    if(ip==4)
+      radb4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radb2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radb3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radb5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+    SWAP (p1,p2,double *);
+    l1*=ip;
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_factorize (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1) plan->fct[nfct++].fct=length;
+  plan->nfct=nfct;
+  return 0;
+  }
+
+static size_t rfftp_twsize(rfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>5) twsize+=2*ip;
+    l1*=ip;
+    }
+  return twsize;
+  return 0;
+  }
+
+WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn_half(length, twid);
+  size_t l1=1;
+  double *ptr=plan->mem;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
+    if (k<plan->nfct-1) // last factor doesn't need twiddles
+      {
+      plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
+      for (size_t j=1; j<ip; ++j)
+        for (size_t i=1; i<=(ido-1)/2; ++i)
+          {
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+          }
+      }
+    if (ip>5) // special factors required by *g functions
+      {
+      plan->fct[k].tws=ptr; ptr+=2*ip;
+      plan->fct[k].tws[0] = 1.;
+      plan->fct[k].tws[1] = 0.;
+      for (size_t i=1; i<=(ip>>1); ++i)
+        {
+        plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
+        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  plan->mem=NULL;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(rfftp_fctdata){0,0,0};
+  if (length==1) return plan;
+  if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=rfftp_twsize(plan);
+  plan->mem=RALLOC(double,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (rfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+typedef struct fftblue_plan_i
+  {
+  size_t n, n2;
+  cfftp_plan plan;
+  double *mem;
+  double *bk, *bkf;
+  } fftblue_plan_i;
+typedef struct fftblue_plan_i * fftblue_plan;
+
+NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
+  {
+  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
+  if (!plan) return NULL;
+  plan->n = length;
+  plan->n2 = good_size(plan->n*2-1);
+  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  plan->bk  = plan->mem;
+  plan->bkf = plan->bk+2*plan->n;
+
+/* initialize b_k */
+  double *tmp = RALLOC(double,4*plan->n);
+  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  sincos_2pibyn(2*plan->n,tmp);
+  plan->bk[0] = 1;
+  plan->bk[1] = 0;
+
+  size_t coeff=0;
+  for (size_t m=1; m<plan->n; ++m)
+    {
+    coeff+=2*m-1;
+    if (coeff>=2*plan->n) coeff-=2*plan->n;
+    plan->bk[2*m  ] = tmp[2*coeff  ];
+    plan->bk[2*m+1] = tmp[2*coeff+1];
+    }
+
+  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+  double xn2 = 1./plan->n2;
+  plan->bkf[0] = plan->bk[0]*xn2;
+  plan->bkf[1] = plan->bk[1]*xn2;
+  for (size_t m=2; m<2*plan->n; m+=2)
+    {
+    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
+    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
+    }
+  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
+    plan->bkf[m]=0.;
+  plan->plan=make_cfftp_plan(plan->n2);
+  if (!plan->plan)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  DEALLOC(tmp);
+
+  return plan;
+  }
+
+NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
+  {
+  DEALLOC(plan->mem);
+  destroy_cfftp_plan(plan->plan);
+  DEALLOC(plan);
+  }
+
+NOINLINE WARN_UNUSED_RESULT
+static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
+  {
+  size_t n=plan->n;
+  size_t n2=plan->n2;
+  double *bk  = plan->bk;
+  double *bkf = plan->bkf;
+  double *akf = RALLOC(double, 2*n2);
+  if (!akf) return -1;
+
+/* initialize a_k and FFT it */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
+      akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
+      akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  for (size_t m=2*n; m<2*n2; ++m)
+    akf[m]=0;
+
+  if (cfftp_forward (plan->plan,akf,fct)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* do the convolution */
+  if (isign>0)
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+  else
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+
+/* inverse FFT */
+  if (cfftp_backward (plan->plan,akf,1.)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* multiply by b_k */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
+      c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
+      c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  DEALLOC(akf);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,1,fct); }
+
+WARN_UNUSED_RESULT
+static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,-1,fct); }
+
+WARN_UNUSED_RESULT
+static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  tmp[0]=c[0];
+  tmp[1]=0.;
+  memcpy (tmp+2,c+1, (n-1)*sizeof(double));
+  if ((n&1)==0) tmp[n+1]=0.;
+  for (size_t m=2; m<n; m+=2)
+    {
+    tmp[2*n-m]=tmp[m];
+    tmp[2*n-m+1]=-tmp[m+1];
+    }
+  if (fftblue_fft(plan,tmp,1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  for (size_t m=0; m<n; ++m)
+    c[m] = tmp[2*m];
+  DEALLOC(tmp);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  for (size_t m=0; m<n; ++m)
+    {
+    tmp[2*m] = c[m];
+    tmp[2*m+1] = 0.;
+    }
+  if (fftblue_fft(plan,tmp,-1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  c[0] = tmp[0];
+  memcpy (c+1, tmp+2, (n-1)*sizeof(double));
+  DEALLOC(tmp);
+  return 0;
+  }
+
+typedef struct pocketfft_plan_c_i
+  {
+  cfftp_plan packplan;
+  fftblue_plan blueplan;
+  } pocketfft_plan_c_i;
+
+pocketfft_plan_c pocketfft_make_plan_c (size_t length)
+  {
+  if (length==0) return NULL;
+  pocketfft_plan_c plan = RALLOC(pocketfft_plan_c_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+void pocketfft_delete_plan_c (pocketfft_plan_c plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_cfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT
+int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_backward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT
+int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_forward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_forward(plan->blueplan,c,fct);
+  }
+
+typedef struct pocketfft_plan_r_i
+  {
+  rfftp_plan packplan;
+  fftblue_plan blueplan;
+  } pocketfft_plan_r_i;
+
+pocketfft_plan_r pocketfft_make_plan_r (size_t length)
+  {
+  if (length==0) return NULL;
+  pocketfft_plan_r plan = RALLOC(pocketfft_plan_r_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = 0.5*cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+void pocketfft_delete_plan_r (pocketfft_plan_r plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_rfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+size_t pocketfft_length_r(pocketfft_plan_r plan)
+  {
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+size_t pocketfft_length_c(pocketfft_plan_c plan)
+  {
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+WARN_UNUSED_RESULT
+int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_backward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT
+int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_forward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_forward(plan->blueplan,c,fct);
+  }
diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h
new file mode 100644
index 0000000..b1a02ee
--- /dev/null
+++ b/pocketfft/pocketfft.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*! \file pocketfft.h
+ *  Public interface of the pocketfft library
+ *
+ *  Copyright (C) 2008-2019 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef POCKETFFT_H
+#define POCKETFFT_H
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pocketfft_plan_c_i;
+typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
+pocketfft_plan_c pocketfft_make_plan_c (size_t length);
+void pocketfft_delete_plan_c (pocketfft_plan_c plan);
+int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
+int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
+size_t pocketfft_length_c(pocketfft_plan_c plan);
+
+struct pocketfft_plan_r_i;
+typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
+pocketfft_plan_r pocketfft_make_plan_r (size_t length);
+void pocketfft_delete_plan_r (pocketfft_plan_r plan);
+/*! Computes a real backward FFT on \a c, using \a plan
+    and assuming the FFTPACK storage scheme:
+    - on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
+    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
+int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
+/*! Computes a real forward FFT on \a c, using \a plan
+    and assuming the FFTPACK storage scheme:
+    - on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
+    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
+int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
+size_t pocketfft_length_r(pocketfft_plan_r plan);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

From f42a6f70989e340fce5339f178ca1da30df8108e Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 7 May 2019 14:09:31 +0200
Subject: [PATCH 12/17] update docs, include Fortran files in distribution

---
 COMPILE     | 22 ++++++++++++----------
 Makefile.am |  2 +-
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/COMPILE b/COMPILE
index b63ae7f..a211e04 100644
--- a/COMPILE
+++ b/COMPILE
@@ -44,9 +44,11 @@ resulting binary will most likely not run on other computers, though.
 OpenMP
 ------
 
-OpenMP should be switched on for maximum performance, and at runtime
-OMP_NUM_THREADS should be set to the number of hardware threads (not physical
-cores) of the system.
+OpenMP is enabled by default if the selected compiler supports it.
+It can be disabled at configuration time by specifying "--disable-openmp" at the
+configure command line.
+At runtime OMP_NUM_THREADS should be set to the number of hardware threads
+(not physical cores) of the system.
 (Usually this is  already the default setting when OMP_NUM_THREADS is not
 specified.)
 
@@ -65,19 +67,19 @@ Example configure invocations
 =============================
 
 GCC, OpenMP, portable binary:
-CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
-
-GCC, no OpenMP, portable binary:
 CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
 
+GCC, no OpenMP, portable binary:
+CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure --disable-openmp
+
 Clang, OpenMP, portable binary:
-CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
+CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
 
 Intel C compiler, OpenMP, nonportable binary:
-CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
+CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -D__PURE_INTEL_C99_HEADERS__" ./configure
 
-MPI support, nonportable binary:
-CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
+MPI support, OpenMP, portable binary:
+CC=mpicc CFLAGS="-DUSE_MPI -DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
 
 Additional GCC flags for pedantic warning and debugging:
 
diff --git a/Makefile.am b/Makefile.am
index b172233..f7b33cb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -55,7 +55,7 @@ nobase_include_HEADERS = \
   libsharp/sharp_cxx.h
 
 EXTRA_DIST = \
-  runtest.sh
+  runtest.sh fortran/sharp.f90 fortran/test_sharp.f90
 
 check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = test/sharp_testsuite.c test/memusage.c test/memusage.h

From 1448466337e61acc5d61d73dbc18acfd103a45cb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 7 May 2019 14:10:44 +0200
Subject: [PATCH 13/17] cosmetics

---
 COMPILE | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/COMPILE b/COMPILE
index a211e04..a898662 100644
--- a/COMPILE
+++ b/COMPILE
@@ -15,10 +15,10 @@ flags.
 Fast math
 ---------
 
-Specifying "-ffast-math" or "-ffp-contract=fast" is important for all compilers, since it allows the
-compiler to fuse multiplications and additions into FMA instructions, which is
-forbidden by the C99 standard. Since FMAs are a central aspect of the algorithm,
-they are needed for optimum performance.
+Specifying "-ffast-math" or "-ffp-contract=fast" is important for all compilers,
+since it allows the compiler to fuse multiplications and additions into FMA
+instructions, which is forbidden by the C99 standard. Since FMAs are a central
+aspect of the algorithm, they are needed for optimum performance.
 
 If you are calling libsharp from other code which requires strict adherence
 to the C99 standard, you should still be able to compile libsharp with

From 5e2fe415ec23c045a1788ff4c13e6f20d15b3889 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 31 May 2019 22:14:03 +0200
Subject: [PATCH 14/17] add MPI-specific files

---
 Makefile.am | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index f7b33cb..8b7d243 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,12 +50,13 @@ endif
 
 nobase_include_HEADERS = \
   libsharp/sharp.h \
+  libsharp/sharp_mpi.h \
   libsharp/sharp_geomhelpers.h \
   libsharp/sharp_almhelpers.h \
   libsharp/sharp_cxx.h
 
 EXTRA_DIST = \
-  runtest.sh fortran/sharp.f90 fortran/test_sharp.f90
+  runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp/sharp_mpi.c
 
 check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = test/sharp_testsuite.c test/memusage.c test/memusage.h

From 9c5291b4e83bcf11e60e8584ab27a800084765bb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Jun 2019 13:42:26 +0200
Subject: [PATCH 15/17] add experimental Python interface

---
 python/pysharp.cc | 208 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 python/pysharp.cc

diff --git a/python/pysharp.cc b/python/pysharp.cc
new file mode 100644
index 0000000..1b1d28e
--- /dev/null
+++ b/python/pysharp.cc
@@ -0,0 +1,208 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *  For more information about HEALPix, see http://healpix.sourceforge.net
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ */
+
+/*
+ *  Copyright (C) 2017 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <iostream>
+#include <vector>
+#include <complex>
+#include <string>
+
+#include "libsharp/sharp_cxx.h"
+#include "libsharp/sharp_legendre_roots.h"
+
+using namespace std;
+
+namespace py = pybind11;
+
+namespace {
+
+template<typename T> using pyarr = py::array_t<T>;
+template<typename T> using pyarr_c
+  = py::array_t<T, py::array::c_style | py::array::forcecast>;
+
+using a_d = py::array_t<double>;
+using a_c = py::array_t<complex<double>>;
+using a_i = py::array_t<int64_t>;
+using a_d_c = py::array_t<double, py::array::c_style | py::array::forcecast>;
+using a_c_c = py::array_t<complex<double>,
+  py::array::c_style | py::array::forcecast>;
+
+void myassert(bool cond, const char *msg)
+  { if (!cond) throw runtime_error(msg); }
+
+template<typename T> class py_sharpjob
+  {
+  private:
+    sharp_cxxjob<T> job;
+    int64_t lmax_, mmax_, npix_;
+
+  public:
+    py_sharpjob () : lmax_(0), mmax_(0), npix_(0) {}
+
+    void set_Gauss_geometry(int64_t nrings, int64_t nphi)
+      {
+      myassert((nrings>0)&&(nphi>0),"bad grid dimensions");
+      npix_=nrings*nphi;
+      job.set_Gauss_geometry(nrings, nphi);
+      }
+    void set_Healpix_geometry(int64_t nside)
+      {
+      myassert(nside>0,"bad Nside value");
+      npix_=12*nside*nside;
+      job.set_Healpix_geometry(nside);
+      }
+    void set_triangular_alm_info (int64_t lmax, int64_t mmax)
+      {
+      myassert(mmax>=0,"negative mmax");
+      myassert(mmax<=lmax,"mmax must not be larger than lmax");
+      lmax_=lmax; mmax_=mmax;
+      job.set_triangular_alm_info (lmax,mmax);
+      }
+
+    int64_t n_alm() const
+      { return ((mmax_+1)*(mmax_+2))/2 + (mmax_+1)*(lmax_-mmax_); }
+
+    a_d_c alm2map (const a_c_c &alm) const
+      {
+      myassert(npix_>0,"no map geometry specified");
+      myassert (alm.size()==n_alm(),
+        "incorrect size of a_lm array");
+      a_d_c map(npix_);
+      auto mr=map.mutable_unchecked<1>();
+      auto ar=alm.unchecked<1>();
+      job.alm2map(&ar[0],&mr[0],false);
+      return map;
+      }
+    a_c_c alm2map_adjoint (const a_d_c &map) const
+      {
+      myassert(npix_>0,"no map geometry specified");
+      myassert (map.size()==npix_,"incorrect size of map array");
+      a_c_c alm(n_alm());
+      auto mr=map.unchecked<1>();
+      auto ar=alm.mutable_unchecked<1>();
+      job.alm2map_adjoint(&mr[0],&ar[0],false);
+      return alm;
+      }
+    a_c_c map2alm (const a_d_c &map) const
+      {
+      myassert(npix_>0,"no map geometry specified");
+      myassert (map.size()==npix_,"incorrect size of map array");
+      a_c_c alm(n_alm());
+      auto mr=map.unchecked<1>();
+      auto ar=alm.mutable_unchecked<1>();
+      job.map2alm(&mr[0],&ar[0],false);
+      return alm;
+      }
+    a_d_c alm2map_spin (const a_c_c &alm, int64_t spin) const
+      {
+      myassert(npix_>0,"no map geometry specified");
+      auto ar=alm.unchecked<2>();
+      myassert((ar.shape(0)==2)&&(ar.shape(1)==n_alm()),
+        "incorrect size of a_lm array");
+      a_d_c map(vector<size_t>{2,size_t(npix_)});
+      auto mr=map.mutable_unchecked<2>();
+      job.alm2map_spin(&ar(0,0),&ar(1,0),&mr(0,0),&mr(1,0),spin,false);
+      return map;
+      }
+    a_c_c map2alm_spin (const a_d_c &map, int64_t spin) const
+      {
+      myassert(npix_>0,"no map geometry specified");
+      auto mr=map.unchecked<2>();
+      myassert ((mr.shape(0)==2)&&(mr.shape(1)==npix_),
+        "incorrect size of map array");
+      a_c_c alm(vector<size_t>{2,size_t(n_alm())});
+      auto ar=alm.mutable_unchecked<2>();
+      job.map2alm_spin(&mr(0,0),&mr(1,0),&ar(0,0),&ar(1,0),spin,false);
+      return alm;
+      }
+  };
+
+a_d_c GL_weights(int64_t nlat, int64_t nlon)
+  {
+  constexpr double twopi=6.283185307179586476925286766559005768394;
+  a_d_c res(nlat);
+  auto rr=res.mutable_unchecked<1>();
+  vector<double> dummy_roots(nlat);
+  sharp_legendre_roots(nlat, dummy_roots.data(), &rr[0]);
+  for (size_t i=0; i<size_t(rr.shape(0)); ++i)
+    rr[i]*=twopi/nlon;
+  return res;
+  }
+
+a_d_c GL_thetas(int64_t nlat)
+  {
+  a_d_c res(nlat);
+  auto rr=res.mutable_unchecked<1>();
+  vector<double> dummy_weights(nlat);
+  sharp_legendre_roots(nlat, &rr[0], dummy_weights.data());
+  for (size_t i=0; i<size_t(rr.shape(0)); ++i)
+    rr[i]=acos(-rr[i]);
+  return res;
+  }
+
+
+const char *pysharp_DS = R"""(
+Python interface for libsharp
+
+All angles are interpreted as radians.
+The theta coordinate is measured as co-latitude, ranging from 0 (North Pole)
+to pi (South Pole).
+
+Error conditions are reported by raising exceptions.
+)""";
+
+
+} // unnamed namespace
+
+PYBIND11_MODULE(pysharp, m)
+  {
+  using namespace pybind11::literals;
+
+  m.doc() = pysharp_DS;
+
+  py::class_<py_sharpjob<double>> (m, "sharpjob_d")
+    .def(py::init<>())
+    .def("set_Gauss_geometry", &py_sharpjob<double>::set_Gauss_geometry,
+      "nrings"_a,"nphi"_a)
+    .def("set_Healpix_geometry", &py_sharpjob<double>::set_Healpix_geometry,
+      "nside"_a)
+    .def("set_triangular_alm_info",
+      &py_sharpjob<double>::set_triangular_alm_info, "lmax"_a, "mmax"_a)
+    .def("n_alm", &py_sharpjob<double>::n_alm)
+    .def("alm2map", &py_sharpjob<double>::alm2map,"alm"_a)
+    .def("alm2map_adjoint", &py_sharpjob<double>::alm2map_adjoint,"map"_a)
+    .def("map2alm", &py_sharpjob<double>::map2alm,"map"_a)
+    .def("alm2map_spin", &py_sharpjob<double>::alm2map_spin,"alm"_a,"spin"_a)
+    .def("map2alm_spin", &py_sharpjob<double>::map2alm_spin,"map"_a,"spin"_a)
+    ;
+
+  m.def("GL_weights",&GL_weights, "nlat"_a, "nlon"_a);
+  m.def("GL_thetas",&GL_thetas, "nlat"_a);
+  }

From d4f4eb41fd67ea85ecc7ea169b5f736216e69df4 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Jun 2019 14:00:16 +0200
Subject: [PATCH 16/17] add simple setup.py

---
 python/setup.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 python/setup.py

diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..c4fc0fc
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,76 @@
+from setuptools import setup, Extension, Distribution
+import setuptools.command.build_ext
+
+import sys
+import sysconfig
+import distutils.sysconfig
+
+
+# FIXME this has to be set from outside!
+sharp_libpath='/home/martin/codes/sharp2/.libs'
+
+class _deferred_pybind11_include(object):
+    def __init__(self, user=False):
+        self.user = user
+
+    def __str__(self):
+        import pybind11
+        return pybind11.get_include(self.user)
+
+
+def _remove_strict_prototype_option_from_distutils_config():
+    strict_prototypes = '-Wstrict-prototypes'
+    config = distutils.sysconfig.get_config_vars()
+    for key, value in config.items():
+        if strict_prototypes in str(value):
+            config[key] = config[key].replace(strict_prototypes, '')
+
+
+_remove_strict_prototype_option_from_distutils_config()
+
+
+extra_cc_compile_args = []
+include_dirs = ['../',
+                _deferred_pybind11_include(),
+                _deferred_pybind11_include(True)]
+
+python_module_link_args = []
+
+if sys.platform == 'darwin':
+    extra_cc_compile_args.append('--std=c++11')
+    extra_cc_compile_args.append('--stdlib=libc++')
+    extra_cc_compile_args.append('-mmacosx-version-min=10.9')
+
+    vars = distutils.sysconfig.get_config_vars()
+    vars['LDSHARED'] = vars['LDSHARED'].replace('-bundle', '')
+    python_module_link_args.append('-bundle')
+    builder = setuptools.command.build_ext.build_ext(Distribution())
+    full_name = builder.get_ext_filename('libsharp')
+else:
+    extra_cc_compile_args += ['-fopenmp', '-march=native', '-O3', '-ffast-math']
+    python_module_link_args += ['-fopenmp', '-march=native']
+    extra_cc_compile_args.append('--std=c++11')
+    python_module_link_args.append("-Wl,-rpath,$ORIGIN")
+
+
+def get_extension_modules():
+    return [Extension('pysharp',
+                      sources=['pysharp.cc'],
+                      include_dirs=include_dirs,
+                      extra_compile_args=extra_cc_compile_args,
+                      libraries=["sharp"],
+                      library_dirs=[sharp_libpath],
+                      extra_link_args=python_module_link_args)]
+
+
+setup(name='pysharp',
+      version='0.0.1',
+      description='Python bindings for libsharp',
+      include_package_data=True,
+      author='Martin Reinecke',
+      author_email='martin@mpa-garching.mpg.de',
+      packages=[],
+      setup_requires=['numpy>=1.10.4', 'pybind11>=2.2.1'],
+      ext_modules=get_extension_modules(),
+      install_requires=['numpy>=1.10.4', 'pybind11>=2.2.1']
+      )

From ebc3f6c92bf959a7334f0d23a14766cb56f67274 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 13 Jun 2019 14:13:06 +0200
Subject: [PATCH 17/17] simplify setup.py

---
 python/setup.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/python/setup.py b/python/setup.py
index c4fc0fc..153af06 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -37,20 +37,14 @@ include_dirs = ['../',
 python_module_link_args = []
 
 if sys.platform == 'darwin':
-    extra_cc_compile_args.append('--std=c++11')
-    extra_cc_compile_args.append('--stdlib=libc++')
-    extra_cc_compile_args.append('-mmacosx-version-min=10.9')
-
+    extra_cc_compile_args += ['--std=c++11', '--stdlib=libc++',
+                              '-mmacosx-version-min=10.9']
     vars = distutils.sysconfig.get_config_vars()
     vars['LDSHARED'] = vars['LDSHARED'].replace('-bundle', '')
-    python_module_link_args.append('-bundle')
-    builder = setuptools.command.build_ext.build_ext(Distribution())
-    full_name = builder.get_ext_filename('libsharp')
+    python_module_link_args += ['-bundle']
 else:
-    extra_cc_compile_args += ['-fopenmp', '-march=native', '-O3', '-ffast-math']
-    python_module_link_args += ['-fopenmp', '-march=native']
-    extra_cc_compile_args.append('--std=c++11')
-    python_module_link_args.append("-Wl,-rpath,$ORIGIN")
+    extra_cc_compile_args += ['--std=c++11']
+    python_module_link_args += ["-Wl,-rpath,$ORIGIN"]
 
 
 def get_extension_modules():