From 351180baf46d8ad051fd58150dd2c905672aebfb Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Wed, 22 Apr 2015 13:16:50 +0200 Subject: [PATCH 1/6] Moved over Legendre transforms from Commander project --- Makefile | 8 + libsharp/planck.make | 2 +- libsharp/sharp.h | 1 + libsharp/sharp_legendre.c | 1305 ++++++++++++++++++++++++++++++++++ libsharp/sharp_legendre.c.in | 163 +++++ libsharp/sharp_legendre.h | 62 ++ libsharp/sharp_vecsupport.h | 37 + libsharp/sharp_vecutil.h | 10 + runjinja.py | 16 + 9 files changed, 1603 insertions(+), 1 deletion(-) create mode 100644 libsharp/sharp_legendre.c create mode 100644 libsharp/sharp_legendre.c.in create mode 100644 libsharp/sharp_legendre.h create mode 100755 runjinja.py diff --git a/Makefile b/Makefile index 5f2d9c8..dd9931e 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,11 @@ perftest: compile_all $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1 + +# Jinja templates + +%.c: %.c.in + ./runjinja.py < $< > $@ + +genclean: + rm libsharp/sharp_legendre.c || exit 0 diff --git a/libsharp/planck.make b/libsharp/planck.make index 3397422..fcf51df 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD) HDR_$(PKG):=$(SD)/*.h LIB_$(PKG):=$(LIBDIR)/libsharp.a BIN:=sharp_testsuite -LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o +LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o ALLOBJ:=$(LIBOBJ) sharp_testsuite.o LIBOBJ:=$(LIBOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%) diff --git a/libsharp/sharp.h b/libsharp/sharp.h index 9c5dd57..da6fc4a 100644 --- a/libsharp/sharp.h +++ b/libsharp/sharp.h @@ -39,5 +39,6 @@ #include #include "sharp_lowlevel.h" +#include "sharp_legendre.h" #endif diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c new file mode 100644 index 0000000..75518d6 --- /dev/null +++ b/libsharp/sharp_legendre.c @@ -0,0 +1,1305 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + + + +static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(1) * VLEN], + double out[(1) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + +} + +static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(2) * VLEN], + double out[(2) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + +} + +static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(3) * VLEN], + double out[(3) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + 
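+      /* identical recurrence step, applied to the next vector-width block of x values */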
Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + +} + +static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(4) * VLEN], + double out[(4) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + +} + +static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(5) * VLEN], + double out[(5) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = 
P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, P_4, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + +} + +static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(6) * VLEN], + double out[(6) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv P_5, Pm1_5, Pm2_5, x5, y5; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + x5 = vloadu(xarr + 5 * VLEN); + Pm1_5 = vload(1.0); + P_5 = x5; + b = vbroadcast(bl); + y5 = vmul(Pm1_5, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + vfmaeq(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul(x5, Pm1_5); + W2 = W1; + W2 = vsub(W2, Pm2_5); + vfmaeq(W1, W2, R); + P_5 = W1; + vfmaeq(y5, P_5, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + + vstoreu(out + 5 * VLEN, y5); + +} + + + +static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(1) * VLEN_s], + float out[(1) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + + 
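+    /* y0 now holds bl[0]*P_0; the l = 1 term is added next, and the loop below handles l >= 2 */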
b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + +} + +static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(2) * VLEN_s], + float out[(2) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + +} + +static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(3) * VLEN_s], + float out[(3) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + +} + +static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(4) * VLEN_s], + float out[(4) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = 
vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + +} + +static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(5) * VLEN_s], + float out[(5) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 
* VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + +} + +static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(6) * VLEN_s], + float out[(6) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s P_5, Pm1_5, Pm2_5, x5, y5; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + x5 = vloadu_s(xarr + 5 * VLEN_s); + Pm1_5 = vload_s(1.0); + P_5 = x5; + b = vbroadcast_s(bl); + y5 = vmul_s(Pm1_5, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + vfmaeq_s(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul_s(x5, Pm1_5); + W2 = W1; + W2 = vsub_s(W2, Pm2_5); + vfmaeq_s(W1, W2, R); + P_5 = W1; + vfmaeq_s(y5, P_5, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + + vstoreu_s(out + 5 * VLEN_s, y5); + +} + + + + + +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (double)(l - 1) / (double)l; + } +} + +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (float)(l - 1) / (float)l; + } +} + + +/* + Compute sum_l b_l P_l(x_i) for all i. 
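+
+   The evaluation uses the standard three-term Legendre recurrence,
+
+       P_0(x) = 1,   P_1(x) = x,
+       P_l(x) = x P_{l-1}(x) + ((l-1)/l) (x P_{l-1}(x) - P_{l-2}(x)),
+
+   with the (l-1)/l factors supplied by recfacs[]; the sum y += b_l P_l(x)
+   is accumulated on the fly, so no P_l values are ever stored.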
+ */ + + + +#define CS 4 +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + + +void sharp_legendre_transform(double *bl, + double *recfac, + ptrdiff_t lmax, + double *x, double *out, ptrdiff_t nx) { + double xchunk[LEN], outchunk[LEN]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(double) * (lmax + 1)); + sharp_legendre_transform_recfac(recfac, lmax); + } + + for (j = 0; j != LEN; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN) { + len = (i + (LEN) <= nx) ? (LEN) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN - 1) / VLEN) { + case 6: legendre_transform_vec6(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + +void sharp_legendre_transform_s(float *bl, + float *recfac, + ptrdiff_t lmax, + float *x, float *out, ptrdiff_t nx) { + float xchunk[LEN_s], outchunk[LEN_s]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(float) * (lmax + 1)); + sharp_legendre_transform_recfac_s(recfac, lmax); + } + + for (j = 0; j != LEN_s; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN_s) { + len = (i + (LEN_s) <= nx) ? (LEN_s) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN_s - 1) / VLEN_s) { + case 6: legendre_transform_vec6_s(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5_s(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4_s(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3_s(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2_s(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1_s(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in new file mode 100644 index 0000000..9862127 --- /dev/null +++ b/libsharp/sharp_legendre.c.in @@ -0,0 +1,163 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +/*{ for cs in range(1, 7) }*/ +static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax, + {{scalar}} xarr[({{cs}}) * VLEN{{T}}], + {{scalar}} out[({{cs}}) * VLEN{{T}}]) { + /*{ for i in range(cs) }*/ + Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}}; + /*{ endfor }*/ + Tv{{T}} W1, W2, b, R; + ptrdiff_t l; + + /*{ for i in range(cs) }*/ + x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}}); + Pm1_{{i}} = vload{{T}}(1.0); + P_{{i}} = x{{i}}; + b = vbroadcast{{T}}(bl); + y{{i}} = vmul{{T}}(Pm1_{{i}}, b); + /*{ endfor }*/ + + b = vbroadcast{{T}}(bl + 1); + /*{ for i in range(cs) }*/ + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast{{T}}(bl + l); + R = vbroadcast{{T}}(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + /*{ for i in range(cs) }*/ + Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}}; + W1 = vmul{{T}}(x{{i}}, Pm1_{{i}}); + W2 = W1; + W2 = vsub{{T}}(W2, Pm2_{{i}}); + vfmaeq{{T}}(W1, W2, R); + P_{{i}} = W1; + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + } + /*{ for i in range(cs) }*/ + vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}}); + /*{ endfor }*/ +} +/*{ endfor }*/ +/*{ endfor }*/ + + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = ({{scalar}})(l - 1) / ({{scalar}})l; + } +} +/*{ endfor }*/ + +/* + Compute sum_l b_l P_l(x_i) for all i. + */ + +/*{set cs=4}*/ + +#define CS {{cs}} +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform{{T}}({{scalar}} *bl, + {{scalar}} *recfac, + ptrdiff_t lmax, + {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) { + {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1)); + sharp_legendre_transform_recfac{{T}}(recfac, lmax); + } + + for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN{{T}}) { + len = (i + (LEN{{T}}) <= nx) ? 
(LEN{{T}}) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) { + case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} +/*{ endfor }*/ + diff --git a/libsharp/sharp_legendre.h b/libsharp/sharp_legendre.h new file mode 100644 index 0000000..cfd8aee --- /dev/null +++ b/libsharp/sharp_legendre.h @@ -0,0 +1,62 @@ +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.h + * Interface for the Legendre transform parts of the spherical transform library. 
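+ *
+ * Minimal usage sketch (arrays allocated and filled by the caller; passing
+ * recfac == NULL makes the library compute the recurrence factors itself):
+ *
+ *     sharp_legendre_transform(bl, NULL, lmax, x, out, nx);
+ *
+ * which computes out[i] = sum_l bl[l] * P_l(x[i]) for each of the nx values.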
+ * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#ifndef SHARP_LEGENDRE_H +#define SHARP_LEGENDRE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NO_LEGENDRE + +void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x, + double *out, ptrdiff_t nx); +void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x, + float *out, ptrdiff_t nx); +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax); +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index b26a539..28073ac 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -40,23 +40,31 @@ typedef double Ts; #if (VLEN==1) typedef double Tv; +typedef float Tv_s; typedef int Tm; #define vadd(a,b) ((a)+(b)) +#define vadd_s(a,b) ((a)+(b)) #define vaddeq(a,b) ((a)+=(b)) #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b); #define vsub(a,b) ((a)-(b)) +#define vsub_s(a,b) ((a)-(b)) #define vsubeq(a,b) ((a)-=(b)) #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b); #define vmul(a,b) ((a)*(b)) +#define vmul_s(a,b) ((a)*(b)) #define vmuleq(a,b) ((a)*=(b)) #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b); #define vfmaeq(a,b,c) ((a)+=(b)*(c)) +#define vfmaeq_s(a,b,c) ((a)+=(b)*(c)) #define vfmseq(a,b,c) ((a)-=(b)*(c)) #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e)) #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e)) #define vneg(a) (-(a)) #define vload(a) (a) +#define vload_s(a) (a) +#define vloadu(p) (*p) +#define vloadu_s(p) (*p) #define vabs(a) fabs(a) #define vsqrt(a) sqrt(a) #define vlt(a,b) ((a)<(b)) @@ -64,6 +72,10 @@ typedef int Tm; #define vge(a,b) ((a)>=(b)) #define vne(a,b) ((a)!=(b)) #define vand_mask(a,b) ((a)&&(b)) +#define vbroadcast(p) (*p) +#define vbroadcast_s(p) (*p) +#define vstoreu(p, a) (*(p)=a) +#define vstoreu_s(p, a) (*(p)=a) static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } @@ -87,6 +99,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } #endif typedef __m128d Tv; +typedef __m128 Tv_s; typedef __m128d Tm; #if defined(__SSE4_1__) @@ -99,15 +112,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vone _mm_set1_pd(1.) 
#define vadd(a,b) _mm_add_pd(a,b) +#define vadd_s(a,b) _mm_add_ps(a,b) #define vaddeq(a,b) a=_mm_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm_sub_pd(a,b) +#define vsub_s(a,b) _mm_sub_ps(a,b) #define vsubeq(a,b) a=_mm_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm_mul_pd(a,b) +#define vmul_s(a,b) _mm_mul_ps(a,b) #define vmuleq(a,b) a=_mm_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone)) #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c)) +#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c)) #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c)) #define vfmaaeq(a,b,c,d,e) \ a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) @@ -115,6 +132,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) #define vload(a) _mm_set1_pd(a) +#define vload_s(a) _mm_set1_ps(a) #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) #define vsqrt(a) _mm_sqrt_pd(a) #define vlt(a,b) _mm_cmplt_pd(a,b) @@ -126,6 +144,12 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vallTrue(a) (_mm_movemask_pd(a)==3) +#define vloadu(p) _mm_loadu_pd(p) +#define vloadu_s(p) _mm_loadu_ps(p) +#define vstoreu(p, v) _mm_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm_storeu_ps(p, v) +#define vbroadcast(p) _mm_set_pd(*p, *p) +#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p) #endif @@ -137,6 +161,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #endif typedef __m256d Tv; +typedef __m256 Tv_s; typedef __m256d Tm; #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) @@ -144,16 +169,20 @@ typedef __m256d Tm; #define vone _mm256_set1_pd(1.) 
#define vadd(a,b) _mm256_add_pd(a,b) +#define vadd_s(a,b) _mm256_add_ps(a,b) #define vaddeq(a,b) a=_mm256_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm256_sub_pd(a,b) +#define vsub_s(a,b) _mm256_sub_ps(a,b) #define vsubeq(a,b) a=_mm256_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm256_mul_pd(a,b) +#define vmul_s(a,b) _mm256_mul_ps(a,b) #define vmuleq(a,b) a=_mm256_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone)) #ifdef __FMA4__ #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) +#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) @@ -167,6 +196,7 @@ typedef __m256d Tm; #endif #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vload(a) _mm256_set1_pd(a) +#define vload_s(a) _mm256_set1_ps(a) #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) #define vsqrt(a) _mm256_sqrt_pd(a) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) @@ -179,6 +209,13 @@ typedef __m256d Tm; #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vallTrue(a) (_mm256_movemask_pd(a)==15) +#define vloadu(p) _mm256_loadu_pd(p) +#define vloadu_s(p) _mm256_loadu_ps(p) +#define vstoreu(p, v) _mm256_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm256_storeu_ps(p, v) +#define vbroadcast(p) _mm256_broadcast_sd(p) +#define vbroadcast_s(p) _mm256_broadcast_ss(p) + #endif #if (VLEN==8) diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index 16bfa13..0aee4ae 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -32,6 +32,8 @@ #ifndef SHARP_VECUTIL_H #define SHARP_VECUTIL_H +#ifndef VLEN + #if (defined (__MIC__)) #define VLEN 8 #elif (defined (__AVX__)) @@ -43,3 +45,11 @@ #endif #endif + +#if (VLEN==1) +#define VLEN_s 1 +#else +#define VLEN_s (2*VLEN) +#endif + +#endif diff --git a/runjinja.py b/runjinja.py new file mode 100755 index 0000000..5269c1a --- /dev/null +++ b/runjinja.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +""" +Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT. 
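+
+Usage, as invoked by the Makefile rule for %.c: %.c.in targets:
+
+    ./runjinja.py < libsharp/sharp_legendre.c.in > libsharp/sharp_legendre.c
+
+Requires the jinja2 package.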
+""" + +import sys +from jinja2 import Template, Environment + +env = Environment(block_start_string='/*{', + block_end_string='}*/', + variable_start_string='{{', + variable_end_string='}}') + +extra_vars = dict(len=len) +sys.stdout.write(env.from_string(sys.stdin.read()).render(**extra_vars)) From 0a0cad34aa9edd1f985f7e39d951e54593660a5a Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Wed, 22 Apr 2015 13:18:43 +0200 Subject: [PATCH 2/6] Python wrapper with test for Legendre transform --- .gitignore | 5 ++ Makefile | 7 ++ python/fake_pyrex/Pyrex/Distutils/__init__.py | 1 + .../fake_pyrex/Pyrex/Distutils/build_ext.py | 1 + python/fake_pyrex/Pyrex/__init__.py | 1 + python/fake_pyrex/README | 2 + python/libsharp/__init__.py | 1 + python/libsharp/libsharp.pyx | 41 ++++++++++ python/libsharp/tests/__init__.py | 1 + python/libsharp/tests/test_legendre.py | 29 +++++++ python/setup.py | 76 +++++++++++++++++++ 11 files changed, 165 insertions(+) create mode 100644 python/fake_pyrex/Pyrex/Distutils/__init__.py create mode 100644 python/fake_pyrex/Pyrex/Distutils/build_ext.py create mode 100644 python/fake_pyrex/Pyrex/__init__.py create mode 100644 python/fake_pyrex/README create mode 100644 python/libsharp/__init__.py create mode 100644 python/libsharp/libsharp.pyx create mode 100644 python/libsharp/tests/__init__.py create mode 100644 python/libsharp/tests/test_legendre.py create mode 100644 python/setup.py diff --git a/.gitignore b/.gitignore index cd0bf21..b555caa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.o +*.so #* *~ +*.pyc +*.pyo /auto /autom4te.cache @@ -9,3 +12,5 @@ /config/config.auto /configure /sharp_oracle.inc + +/python/libsharp/libsharp.c diff --git a/Makefile b/Makefile index dd9931e..f44ecfd 100644 --- a/Makefile +++ b/Makefile @@ -61,3 +61,10 @@ perftest: compile_all genclean: rm libsharp/sharp_legendre.c || exit 0 + +pytest: + rm python/libsharp/libsharp.so || exit 0 + cd python && LIBSHARP_INCLUDE=$(INCDIR) LIBSHARP_LIB=$(LIBDIR) python setup.py build_ext --inplace + cd python && nosetests libsharp + + diff --git a/python/fake_pyrex/Pyrex/Distutils/__init__.py b/python/fake_pyrex/Pyrex/Distutils/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/Pyrex/Distutils/build_ext.py b/python/fake_pyrex/Pyrex/Distutils/build_ext.py new file mode 100644 index 0000000..4f846f6 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/build_ext.py @@ -0,0 +1 @@ +build_ext = "yes, it's there!" diff --git a/python/fake_pyrex/Pyrex/__init__.py b/python/fake_pyrex/Pyrex/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/README b/python/fake_pyrex/README new file mode 100644 index 0000000..cf3f3ff --- /dev/null +++ b/python/fake_pyrex/README @@ -0,0 +1,2 @@ +This directory is here to fool setuptools into building .pyx files +even if Pyrex is not installed. See ../setup.py. 
\ No newline at end of file
diff --git a/python/libsharp/__init__.py b/python/libsharp/__init__.py
new file mode 100644
index 0000000..dd0fa41
--- /dev/null
+++ b/python/libsharp/__init__.py
@@ -0,0 +1 @@
+from .libsharp import *
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
new file mode 100644
index 0000000..0abfe69
--- /dev/null
+++ b/python/libsharp/libsharp.pyx
@@ -0,0 +1,41 @@
+import numpy as np
+
+cdef extern from "sharp.h":
+    ctypedef long ptrdiff_t
+
+    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                    float *out, ptrdiff_t nx)
+    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                                  double *out, ptrdiff_t nx)
+    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
+    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
+
+
+def legendre_transform(x, bl, out=None):
+    if out is None:
+        out = np.empty_like(x)
+    if x.dtype == np.float64:
+        if bl.dtype != np.float64:
+            bl = bl.astype(np.float64)
+        return _legendre_transform(x, bl, out=out)
+    elif x.dtype == np.float32:
+        if bl.dtype != np.float32:
+            bl = bl.astype(np.float32)
+        return _legendre_transform_s(x, bl, out=out)
+    else:
+        raise ValueError("unsupported dtype")
+
+
+def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
+
+def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
diff --git a/python/libsharp/tests/__init__.py b/python/libsharp/tests/__init__.py
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/python/libsharp/tests/__init__.py
@@ -0,0 +1 @@
+# empty
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
new file mode 100644
index 0000000..2e96822
--- /dev/null
+++ b/python/libsharp/tests/test_legendre.py
@@ -0,0 +1,29 @@
+import numpy as np
+from scipy.special import legendre
+import libsharp
+
+
+def test_legendre_transform():
+    lmax = 20
+    ntheta = 19
+
+    l = np.arange(lmax + 1)
+    bl = np.exp(-l*(l+1))
+    bl *= (2 * l + 1)
+
+    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
+    x = np.cos(theta)
+
+    # Compute truth using scipy.special.legendre
+    P = np.zeros((ntheta, lmax + 1))
+    for l in range(lmax + 1):
+        P[:, l] = legendre(l)(x)
+    y0 = np.dot(P, bl)
+
+    # double-precision
+    y = libsharp.legendre_transform(x, bl)
+    assert np.max(np.abs(y - y0) / np.abs(y)) < 1e-12
+
+    # single-precision
+    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
+    assert np.max(np.abs(y32 - y) / np.abs(y32)) < 1e-4
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..1500f05
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,76 @@
+#!
/usr/bin/env python
+
+descr = """Spherical Harmonic transforms package
+
+Python API for the libsharp spherical harmonic transforms library
+"""
+
+import os
+import sys
+
+DISTNAME = 'libsharp'
+DESCRIPTION = 'libsharp library for fast Spherical Harmonic Transforms'
+LONG_DESCRIPTION = descr
+MAINTAINER = 'Dag Sverre Seljebotn'
+MAINTAINER_EMAIL = 'd.s.seljebotn@astro.uio.no'
+URL = 'http://sourceforge.net/projects/libsharp/'
+LICENSE = 'GPL'
+DOWNLOAD_URL = "http://sourceforge.net/projects/libsharp/"
+VERSION = '0.1'
+
+# Add our fake Pyrex at the end of the Python search path
+# in order to fool setuptools into allowing compilation of
+# pyx files to C files. Importing Cython.Distutils then
+# makes Cython the tool of choice for this rather than
+# (the possibly nonexisting) Pyrex.
+project_path = os.path.split(__file__)[0]
+sys.path.append(os.path.join(project_path, 'fake_pyrex'))
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+import numpy as np
+
+libsharp = os.environ.get('LIBSHARP', None)
+libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
+libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
+
+if libsharp_include is None or libsharp_lib is None:
+    sys.stderr.write('Please set LIBSHARP environment variable to the install directory of libsharp; '
+                     'this script will refer to the lib and include sub-directories. Alternatively '
+                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
+    sys.exit(1)
+
+if __name__ == "__main__":
+    setup(install_requires = ['numpy'],
+          packages = find_packages(),
+          test_suite="nose.collector",
+          # Well, technically zipping the package will work, but since it's
+          # all compiled code it'll just get unzipped again at runtime, which
+          # is pointless:
+          zip_safe = False,
+          name = DISTNAME,
+          version = VERSION,
+          maintainer = MAINTAINER,
+          maintainer_email = MAINTAINER_EMAIL,
+          description = DESCRIPTION,
+          license = LICENSE,
+          url = URL,
+          download_url = DOWNLOAD_URL,
+          long_description = LONG_DESCRIPTION,
+          classifiers =
+            [ 'Development Status :: 3 - Alpha',
+              'Environment :: Console',
+              'Intended Audience :: Developers',
+              'Intended Audience :: Science/Research',
+              'License :: OSI Approved :: GNU General Public License (GPL)',
+              'Topic :: Scientific/Engineering'],
+          cmdclass = {"build_ext": build_ext},
+          ext_modules = [
+            Extension("libsharp.libsharp",
+                      ["libsharp/libsharp.pyx"],
+                      libraries=["sharp", "fftpack", "c_utils"],
+                      include_dirs=[libsharp_include],
+                      library_dirs=[libsharp_lib]
+                      ),
+          ],
+         )
From ea8671c2eccbae23d0e92cc20a1571b0b323286e Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Wed, 22 Apr 2015 14:23:39 +0200
Subject: [PATCH 3/6] Prevent invoking Jinja templating unnecessarily

---
 Makefile                     | 6 +++---
 libsharp/sharp_legendre.c    | 4 +++-
 libsharp/sharp_legendre.c.in | 1 -
 runjinja.py                  | 5 ++++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index f44ecfd..a621948 100644
--- a/Makefile
+++ b/Makefile
@@ -54,10 +54,10 @@ perftest: compile_all
 	$(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
 	$(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
 
-# Jinja templates
-
 %.c: %.c.in
-	./runjinja.py < $< > $@
+# Only do this if the md5sum changed, in order to avoid Python and Jinja
+# dependency when not modifying the c.in file
+	grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@
 
 genclean:
 	rm libsharp/sharp_legendre.c || exit 0
diff --git 
a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c index 75518d6..8b4fe4b 100644 --- a/libsharp/sharp_legendre.c +++ b/libsharp/sharp_legendre.c @@ -1,4 +1,4 @@ -/* +/* DO NOT EDIT. md5sum of source: 88eec944ab6fbfdc7f391fbb58df3bf1 *//* NOTE NOTE NOTE @@ -1303,3 +1303,5 @@ void sharp_legendre_transform_s(float *bl, } } + + diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in index 9862127..e30f787 100644 --- a/libsharp/sharp_legendre.c.in +++ b/libsharp/sharp_legendre.c.in @@ -160,4 +160,3 @@ void sharp_legendre_transform{{T}}({{scalar}} *bl, } } /*{ endfor }*/ - diff --git a/runjinja.py b/runjinja.py index 5269c1a..fd659fa 100755 --- a/runjinja.py +++ b/runjinja.py @@ -5,6 +5,7 @@ Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT. """ import sys +import hashlib from jinja2 import Template, Environment env = Environment(block_start_string='/*{', @@ -13,4 +14,6 @@ env = Environment(block_start_string='/*{', variable_end_string='}}') extra_vars = dict(len=len) -sys.stdout.write(env.from_string(sys.stdin.read()).render(**extra_vars)) +input = sys.stdin.read() +sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input).hexdigest()) +sys.stdout.write(env.from_string(input).render(**extra_vars)) From f2fe4f9ca2177803008d70e58553c91c183c4930 Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Thu, 23 Apr 2015 09:57:45 +0200 Subject: [PATCH 4/6] legendre transforms: Polishing and bugfixes vbroadcast didn't work properly for some reason, vload does.. --- libsharp/sharp_legendre.c | 274 +++++++++++++------------ libsharp/sharp_legendre.c.in | 38 ++-- libsharp/sharp_vecsupport.h | 15 +- libsharp/sharp_vecutil.h | 8 + python/libsharp/libsharp.pyx | 4 +- python/libsharp/tests/test_legendre.py | 26 ++- 6 files changed, 204 insertions(+), 161 deletions(-) diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c index 8b4fe4b..cd54101 100644 --- a/libsharp/sharp_legendre.c +++ b/libsharp/sharp_legendre.c @@ -1,4 +1,4 @@ -/* DO NOT EDIT. md5sum of source: 88eec944ab6fbfdc7f391fbb58df3bf1 *//* +/* DO NOT EDIT. 
   md5sum of source: d9499375a254cbf1e7903a249a8676ff
 *//*

   NOTE NOTE NOTE
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */

+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"

@@ -66,18 +81,18 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);


@@ -113,17 +128,17 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -131,8 +146,8 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -142,16 +157,16 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);


@@ -180,23 +195,23 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -206,8 +221,8 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -217,24 +232,24 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);


@@ -267,29 +282,29 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -301,8 +316,8 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -312,32 +327,32 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);


@@ -374,35 +389,35 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);

     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0);
     P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -416,8 +431,8 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -427,40 +442,40 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
         vfmaeq(y4, P_4, b);


@@ -501,41 +516,41 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);

     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0);
     P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);

     x5 = vloadu(xarr + 5 * VLEN);
     Pm1_5 = vload(1.0);
     P_5 = x5;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y5 = vmul(Pm1_5, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -551,8 +566,8 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -562,48 +577,48 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
         vfmaeq(y4, P_4, b);

         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul(x5, Pm1_5);
         W2 = W1;
         W2 = vsub(W2, Pm2_5);
-        vfmaeq(W1, W2, R);
         P_5 = W1;
+        vfmaeq(P_5, W2, R);
         vfmaeq(y5, P_5, b);


@@ -638,18 +653,18 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -659,8 +674,8 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);


@@ -685,17 +700,17 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -703,8 +718,8 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -714,16 +729,16 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);


@@ -752,23 +767,23 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -778,8 +793,8 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -789,24 +804,24 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);


@@ -839,29 +854,29 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -873,8 +888,8 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -884,32 +899,32 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);


@@ -946,35 +961,35 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);

     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0);
     P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -988,8 +1003,8 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -999,40 +1014,40 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
         vfmaeq_s(y4, P_4, b);


@@ -1073,41 +1088,41 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);

     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0);
     P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);

     x5 = vloadu_s(xarr + 5 * VLEN_s);
     Pm1_5 = vload_s(1.0);
     P_5 = x5;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y5 = vmul_s(Pm1_5, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -1123,8 +1138,8 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -1134,48 +1149,48 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
         vfmaeq_s(y4, P_4, b);

         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul_s(x5, Pm1_5);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_5);
-        vfmaeq_s(W1, W2, R);
         P_5 = W1;
+        vfmaeq_s(P_5, W2, R);
         vfmaeq_s(y5, P_5, b);


@@ -1224,24 +1239,21 @@ void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
 */
-
-
-#define CS 4
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)

 void sharp_legendre_transform(double *bl,
                               double *recfac,
                               ptrdiff_t lmax,
                               double *x,
                               double *out,
                               ptrdiff_t nx) {
-    double xchunk[LEN], outchunk[LEN];
+    double xchunk[MAX_CS * VLEN], outchunk[MAX_CS * VLEN];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(double) * (lmax + 1));
+        recfac = malloc(sizeof(double) * (lmax + 1));
         sharp_legendre_transform_recfac(recfac, lmax);
     }

@@ -1271,13 +1283,13 @@ void sharp_legendre_transform_s(float *bl,
                                 float *recfac,
                                 ptrdiff_t lmax,
                                 float *x,
                                 float *out,
                                 ptrdiff_t nx) {
-    float xchunk[LEN_s], outchunk[LEN_s];
+    float xchunk[MAX_CS * VLEN_s], outchunk[MAX_CS * VLEN_s];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(float) * (lmax + 1));
+        recfac = malloc(sizeof(float) * (lmax + 1));
         sharp_legendre_transform_recfac_s(recfac, lmax);
     }

@@ -1304,4 +1316,4 @@ void sharp_legendre_transform_s(float *bl,
 }


-
+#endif
\ No newline at end of file
diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in
index e30f787..63c84b7 100644
--- a/libsharp/sharp_legendre.c.in
+++ b/libsharp/sharp_legendre.c.in
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */

+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"

@@ -66,18 +81,18 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
     x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
     Pm1_{{i}} = vload{{T}}(1.0);
     P_{{i}} = x{{i}};
-    b = vbroadcast{{T}}(bl);
+    b = vload{{T}}(*bl);
     y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
     /*{ endfor }*/

-    b = vbroadcast{{T}}(bl + 1);
+    b = vload{{T}}(*(bl + 1));
     /*{ for i in range(cs) }*/
     vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
     /*{ endfor }*/

     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast{{T}}(bl + l);
-        R = vbroadcast{{T}}(recfacs + l);
+        b = vload{{T}}(*(bl + l));
+        R = vload{{T}}(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
         W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
         W2 = W1;
         W2 = vsub{{T}}(W2, Pm2_{{i}});
-        vfmaeq{{T}}(W1, W2, R);
         P_{{i}} = W1;
+        vfmaeq{{T}}(P_{{i}}, W2, R);
         vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
         /*{ endfor }*/

@@ -117,24 +132,21 @@ void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
 */
-/*{set cs=4}*/
-
-#define CS {{cs}}
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)

 /*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
 void sharp_legendre_transform{{T}}({{scalar}} *bl,
                                    {{scalar}} *recfac,
                                    ptrdiff_t lmax,
                                    {{scalar}} *x,
                                    {{scalar}} *out,
                                    ptrdiff_t nx) {
-    {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}];
+    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * VLEN{{T}}];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1));
+        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
         sharp_legendre_transform_recfac{{T}}(recfac, lmax);
     }

@@ -160,3 +172,5 @@ void sharp_legendre_transform{{T}}({{scalar}} *bl,
     }
 }
 /*{ endfor }*/
+
+#endif
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 28073ac..ee4c5e7 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -63,8 +63,8 @@ typedef int Tm;
 #define vneg(a) (-(a))
 #define vload(a) (a)
 #define vload_s(a) (a)
-#define vloadu(p) (*p)
-#define vloadu_s(p) (*p)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -72,8 +72,6 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
-#define vbroadcast(p) (*p)
-#define vbroadcast_s(p) (*p)
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)

@@ -148,15 +146,13 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vloadu_s(p) _mm_loadu_ps(p)
 #define vstoreu(p, v) _mm_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm_storeu_ps(p, v)
-#define vbroadcast(p) _mm_set_pd(*p, *p)
-#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p)

 #endif

 #if (VLEN==4)

 #include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
 #include <x86intrin.h>
 #endif

@@ -180,7 +176,7 @@ typedef __m256d Tm;
 #define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#ifdef __FMA4__
+#if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
@@ -188,6 +184,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@@ -213,8 +210,6 @@ typedef __m256d Tm;
 #define vloadu_s(p) _mm256_loadu_ps(p)
 #define vstoreu(p, v) _mm256_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
-#define vbroadcast(p) _mm256_broadcast_sd(p)
-#define vbroadcast_s(p) _mm256_broadcast_ss(p)

 #endif

diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 0aee4ae..f6161ca 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -52,4 +52,12 @@
 #define VLEN_s (2*VLEN)
 #endif

+#ifndef USE_FMA4
+#ifdef __FMA4__
+#define USE_FMA4 1
+#else
+#define USE_FMA4 0
+#endif
+#endif
+
 #endif
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index 0abfe69..e622bb3 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -14,7 +14,9 @@ cdef extern from "sharp.h":
 def legendre_transform(x, bl, out=None):
     if out is None:
         out = np.empty_like(x)
-    if x.dtype == np.float64:
+    if out.shape[0] == 0:
+        return out
+    elif x.dtype == np.float64:
         if bl.dtype != np.float64:
             bl = bl.astype(np.float64)
         return _legendre_transform(x, bl, out=out)
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 2e96822..66b4c18 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -2,14 +2,17 @@ import numpy as np
 from scipy.special import legendre

 import libsharp
+from numpy.testing import assert_allclose


-def test_legendre_transform():
-    lmax = 20
-    ntheta = 19
+def check_legendre_transform(lmax, ntheta):
     l = np.arange(lmax + 1)
-    bl = np.exp(-l*(l+1))
-    bl *= (2 * l + 1)
+    if lmax >= 1:
+        sigma = -np.log(1e-3) / lmax / (lmax + 1)
+        bl = np.exp(-sigma*l*(l+1))
+        bl *= (2 * l + 1)
+    else:
+        bl = np.asarray([1], dtype=np.double)

     theta = np.linspace(0, np.pi, ntheta, endpoint=True)
     x = np.cos(theta)
@@ -20,10 +23,19 @@ def test_legendre_transform():
         P[:, l] = legendre(l)(x)

     y0 = np.dot(P, bl)
+
+    # double-precision
     y = libsharp.legendre_transform(x, bl)
-    assert np.max(np.abs(y - y) / np.abs(y)) < 1e-12
+
+    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)

     # single-precision
     y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
-    assert np.max(np.abs(y32 - y) / np.abs(y32)) < 1e-4
+    assert_allclose(y32, y0, rtol=1e-5, atol=1e-5)
+
+
+def test_legendre_transform():
+    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
+    for ntheta in nthetas_to_try:
+        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
+            yield check_legendre_transform, lmax, ntheta

From 765831ea2b4cea62a35ee47606b64dae2d6eb73d Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Thu, 23 Apr 2015 18:54:39 +0200
Subject: [PATCH 5/6] legendre transforms: Fortran wrapper

---
 fortran/sharp.f90      | 39 +++++++++++++++++++++++++++++++++++++++
 fortran/test_sharp.f90 | 27 +++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/fortran/sharp.f90 b/fortran/sharp.f90
index 7dc815d..fb80c3c 100644
--- a/fortran/sharp.f90
+++ b/fortran/sharp.f90
@@ -103,12 +103,32 @@ module sharp
        type(c_ptr), intent(in) :: alm(*), map(*)
      end subroutine c_sharp_execute_mpi

+     ! Legendre transforms
+     subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_double)              :: bl(lmax + 1), x(nx), out(nx)
+       real(c_double), optional    :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform
+
+     subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform_s')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_float)               :: bl(lmax + 1), x(nx), out(nx)
+       real(c_float), optional     :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform_s
  end interface

  interface sharp_execute
     module procedure sharp_execute_d
  end interface

+ interface sharp_legendre_transform
+    module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
+ end interface sharp_legendre_transform
+
contains

  ! alm info
@@ -240,6 +260,25 @@ contains
     end if
   end subroutine sharp_execute_d

+  subroutine sharp_legendre_transform_d(bl, x, out)
+    use iso_c_binding
+    real(c_double) :: bl(:)
+    real(c_double) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+         x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_d
+
+  subroutine sharp_legendre_transform_s(bl, x, out)
+    use iso_c_binding
+    real(c_float) :: bl(:)
+    real(c_float) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+         x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_s

 end module
diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90
index f42cf0c..0b7cce2 100644
--- a/fortran/test_sharp.f90
+++ b/fortran/test_sharp.f90
@@ -54,4 +54,31 @@ program test_sharp
   print *, 'DONE'

   call MPI_Finalize(ierr)
+
+  print *, 'LEGENDRE TRANSFORMS'
+
+  call test_legendre_transforms()
+
+contains
+  subroutine test_legendre_transforms()
+    integer, parameter :: lmax = 20, nx=10
+    real(c_double) :: bl(0:lmax)
+    real(c_double) :: x(nx), out(nx)
+    real(c_float)  :: out_s(nx)
+    !--
+    integer :: l, i
+
+    do l = 0, lmax
+       bl(l) = 1.0 / real(l + 1, c_double)
+    end do
+    do i = 1, nx
+       x(i) = 1 / real(i, c_double)
+    end do
+    out = 0
+    call sharp_legendre_transform(bl, x, out)
+    print *, out
+    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
+    print *, out_s
+  end subroutine test_legendre_transforms
+
+
 end program test_sharp

From 7ecd1ddc93281d9e58fa28c06bf6d65102ee03eb Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Fri, 24 Apr 2015 09:06:20 +0200
Subject: [PATCH 6/6] Expose gauss_legendre_tbl publicly as sharp_legendre_roots

---
 libsharp/planck.make                   |  2 +-
 libsharp/sharp.h                       |  1 +
 libsharp/sharp_geomhelpers.c           | 66 +-------------------------
 libsharp/sharp_legendre_roots.c        | 67 ++++++++++++++++++++++++++
 libsharp/sharp_legendre_roots.h        | 50 +++++++++++++++++++
 python/libsharp/libsharp.pyx           | 13 +++++
 python/libsharp/tests/test_legendre.py | 18 +++++++
 7 files changed, 152 insertions(+), 65 deletions(-)
 create mode 100644 libsharp/sharp_legendre_roots.c
 create mode 100644 libsharp/sharp_legendre_roots.h

diff --git a/libsharp/planck.make b/libsharp/planck.make
index fcf51df..8bbd7e1 100644
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD)
 HDR_$(PKG):=$(SD)/*.h
 LIB_$(PKG):=$(LIBDIR)/libsharp.a
 BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o
+LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
 ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
 LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
 ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index da6fc4a..4497dd4 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -40,5 +40,6 @@

 #include "sharp_lowlevel.h"
 #include "sharp_legendre.h"
+#include "sharp_legendre_roots.h"

 #endif
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index cd088b3..dbb44e0 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -32,6 +32,7 @@
 #include <math.h>
 #include "sharp_geomhelpers.h"
+#include "sharp_legendre_roots.h"
 #include "c_utils.h"
 #include "ls_fft.h"
 #include
@@ -106,69 +107,6 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
   sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info);
   }

-static inline double one_minus_x2 (double x)
-  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
-
-/* Function adapted from GNU GSL file glfixed.c
-   Original author: Pavel Holoborodko (http://www.holoborodko.com)
-
-   Adjustments by M. Reinecke
-    - adjusted interface (keep epsilon internal, return full number of points)
-    - removed precomputed tables
-    - tweaked Newton iteration to obtain higher accuracy */
-static void gauss_legendre_tbl(int n, double *x, double *w)
-  {
-  const double pi = 3.141592653589793238462643383279502884197;
-  const double eps = 3e-14;
-  int m = (n+1)>>1;
-
-  double t0 = 1 - (1-1./n) / (8.*n*n);
-  double t1 = 1./(4.*n+2.);
-
-#pragma omp parallel
-{
-  int i;
-#pragma omp for schedule(dynamic,100)
-  for (i=1; i<=m; ++i)
-    {
-    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
-
-    int dobreak=0;
-    int j=0;
-    double dpdx;
-    while(1)
-      {
-      double P_1 = 1.0;
-      double P0 = x0;
-      double dx, x1;
-
-      for (int k=2; k<=n; k++)
-        {
-        double P_2 = P_1;
-        P_1 = P0;
-//      P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
-        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
-        }
-
-      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
-
-      /* Newton step */
-      x1 = x0 - P0/dpdx;
-      dx = x0-x1;
-      x0 = x1;
-      if (dobreak) break;
-
-      if (fabs(dx)<=eps) dobreak=1;
-      UTIL_ASSERT(++j<100,"convergence problem");
-      }
-
-    x[i-1] = -x0;
-    x[n-i] = x0;
-    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
-    }
-} // end of parallel region
-  }
-
 void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
   int stride_lon, int stride_lat, sharp_geom_info **geom_info)
   {
@@ -181,7 +119,7 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
   ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
   int *stride_=RALLOC(int,nrings);

-  gauss_legendre_tbl(nrings,theta,weight);
+  sharp_legendre_roots(nrings,theta,weight);

   for (int m=0; m<nrings; ++m)
diff --git a/libsharp/sharp_legendre_roots.c b/libsharp/sharp_legendre_roots.c
new file mode 100644
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.c
@@ -0,0 +1,67 @@
+/* Function adapted from GNU GSL file glfixed.c
+   Original author: Pavel Holoborodko (http://www.holoborodko.com)
+
+   Adjustments by M. Reinecke
+    - adjusted interface (keep epsilon internal, return full number of points)
+    - removed precomputed tables
+    - tweaked Newton iteration to obtain higher accuracy */
+
+#include <math.h>
+#include "sharp_legendre_roots.h"
+#include "c_utils.h"
+
+static inline double one_minus_x2 (double x)
+  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
+
+void sharp_legendre_roots(int n, double *x, double *w)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double eps = 3e-14;
+  int m = (n+1)>>1;
+
+  double t0 = 1 - (1-1./n) / (8.*n*n);
+  double t1 = 1./(4.*n+2.);
+
+#pragma omp parallel
+{
+  int i;
+#pragma omp for schedule(dynamic,100)
+  for (i=1; i<=m; ++i)
+    {
+    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
+
+    int dobreak=0;
+    int j=0;
+    double dpdx;
+    while(1)
+      {
+      double P_1 = 1.0;
+      double P0 = x0;
+      double dx, x1;
+
+      for (int k=2; k<=n; k++)
+        {
+        double P_2 = P_1;
+        P_1 = P0;
+//      P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
+        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
+        }
+
+      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
+
+      /* Newton step */
+      x1 = x0 - P0/dpdx;
+      dx = x0-x1;
+      x0 = x1;
+      if (dobreak) break;
+
+      if (fabs(dx)<=eps) dobreak=1;
+      UTIL_ASSERT(++j<100,"convergence problem");
+      }
+
+    x[i-1] = -x0;
+    x[n-i] = x0;
+    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
+    }
+} // end of parallel region
+  }
diff --git a/libsharp/sharp_legendre_roots.h b/libsharp/sharp_legendre_roots.h
new file mode 100644
index 0000000..2a056b2
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.h
@@ -0,0 +1,50 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_legendre_roots.h
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef SHARP_LEGENDRE_ROOTS_H
+#define SHARP_LEGENDRE_ROOTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
+    of degree \a n.
+    \param n Degree of Legendre polynomial
+    \param x Array of length \a n for output (root position)
+    \param w Array of length \a n for output (weight for Gaussian quadrature)
+ */
+void sharp_legendre_roots(int n, double *x, double *w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index e622bb3..9e83d3d 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -1,5 +1,7 @@
 import numpy as np

+__all__ = ['legendre_transform', 'legendre_roots']
+
 cdef extern from "sharp.h":
     ctypedef long ptrdiff_t

@@ -9,6 +11,7 @@ cdef extern from "sharp.h":
                                     double *out, ptrdiff_t nx)
     void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
     void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
+    void sharp_legendre_roots(int n, double *x, double *w)


 def legendre_transform(x, bl, out=None):
@@ -41,3 +44,13 @@ def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
     sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0],
                                x.shape[0])
     return np.asarray(out)
+
+def legendre_roots(n):
+    x = np.empty(n, np.double)
+    w = np.empty(n, np.double)
+    cdef double[::1] x_buf = x, w_buf = w
+    if not (x_buf.shape[0] == w_buf.shape[0] == n):
+        raise AssertionError()
+    if n > 0:
+        sharp_legendre_roots(n, &x_buf[0], &w_buf[0])
+    return x, w
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 66b4c18..7860d2a 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.special import legendre
+from scipy.special import p_roots

 import libsharp
 from numpy.testing import assert_allclose
@@ -39,3 +40,20 @@ def test_legendre_transform():
     for ntheta in nthetas_to_try:
         for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
             yield check_legendre_transform, lmax, ntheta
+
+def check_legendre_roots(n):
+    xs, ws = ([], []) if n == 0 else p_roots(n)  # from SciPy
+    xl, wl = libsharp.legendre_roots(n)
+    assert_allclose(xs, xl)
+    assert_allclose(ws, wl)
+
+def test_legendre_roots():
+    """
+    Test the Legendre root-finding algorithm from libsharp by comparing it with
+    the SciPy version.
+    """
+    yield check_legendre_roots, 0
+    yield check_legendre_roots, 1
+    yield check_legendre_roots, 32
+    yield check_legendre_roots, 33
+    yield check_legendre_roots, 128
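
For reference, the Newton iteration in sharp_legendre_roots() follows directly from the standard Legendre derivative identity. With P0 = P_n(x0), P_1 = P_{n-1}(x0) and dpdx = P_n'(x0) as in the code, the update and the quadrature weight are

    \[
      (1 - x^2)\,P_n'(x) = n\,\bigl(P_{n-1}(x) - x\,P_n(x)\bigr), \qquad
      x_{k+1} = x_k - \frac{P_n(x_k)}{P_n'(x_k)}, \qquad
      w_i = \frac{2}{(1 - x_i^2)\,P_n'(x_i)^2},
    \]

which is exactly what the lines computing dpdx, the Newton step and w[i-1] implement; one_minus_x2() only evaluates 1 - x^2 in the numerically safer of two algebraically equal forms.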
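The vectorized kernels in sharp_legendre.c are easiest to check against a scalar reference. The following minimal sketch (illustrative only; legendre_transform_scalar is not part of the library) spells out the rearranged Bonnet recurrence the kernels implement, with recfac[l] = (l-1)/l as filled in by sharp_legendre_transform_recfac():

    #include <stddef.h>

    /* out[i] = sum_{l=0}^{lmax} bl[l] * P_l(x[i]), scalar reference */
    static void legendre_transform_scalar(const double *bl, ptrdiff_t lmax,
                                          const double *x, double *out,
                                          ptrdiff_t nx)
    {
      for (ptrdiff_t i = 0; i < nx; ++i) {
        double Pm1 = 1.0;          /* P_0(x) */
        double P = x[i];           /* P_1(x) */
        double y = bl[0] * Pm1;
        if (lmax >= 1) y += bl[1] * P;
        for (ptrdiff_t l = 2; l <= lmax; ++l) {
          double Pm2 = Pm1; Pm1 = P;
          /* P_l = x*P_{l-1} + ((l-1)/l) * (x*P_{l-1} - P_{l-2}) */
          P = x[i]*Pm1 + ((double)(l - 1)/(double)l) * (x[i]*Pm1 - Pm2);
          y += bl[l] * P;
        }
        out[i] = y;
      }
    }

The rearranged form costs one multiply, one subtract and two fused multiply-adds per l, which is why the kernels map it onto vmul/vsub/vfmaeq rather than using the textbook three-term recurrence directly.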
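A usage sketch for the double-precision entry point (a hypothetical standalone program; assumes the libsharp headers and library are on the compiler's search paths). Passing recfac == NULL makes sharp_legendre_transform() allocate and fill the recurrence factors itself, as the compute_recfac branch above shows; the inputs mirror the Fortran test:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "sharp.h"

    int main(void)
    {
      ptrdiff_t lmax = 20, nx = 10;
      double *bl  = malloc((lmax + 1) * sizeof(double));
      double *x   = malloc(nx * sizeof(double));
      double *out = malloc(nx * sizeof(double));
      for (ptrdiff_t l = 0; l <= lmax; ++l) bl[l] = 1.0 / (l + 1); /* b_l = 1/(l+1) */
      for (ptrdiff_t i = 0; i < nx; ++i)   x[i]  = 1.0 / (i + 1);  /* x_i = 1/i */
      sharp_legendre_transform(bl, NULL, lmax, x, out, nx); /* NULL: recfac computed internally */
      for (ptrdiff_t i = 0; i < nx; ++i) printf("%g\n", out[i]);
      free(bl); free(x); free(out);
      return 0;
    }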
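Similarly for the newly exposed sharp_legendre_roots(): an n-point Gauss-Legendre rule integrates polynomials up to degree 2n-1 exactly on [-1, 1], so n = 3 already handles x^4. A hypothetical check program, same assumptions as above (the exact integral is 2/5):

    #include <stdio.h>
    #include "sharp.h"

    int main(void)
    {
      enum { n = 3 };
      double x[n], w[n], sum = 0.0;
      sharp_legendre_roots(n, x, w);        /* nodes and weights on [-1, 1] */
      for (int i = 0; i < n; ++i)
        sum += w[i] * x[i]*x[i]*x[i]*x[i];  /* quadrature of f(x) = x^4 */
      printf("integral = %.15g (exact 0.4)\n", sum);
      return 0;
    }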