Merge pull request #4 from dagss/legtrans

Legendre transforms
2015-04-24 18:04:42 +02:00 · 2015-04-24 18:04:42 +02:00 · bd52e32a87
commit bd52e32a87
parent 9a839b8fd3 765831ea2b
21 changed files with 1883 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,9 @@
 *.o
+*.so
 #*
 *~
+*.pyc
+*.pyo

 /auto
 /autom4te.cache
@ -9,3 +12,5 @@
 /config/config.auto
 /configure
 /sharp_oracle.inc
+
+/python/libsharp/libsharp.c
--- a/15
+++ b/15
@ -53,3 +53,18 @@ perftest: compile_all
 	$(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \
 	$(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
 	$(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
+
+%.c: %.c.in
+# Only do this if the md5sum changed, in order to avoid Python and Jinja
+# dependency when not modifying the c.in file
+	grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@
+
+genclean:
+	rm libsharp/sharp_legendre.c || exit 0
+
+pytest:
+	rm python/libsharp/libsharp.so || exit 0
+	cd python && LIBSHARP_INCLUDE=$(INCDIR) LIBSHARP_LIB=$(LIBDIR) python setup.py build_ext --inplace
+	cd python && nosetests libsharp
+
+
--- a/fortran/sharp.f90
+++ b/fortran/sharp.f90
@ -103,12 +103,32 @@ module sharp
       type(c_ptr), intent(in)                      :: alm(*), map(*)
     end subroutine c_sharp_execute_mpi

+     ! Legendre transforms
+     subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_double) :: bl(lmax + 1), x(nx), out(nx)
+       real(c_double), optional :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform
+
+     subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform_s')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_float) :: bl(lmax + 1), x(nx), out(nx)
+       real(c_float), optional :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform_s
  end interface

  interface sharp_execute
     module procedure sharp_execute_d
  end interface

+  interface sharp_legendre_transform
+     module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
+  end interface sharp_legendre_transform
+
 contains
  ! alm info

@ -240,6 +260,25 @@ contains
   end if
  end subroutine sharp_execute_d

+  subroutine sharp_legendre_transform_d(bl, x, out)
+    use iso_c_binding
+    real(c_double) :: bl(:)
+    real(c_double) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+                                    x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_d
+
+  subroutine sharp_legendre_transform_s(bl, x, out)
+    use iso_c_binding
+    real(c_float) :: bl(:)
+    real(c_float) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+                                      x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_s


 end module
--- a/fortran/test_sharp.f90
+++ b/fortran/test_sharp.f90
@ -54,4 +54,31 @@ program test_sharp
  print *, 'DONE'
  call MPI_Finalize(ierr)

+  print *, 'LEGENDRE TRANSFORMS'
+
+  call test_legendre_transforms()
+
+contains
+  subroutine test_legendre_transforms()
+    integer, parameter :: lmax = 20, nx=10
+    real(c_double) :: bl(0:lmax)
+    real(c_double) :: x(nx), out(nx)
+    real(c_float) :: out_s(nx)
+    !--
+    integer :: l, i
+
+    do l = 0, lmax
+       bl(l) = 1.0 / real(l + 1, c_double)
+    end do
+    do i = 1, nx
+       x(i) = 1 / real(i, c_double)
+    end do
+    out = 0
+    call sharp_legendre_transform(bl, x, out)
+    print *, out
+    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
+    print *, out_s
+  end subroutine test_legendre_transforms
+
+
 end program test_sharp
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD)
 HDR_$(PKG):=$(SD)/*.h
 LIB_$(PKG):=$(LIBDIR)/libsharp.a
 BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
+LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o
 ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
 LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
 ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@ -39,5 +39,6 @@
 #include <complex.h>

 #include "sharp_lowlevel.h"
+#include "sharp_legendre.h"

 #endif
--- a/libsharp/sharp_legendre.c
+++ b/libsharp/sharp_legendre.c
--- a/libsharp/sharp_legendre.c.in
+++ b/libsharp/sharp_legendre.c.in
@ -0,0 +1,176 @@
+/*
+
+    NOTE NOTE NOTE
+
+    This file is edited in sharp_legendre.c.in which is then preprocessed.
+    Do not make manual  modifications to sharp_legendre.c.
+
+    NOTE NOTE NOTE
+
+*/
+
+
+/*
+ *  This file is part of libsharp.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file sharp_legendre.c.in
+ *
+ *  Copyright (C) 2015 University of Oslo
+ *  \author Dag Sverre Seljebotn
+ */
+
+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
+#include "sharp_legendre.h"
+#include "sharp_vecsupport.h"
+
+#include <malloc.h>
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+/*{ for cs in range(1, 7) }*/
+static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
+                                              {{scalar}} xarr[({{cs}}) * VLEN{{T}}],
+                                              {{scalar}} out[({{cs}}) * VLEN{{T}}]) {
+    /*{ for i in range(cs) }*/
+    Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
+    /*{ endfor }*/
+    Tv{{T}} W1, W2, b, R;
+    ptrdiff_t l;
+
+    /*{ for i in range(cs) }*/
+    x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
+    Pm1_{{i}} = vload{{T}}(1.0);
+    P_{{i}} = x{{i}};
+    b = vload{{T}}(*bl);
+    y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
+    /*{ endfor }*/
+    
+    b = vload{{T}}(*(bl + 1));
+    /*{ for i in range(cs) }*/
+    vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
+    /*{ endfor }*/
+
+    for (l = 2; l <= lmax; ++l) {
+        b = vload{{T}}(*(bl + l));
+        R = vload{{T}}(*(recfacs + l));
+        
+        /* 
+           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
+        */
+        /*{ for i in range(cs) }*/
+        Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
+        W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
+        W2 = W1;
+        W2 = vsub{{T}}(W2, Pm2_{{i}});
+        P_{{i}} = W1;
+        vfmaeq{{T}}(P_{{i}}, W2, R);
+        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
+        /*{ endfor }*/
+
+    }
+    /*{ for i in range(cs) }*/
+    vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
+    /*{ endfor }*/
+}
+/*{ endfor }*/
+/*{ endfor }*/
+
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
+    /* (l - 1) / l, for l >= 2 */
+    ptrdiff_t l;
+    r[0] = 0;
+    r[1] = 1;
+    for (l = 2; l <= lmax; ++l) {
+        r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
+    }
+}
+/*{ endfor }*/
+
+/*
+  Compute sum_l b_l P_l(x_i) for all i. 
+ */
+
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+void sharp_legendre_transform{{T}}({{scalar}} *bl,
+                                   {{scalar}} *recfac,
+                                   ptrdiff_t lmax,
+                                   {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
+    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
+    int compute_recfac;
+    ptrdiff_t i, j, len;
+
+    compute_recfac = (recfac == NULL);
+    if (compute_recfac) {
+        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
+        sharp_legendre_transform_recfac{{T}}(recfac, lmax);
+    }
+
+    for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;
+
+    for (i = 0; i < nx; i += LEN{{T}}) {
+        len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
+        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
+        switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
+          case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 1:
+          case 0:
+              legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+        }
+        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
+    }
+    if (compute_recfac) {
+        free(recfac);
+    }
+}
+/*{ endfor }*/
+
+#endif
--- a/libsharp/sharp_legendre.h
+++ b/libsharp/sharp_legendre.h
@ -0,0 +1,62 @@
+/*
+ *  This file is part of libsharp.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file sharp_legendre.h
+ *  Interface for the Legendre transform parts of the spherical transform library.
+ *
+ *  Copyright (C) 2015 University of Oslo
+ *  \author Dag Sverre Seljebotn
+ */
+
+#ifndef SHARP_LEGENDRE_H
+#define SHARP_LEGENDRE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NO_LEGENDRE
+
+void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                              double *out, ptrdiff_t nx);
+void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                float *out, ptrdiff_t nx);
+void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
+void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@ -40,23 +40,31 @@ typedef double Ts;
 #if (VLEN==1)

 typedef double Tv;
+typedef float Tv_s;
 typedef int Tm;

 #define vadd(a,b) ((a)+(b))
+#define vadd_s(a,b) ((a)+(b))
 #define vaddeq(a,b) ((a)+=(b))
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
 #define vsub(a,b) ((a)-(b))
+#define vsub_s(a,b) ((a)-(b))
 #define vsubeq(a,b) ((a)-=(b))
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
 #define vmul(a,b) ((a)*(b))
+#define vmul_s(a,b) ((a)*(b))
 #define vmuleq(a,b) ((a)*=(b))
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
 #define vfmaeq(a,b,c) ((a)+=(b)*(c))
+#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
 #define vfmseq(a,b,c) ((a)-=(b)*(c))
 #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
 #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
 #define vload(a) (a)
+#define vload_s(a) (a)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@ -64,6 +72,8 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
+#define vstoreu(p, a) (*(p)=a)
+#define vstoreu_s(p, a) (*(p)=a)

 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
@ -87,6 +97,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif

 typedef __m128d Tv;
+typedef __m128 Tv_s;
 typedef __m128d Tm;

 #if defined(__SSE4_1__)
@ -99,15 +110,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vone _mm_set1_pd(1.)

 #define vadd(a,b) _mm_add_pd(a,b)
+#define vadd_s(a,b) _mm_add_ps(a,b)
 #define vaddeq(a,b) a=_mm_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm_sub_pd(a,b)
+#define vsub_s(a,b) _mm_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm_mul_pd(a,b)
+#define vmul_s(a,b) _mm_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
 #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
@ -115,6 +130,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
+#define vload_s(a) _mm_set1_ps(a)
 #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
@ -126,17 +142,22 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
+#define vloadu(p) _mm_loadu_pd(p)
+#define vloadu_s(p) _mm_loadu_ps(p)
+#define vstoreu(p, v) _mm_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm_storeu_ps(p, v)

 #endif

 #if (VLEN==4)

 #include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
 #include <x86intrin.h>
 #endif

 typedef __m256d Tv;
+typedef __m256 Tv_s;
 typedef __m256d Tm;

 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
@ -144,21 +165,26 @@ typedef __m256d Tm;
 #define vone _mm256_set1_pd(1.)

 #define vadd(a,b) _mm256_add_pd(a,b)
+#define vadd_s(a,b) _mm256_add_ps(a,b)
 #define vaddeq(a,b) a=_mm256_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm256_sub_pd(a,b)
+#define vsub_s(a,b) _mm256_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm256_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm256_mul_pd(a,b)
+#define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#ifdef __FMA4__
+#if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
+#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@ -167,6 +193,7 @@ typedef __m256d Tm;
 #endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
+#define vload_s(a) _mm256_set1_ps(a)
 #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@ -179,6 +206,11 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)

+#define vloadu(p) _mm256_loadu_pd(p)
+#define vloadu_s(p) _mm256_loadu_ps(p)
+#define vstoreu(p, v) _mm256_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
+
 #endif

 #if (VLEN==8)
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@ -32,6 +32,8 @@
 #ifndef SHARP_VECUTIL_H
 #define SHARP_VECUTIL_H

+#ifndef VLEN
+
 #if (defined (__MIC__))
 #define VLEN 8
 #elif (defined (__AVX__))
@ -43,3 +45,19 @@
 #endif

 #endif
+
+#if (VLEN==1)
+#define VLEN_s 1
+#else
+#define VLEN_s (2*VLEN)
+#endif
+
+#ifndef USE_FMA4
+#ifdef __FMA4__
+#define USE_FMA4 1
+#else
+#define USE_FMA4 0
+#endif
+#endif
+
+#endif
--- a/python/fake_pyrex/Pyrex/Distutils/init.py
+++ b/python/fake_pyrex/Pyrex/Distutils/init.py
@ -0,0 +1 @@
+# work around broken setuptools monkey patching
--- a/python/fake_pyrex/Pyrex/Distutils/build_ext.py
+++ b/python/fake_pyrex/Pyrex/Distutils/build_ext.py
@ -0,0 +1 @@
+build_ext = "yes, it's there!"
--- a/python/fake_pyrex/Pyrex/init.py
+++ b/python/fake_pyrex/Pyrex/init.py
@ -0,0 +1 @@
+# work around broken setuptools monkey patching
--- a/python/fake_pyrex/README
+++ b/python/fake_pyrex/README
@ -0,0 +1,2 @@
+This directory is here to fool setuptools into building .pyx files
+even if Pyrex is not installed. See ../setup.py.
--- a/python/libsharp/init.py
+++ b/python/libsharp/init.py
@ -0,0 +1 @@
+from .libsharp import *
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@ -0,0 +1,43 @@
+import numpy as np
+
+cdef extern from "sharp.h":
+    ctypedef long ptrdiff_t
+
+    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                    float *out, ptrdiff_t nx)
+    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                                  double *out, ptrdiff_t nx)
+    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
+    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
+
+
+def legendre_transform(x, bl, out=None):
+    if out is None:
+        out = np.empty_like(x)
+    if out.shape[0] == 0:
+        return out
+    elif x.dtype == np.float64:
+        if bl.dtype != np.float64:
+            bl = bl.astype(np.float64)
+        return _legendre_transform(x, bl, out=out)
+    elif x.dtype == np.float32:
+        if bl.dtype != np.float32:
+            bl = bl.astype(np.float32)
+        return _legendre_transform_s(x, bl, out=out)
+    else:
+        raise ValueError("unsupported dtype")
+
+
+def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
+
+def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
--- a/python/libsharp/tests/init.py
+++ b/python/libsharp/tests/init.py
@ -0,0 +1 @@
+# empty
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@ -0,0 +1,41 @@
+import numpy as np
+from scipy.special import legendre
+import libsharp
+
+from numpy.testing import assert_allclose
+
+
+def check_legendre_transform(lmax, ntheta):
+    l = np.arange(lmax + 1)
+    if lmax >= 1:
+        sigma = -np.log(1e-3) / lmax / (lmax + 1)
+        bl = np.exp(-sigma*l*(l+1))
+        bl *= (2 * l + 1)
+    else:
+        bl = np.asarray([1], dtype=np.double)
+
+    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
+    x = np.cos(theta)
+
+    # Compute truth using scipy.special.legendre
+    P = np.zeros((ntheta, lmax + 1))
+    for l in range(lmax + 1):
+        P[:, l] = legendre(l)(x)
+    y0 = np.dot(P, bl)
+
+
+    # double-precision
+    y = libsharp.legendre_transform(x, bl)
+
+    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)
+
+    # single-precision
+    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
+    assert_allclose(y, y0, rtol=1e-5, atol=1e-5)
+
+
+def test_legendre_transform():
+    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
+    for ntheta in nthetas_to_try:
+        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
+            yield check_legendre_transform, lmax, ntheta
--- a/python/setup.py
+++ b/python/setup.py
@ -0,0 +1,76 @@
+#! /usr/bin/env python
+
+descr   = """Spherical Harmionic transforms package
+
+Python API for the libsharp spherical harmonic transforms library
+"""
+
+import os
+import sys
+
+DISTNAME            = 'libsharp'
+DESCRIPTION         = 'libsharp library for fast Spherical Harmonic Transforms'
+LONG_DESCRIPTION    = descr
+MAINTAINER          = 'Dag Sverre Seljebotn',
+MAINTAINER_EMAIL    = 'd.s.seljebotn@astro.uio.no',
+URL                 = 'http://sourceforge.net/projects/libsharp/'
+LICENSE             = 'GPL'
+DOWNLOAD_URL        = "http://sourceforge.net/projects/libsharp/"
+VERSION             = '0.1'
+
+# Add our fake Pyrex at the end of the Python search path
+# in order to fool setuptools into allowing compilation of
+# pyx files to C files. Importing Cython.Distutils then
+# makes Cython the tool of choice for this rather than
+# (the possibly nonexisting) Pyrex.
+project_path = os.path.split(__file__)[0]
+sys.path.append(os.path.join(project_path, 'fake_pyrex'))
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+import numpy as np
+
+libsharp = os.environ.get('LIBSHARP', None)
+libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
+libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
+
+if libsharp_include is None or libsharp_lib is None:
+    sys.stderr.write('Please set LIBSHARP environment variable to the install directly of libsharp, '
+                     'this script will refer to the lib and include sub-directories. Alternatively '
+                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
+    sys.exit(1)
+
+if __name__ == "__main__":
+    setup(install_requires = ['numpy'],
+          packages = find_packages(),
+          test_suite="nose.collector",
+          # Well, technically zipping the package will work, but since it's
+          # all compiled code it'll just get unzipped again at runtime, which
+          # is pointless:
+          zip_safe = False,
+          name = DISTNAME,
+          version = VERSION,
+          maintainer = MAINTAINER,
+          maintainer_email = MAINTAINER_EMAIL,
+          description = DESCRIPTION,
+          license = LICENSE,
+          url = URL,
+          download_url = DOWNLOAD_URL,
+          long_description = LONG_DESCRIPTION,
+          classifiers =
+            [ 'Development Status :: 3 - Alpha',
+              'Environment :: Console',
+              'Intended Audience :: Developers',
+              'Intended Audience :: Science/Research',
+              'License :: OSI Approved :: GNU General Public License (GPL)',
+              'Topic :: Scientific/Engineering'],
+          cmdclass = {"build_ext": build_ext},
+          ext_modules = [
+              Extension("libsharp.libsharp",
+                        ["libsharp/libsharp.pyx"],
+                        libraries=["sharp", "fftpack", "c_utils"],
+                        include_dirs=[libsharp_include],
+                        library_dirs=[libsharp_lib]
+                        ),
+              ],
+          )
--- a/runjinja.py
+++ b/runjinja.py
@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+"""
+Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT.
+"""
+
+import sys
+import hashlib
+from jinja2 import Template, Environment
+
+env = Environment(block_start_string='/*{',
+                  block_end_string='}*/',
+                  variable_start_string='{{',
+                  variable_end_string='}}')
+
+extra_vars = dict(len=len)
+input = sys.stdin.read()
+sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input).hexdigest())
+sys.stdout.write(env.from_string(input).render(**extra_vars))
				`@ -0,0 +1 @@`
				`# work around broken setuptools monkey patching`