diff --git a/Makefile b/Makefile
index 6476fa5..5a3184c 100644
--- a/Makefile
+++ b/Makefile
@@ -74,5 +74,7 @@ endif
 
 python: $(all_lib) hdrcopy $(CYTHON_MODULES)
 
+# the following test files are automatic; the sht wrapper test
+# must be run manually and requires MPI at the moment..
 pytest: python
-	cd python && nosetests --nocapture libsharp/tests/test_sht.py
+	cd python && nosetests --nocapture libsharp/tests/test_legendre_table.py libsharp/tests/test_legendre.py
diff --git a/libsharp/planck.make b/libsharp/planck.make
index 8bbd7e1..76d534f 100644
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD)
 HDR_$(PKG):=$(SD)/*.h
 LIB_$(PKG):=$(LIBDIR)/libsharp.a
 BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
+LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o sharp_legendre_table.o
 ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
 LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
 ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 4497dd4..6722aee 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -41,5 +41,6 @@
 #include "sharp_lowlevel.h"
 #include "sharp_legendre.h"
 #include "sharp_legendre_roots.h"
+#include "sharp_legendre_table.h"
 
 #endif
diff --git a/libsharp/sharp_legendre_table.c b/libsharp/sharp_legendre_table.c
new file mode 100644
index 0000000..5420899
--- /dev/null
+++ b/libsharp/sharp_legendre_table.c
@@ -0,0 +1,1470 @@
+/*
+
+This file originated as a concatenation of files from libpsht. Further refactoring
+could be carried out to make the code use libsharp conventions instead for SSE etc.;
+
+*/
+
+
+/*
+sse_utils.h
+*/
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sse_utils.h
+ *  SSE/SSE2/SSE3-related functionality
+ *
+ *  Copyright (C) 2010,2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+
+#if (defined(__SSE__))
+
+#include <xmmintrin.h>
+
+
+typedef __m128 v4sf; /* vector of 4 floats (SSE1) */
+
+typedef union {
+  float f[4];
+  v4sf v;
+} V4SF;
+
+static inline v4sf build_v4sf (float a, float b, float c, float d)
+  { return _mm_set_ps(d,c,b,a); }
+static inline void read_v4sf (v4sf v, float *a, float *b, float *c, float *d)
+  {
+  V4SF tmp;
+  tmp.v = v;
+  if (a) *a=tmp.f[0];
+  if (b) *b=tmp.f[1];
+  if (c) *c=tmp.f[2];
+  if (d) *d=tmp.f[3];
+  }
+
+
+#endif
+
+#if (defined(__SSE2__))
+
+#include <emmintrin.h>
+
+
+typedef __m128d v2df; /* vector of 2 doubles (SSE2) */
+
+typedef union {
+  double d[2];
+  v2df v;
+} V2DF;
+
+typedef struct {
+  v2df a,b;
+} v2df2;
+typedef struct {
+  V2DF a,b;
+} V2DF2;
+
+#define V2DF_SIGNMASK _mm_set1_pd(-0.0)
+
+static inline v2df build_v2df (double a, double b)
+  { return _mm_set_pd(b,a); }
+static inline void read_v2df (v2df v, double *a, double *b)
+  { _mm_store_sd(a,v); _mm_storeh_pd(b,v); }
+
+static inline int v2df_any_gt (v2df a, v2df b)
+  {
+  return (_mm_movemask_pd(_mm_cmpgt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))!=0);
+  }
+static inline int v2df_all_ge (v2df a, v2df b)
+  {
+  return (_mm_movemask_pd(_mm_cmplt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))==0);
+  }
+static inline V2DF to_V2DF (v2df x)
+  { V2DF X; X.v=x; return X; }
+static inline V2DF2 to_V2DF2 (v2df2 x)
+  { V2DF2 X; X.a.v=x.a; X.b.v=x.b; return X; }
+static inline v2df2 to_v2df2 (V2DF2 X)
+  { v2df2 x; x.a=X.a.v; x.b=X.b.v; return x; }
+static inline v2df2 zero_v2df2(void)
+  { v2df2 x; x.a=x.b=_mm_setzero_pd(); return x; }
+
+
+#endif
+
+#if (defined(__SSE3__))
+
+#include <pmmintrin.h>
+
+#endif
+
+
+
+/*
+ylmgen_c.h
+*/
+/*
+ *  This file is part of libpsht.
+ *
+ *  libpsht is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libpsht is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libpsht; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file ylmgen_c.h
+ *  Code for efficient calculation of Y_lm(phi=0,theta)
+ *
+ *  Copyright (C) 2005-2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+typedef double ylmgen_dbl2[2];
+typedef double ylmgen_dbl3[3];
+
+typedef struct
+  {
+  double cth_crit;
+  int mdist_crit;
+  /* members depending on m and m' */
+  int s, m, mlo, mhi, cosPow, sinPow;
+  long double prefactor;
+  ylmgen_dbl3 *fx;
+  int preMinus_p, preMinus_m;
+  } sylmgen_d;
+
+typedef struct
+  {
+  double fsmall, fbig, eps, cth_crit;
+  int lmax, mmax, smax, m_cur, ith, nth, m_crit, spinrec;
+  /*! The index of the first non-negligible Y_lm value. */
+  int *firstl;
+  double *cf, *mfac, *t1fac, *t2fac, *th, *cth, *sth, *logsth;
+  ylmgen_dbl2 *recfac;
+  double *lamfact;
+  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
+  double *ylm;
+  /*! Points to an array of size [0..lmax] containing the lambda_w
+      and lambda_x values for spin>0 transforms. */
+  ylmgen_dbl2 **lambda_wx;
+  long double *logsum, *lc05, *ls05;
+  double *flm1, *flm2, *xl;
+
+  sylmgen_d **sylm;
+
+  int *lwx_uptodate;
+  int ylm_uptodate;
+
+#ifdef __SSE2__
+  int ith1, ith2;
+  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
+  v2df *ylm_sse2;
+  /*! Points to an array of size [0..lmax] containing the lambda_w
+      and lambda_x values for spin>0 transforms. */
+  v2df2 **lambda_wx_sse2;
+  int *lwx_uptodate_sse2;
+  int ylm_uptodate_sse2;
+#endif
+
+  int recfac_uptodate, lamfact_uptodate;
+  } Ylmgen_C;
+
+/*! Creates a generator which will calculate Y_lm(theta,phi=0)
+    up to \a l=l_max and \a m=m_max. It may regard Y_lm whose absolute
+    magnitude is smaller than \a epsilon as zero. If \a spinrec is nonzero,
+    the spin-1 and spin-2 Y_lm will be calculated by recursion from the spin-0
+    ones, otherwise Wigner d matrix elements will be used. */
+static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
+   double epsilon);
+
+/*! Passes am array \a theta of \a nth colatitudes that will be used in
+    subsequent calls. The individual angles will be referenced by their
+    index in the array, starting with 0.
+    \note The array can be freed or reused after the call. */
+static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth);
+
+/*! Deallocates a generator previously initialised by Ylmgen_init(). */
+static void Ylmgen_destroy (Ylmgen_C *gen);
+
+/*! Prepares the object for the calculation at \a theta and \a m. */
+static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m);
+
+/*! Recalculates (if necessary) the Y_lm values. */
+static void Ylmgen_recalc_Ylm (Ylmgen_C *gen);
+/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
+    transforms. */
+/*static void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin);*/
+
+#ifdef __SSE2__
+/*! Prepares the object for the calculation at \a theta, \a theta2 and \a m. */
+static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m);
+
+/*! Recalculates (if necessary) the Y_lm values. */
+static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen);
+/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
+    transforms. */
+/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin);*/
+#endif
+
+/*! Returns a pointer to an array with lmax+1 entries containing normalisation
+    factors that must be applied to Y_lm values computed for \a spin with the
+    given \a spinrec flag. The array must be deallocated (using free()) by the
+    user. */
+/*static double *Ylmgen_get_norm (int lmax, int spin, int spinrec);*/
+
+
+/*
+ylmgen_c.c
+*/
+
+/*
+ *  This file is part of libpsht.
+ *
+ *  libpsht is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libpsht is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libpsht; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Code for efficient calculation of Y_lm(theta,phi=0)
+ *
+ *  Copyright (C) 2005-2011 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include "c_utils.h"
+
+enum { large_exponent2=90, minscale=-4, maxscale=11 };
+
+/*static void sylmgen_init (sylmgen_d *gen, const Ylmgen_C *ygen, int spin)
+  {
+  int i;
+  UTIL_ASSERT(spin>=1,"incorrect spin");
+  gen->s=spin;
+  gen->m=gen->mlo=gen->mhi=-1234567890;
+  ALLOC(gen->fx,ylmgen_dbl3,ygen->lmax+2);
+
+  for (i=0; i<ygen->lmax+2; ++i)
+    gen->fx[i][0]=gen->fx[i][1]=gen->fx[i][2]=0.;
+
+  gen->cth_crit = 2.;
+  gen->mdist_crit = ygen->lmax+1;
+  }*/
+
+static void sylmgen_destroy (sylmgen_d *gen)
+  { DEALLOC(gen->fx); }
+
+/*static void sylmgen_prepare (sylmgen_d *gen, const Ylmgen_C *ygen, int m_)
+  {
+  int mlo_, mhi_, ms_similar, l;
+
+  if (m_==gen->m) return;
+  UTIL_ASSERT(m_>=0,"incorrect m");
+
+  mlo_=m_; mhi_=gen->s;
+  if (mhi_<mlo_) SWAP(mhi_,mlo_,int);
+  ms_similar = ((gen->mhi==mhi_) && (gen->mlo==mlo_));
+
+  gen->m=m_;
+  gen->mlo = mlo_; gen->mhi = mhi_;
+
+  if (!ms_similar)
+    {
+    for (l=gen->mhi; l<ygen->lmax; ++l)
+      {
+      double t = ygen->flm1[l+gen->m]*ygen->flm1[l-gen->m]
+                *ygen->flm1[l+gen->s]*ygen->flm1[l-gen->s];
+      double lt = 2*l+1;
+      double l1 = l+1;
+      gen->fx[l+1][0]=l1*lt*t;
+      gen->fx[l+1][1]=gen->m*gen->s*ygen->xl[l]*ygen->xl[l+1];
+      t = ygen->flm2[l+gen->m]*ygen->flm2[l-gen->m]
+         *ygen->flm2[l+gen->s]*ygen->flm2[l-gen->s];
+      gen->fx[l+1][2]=t*l1*ygen->xl[l];
+      }
+    gen->prefactor = 0.5L*(ygen->logsum[2*gen->mhi]
+      -ygen->logsum[gen->mhi+gen->mlo]-ygen->logsum[gen->mhi-gen->mlo]);
+    }
+
+  gen->preMinus_p = gen->preMinus_m = 0;
+  if (gen->mhi==gen->m)
+    {
+    gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s;
+    gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1);
+    }
+  else
+    {
+    gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m;
+    gen->preMinus_m = ((gen->mhi+gen->m)&1);
+    }
+    }*/
+
+/*static void sylmgen_recalc (sylmgen_d *gen, const Ylmgen_C *ygen, int ith,
+  ylmgen_dbl2 *res, int *firstl)
+  {
+  const double ln2     = 0.6931471805599453094172321214581766;
+  const double inv_ln2 = 1.4426950408889634073599246810018921;
+  int l=gen->mhi;
+  int lmax = ygen->lmax;
+  ylmgen_dbl3 *fy = gen->fx;
+  const double fsmall = ygen->fsmall, fbig = ygen->fbig, eps = ygen->eps;
+  const double cth = ygen->cth[ith];
+  long double logvalp = inv_ln2*(gen->prefactor
+    + ygen->lc05[ith]*gen->cosPow + ygen->ls05[ith]*gen->sinPow);
+  long double logvalm = inv_ln2*(gen->prefactor
+    + ygen->lc05[ith]*gen->sinPow + ygen->ls05[ith]*gen->cosPow);
+  int scalep = (int)(logvalp/large_exponent2)-minscale;
+  int scalem = (int)(logvalm/large_exponent2)-minscale;
+  double rec1p=0., rec1m=0.;
+  double rec2p = exp(ln2*(double)(logvalp-(scalep+minscale)*large_exponent2));
+  double rec2m = exp(ln2*(double)(logvalm-(scalem+minscale)*large_exponent2));
+  double corfacp,corfacm;
+  double tp,tm;
+
+  if ((abs(gen->m-gen->s)>=gen->mdist_crit)&&(fabs(cth)>=gen->cth_crit))
+    { *firstl=ygen->lmax+1; return; }
+
+  if (gen->preMinus_p)
+    rec2p=-rec2p;
+  if (gen->preMinus_m)
+    rec2m=-rec2m;
+  if (gen->s&1)
+    rec2p=-rec2p;
+
+  / iterate until we reach the realm of IEEE numbers /
+  while((scalem<0)&&(scalep<0))
+    {
+    if (++l>lmax) break;
+    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
+    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
+    if (++l>lmax) break;
+    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
+    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
+
+    while (fabs(rec2p)>fbig)
+      { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
+    while (fabs(rec2m)>fbig)
+      { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
+    }
+
+  corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
+  corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
+
+  if (l<=lmax)
+    {
+    while (1)
+      {
+      if ((fabs(rec2p*corfacp)>eps) || (fabs(rec2m*corfacm)>eps))
+        break;
+      if (++l>lmax) break;
+      rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
+      rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
+      if ((fabs(rec1p*corfacp)>eps) || (fabs(rec1m*corfacm)>eps))
+        { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
+      if (++l>lmax) break;
+      rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
+      rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
+
+      if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
+        {
+        while (fabs(rec2p)>fbig)
+          { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
+        while (fabs(rec2m)>fbig)
+          { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
+        corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
+        corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
+        }
+      }
+    }
+
+  *firstl=l;
+  if (l>lmax)
+    {
+    gen->mdist_crit=abs(gen->m-gen->s);
+    gen->cth_crit= fabs(cth);
+    return;
+    }
+
+  tp = rec2p*corfacp; tm = rec2m*corfacm;
+  res[l][0]=tp+tm;
+  res[l][1]=tm-tp;
+
+  while (1)
+    {
+    if ((fabs(tp)>eps) && (fabs(tm)>eps))
+      break;
+    if (++l>lmax) break;
+    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
+    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
+    tp=rec1p*corfacp; tm=rec1m*corfacm;
+    res[l][0]=tp+tm; res[l][1]=tm-tp;
+    if ((fabs(tp)>eps) && (fabs(tm)>eps))
+      { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
+    if (++l>lmax) break;
+    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
+    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
+    tp=rec2p*corfacp; tm=rec2m*corfacm;
+    res[l][0]=tp+tm; res[l][1]=tm-tp;
+
+    if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
+      {
+      while (fabs(rec2p)>fbig)
+        { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
+      while (fabs(rec2m)>fbig)
+        { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
+      corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
+      corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
+      }
+    }
+
+  rec1p *= corfacp; rec2p *= corfacp;
+  rec1m *= corfacm; rec2m *= corfacm;
+
+  for (;l<lmax-1;l+=2)
+    {
+    rec1p = (cth - fy[l+1][1])*fy[l+1][0]*rec2p - fy[l+1][2]*rec1p;
+    rec1m = (cth + fy[l+1][1])*fy[l+1][0]*rec2m - fy[l+1][2]*rec1m;
+    res[l+1][0] = rec1p+rec1m; res[l+1][1] = rec1m-rec1p;
+    rec2p = (cth - fy[l+2][1])*fy[l+2][0]*rec1p - fy[l+2][2]*rec2p;
+    rec2m = (cth + fy[l+2][1])*fy[l+2][0]*rec1m - fy[l+2][2]*rec2m;
+    res[l+2][0] = rec2p+rec2m; res[l+2][1] = rec2m-rec2p;
+    }
+  while (1)
+    {
+    if (++l>lmax) break;
+    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
+    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
+    res[l][0] = rec1p+rec1m; res[l][1] = rec1m-rec1p;
+    if (++l>lmax) break;
+    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
+    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
+    res[l][0] = rec2p+rec2m; res[l][1] = rec2m-rec2p;
+    }
+    }*/
+
+#ifdef __SSE2__
+
+#define ADVANCE(L,ap,am,bp,bm) \
+  { \
+  v2df f0=_mm_set1_pd(fy[L][0]), \
+       f1=_mm_set1_pd(fy[L][1]), \
+       f2=_mm_set1_pd(fy[L][2]); \
+  ap = _mm_sub_pd( \
+       _mm_mul_pd(_mm_sub_pd(cth,f1),_mm_mul_pd(f0,bp)), \
+       _mm_mul_pd(f2,ap)); \
+  am = _mm_sub_pd( \
+       _mm_mul_pd(_mm_add_pd(cth,f1),_mm_mul_pd(f0,bm)), \
+       _mm_mul_pd(f2,am)); \
+  }
+
+#define RENORMX(r1,r2,corf,sca,scb) \
+  do \
+    { \
+    double rec1a, rec1b, rec2a, rec2b, corfaca, corfacb; \
+    read_v2df (r1, &rec1a, &rec1b); read_v2df (r2, &rec2a, &rec2b); \
+    read_v2df (corf, &corfaca, &corfacb); \
+    while (fabs(rec2a)>fbig) \
+      { \
+      rec1a*=fsmall; rec2a*=fsmall; ++sca; \
+      corfaca = (sca<0) ? 0. : ygen->cf[sca]; \
+      } \
+    while (fabs(rec2b)>fbig) \
+      { \
+      rec1b*=fsmall; rec2b*=fsmall; ++scb; \
+      corfacb = (scb<0) ? 0. : ygen->cf[scb]; \
+      } \
+    r1=build_v2df(rec1a,rec1b); r2=build_v2df(rec2a,rec2b); \
+    corf=build_v2df(corfaca,corfacb); \
+    } \
+  while(0)
+
+#define RENORM \
+  RENORMX(rec1p,rec2p,corfacp,scale1p,scale2p); \
+  RENORMX(rec1m,rec2m,corfacm,scale1m,scale2m)
+
+/*static void sylmgen_recalc_sse2 (sylmgen_d *gen, const Ylmgen_C *ygen,
+  int ith1, int ith2, v2df2 *res, int *firstl)
+  {
+  const double ln2     = 0.6931471805599453094172321214581766;
+  const double inv_ln2 = 1.4426950408889634073599246810018921;
+  int l=gen->mhi;
+  int lmax = ygen->lmax;
+  ylmgen_dbl3 *fy = gen->fx;
+  const double fbig=ygen->fbig, fsmall=ygen->fsmall;
+  v2df eps2    = build_v2df(ygen->eps,ygen->eps);
+  const double cth1=ygen->cth[ith1], cth2=ygen->cth[ith2];
+  v2df cth = build_v2df(cth1,cth2);
+  long double
+    logval1p = inv_ln2*(gen->prefactor
+             + ygen->lc05[ith1]*gen->cosPow + ygen->ls05[ith1]*gen->sinPow),
+    logval2p = inv_ln2*(gen->prefactor
+             + ygen->lc05[ith2]*gen->cosPow + ygen->ls05[ith2]*gen->sinPow),
+    logval1m = inv_ln2*(gen->prefactor
+             + ygen->lc05[ith1]*gen->sinPow + ygen->ls05[ith1]*gen->cosPow),
+    logval2m = inv_ln2*(gen->prefactor
+             + ygen->lc05[ith2]*gen->sinPow + ygen->ls05[ith2]*gen->cosPow);
+
+  int scale1p = (int)(logval1p/large_exponent2)-minscale,
+      scale2p = (int)(logval2p/large_exponent2)-minscale,
+      scale1m = (int)(logval1m/large_exponent2)-minscale,
+      scale2m = (int)(logval2m/large_exponent2)-minscale;
+
+  v2df rec1p =_mm_setzero_pd(), rec1m=_mm_setzero_pd();
+  v2df rec2p = build_v2df(
+    exp(ln2*(double)(logval1p-(scale1p+minscale)*large_exponent2)),
+    exp(ln2*(double)(logval2p-(scale2p+minscale)*large_exponent2)));
+  v2df rec2m = build_v2df(
+    exp(ln2*(double)(logval1m-(scale1m+minscale)*large_exponent2)),
+    exp(ln2*(double)(logval2m-(scale2m+minscale)*large_exponent2)));
+  v2df corfacp=build_v2df((scale1p<0) ? 0. : ygen->cf[scale1p],
+                          (scale2p<0) ? 0. : ygen->cf[scale2p]),
+       corfacm=build_v2df((scale1m<0) ? 0. : ygen->cf[scale1m],
+                          (scale2m<0) ? 0. : ygen->cf[scale2m]);
+  v2df tp,tm;
+
+  if ((abs(gen->m-gen->s)>=gen->mdist_crit)
+     &&(fabs(ygen->cth[ith1])>=gen->cth_crit)
+     &&(fabs(ygen->cth[ith2])>=gen->cth_crit))
+    { *firstl=ygen->lmax+1; return; }
+
+  if (gen->preMinus_p)
+    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
+  if (gen->preMinus_m)
+    rec2m = _mm_xor_pd (rec2m,V2DF_SIGNMASK); / negate /
+  if (gen->s&1)
+    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
+
+  / iterate until we reach the realm of IEEE numbers /
+  while((scale1m<0)&&(scale1p<0)&&(scale2m<0)&&(scale2p<0))
+    {
+    if (++l>lmax) break;
+    ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
+    if (++l>lmax) break;
+    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
+
+    RENORM;
+    }
+
+  if (l<=lmax)
+    {
+    while (1)
+      {
+      if (v2df_any_gt(_mm_mul_pd(rec2p,corfacp),eps2) ||
+          v2df_any_gt(_mm_mul_pd(rec2m,corfacm),eps2))
+        break;
+      if (++l>lmax) break;
+      ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
+      if (v2df_any_gt(_mm_mul_pd(rec1p,corfacp),eps2) ||
+          v2df_any_gt(_mm_mul_pd(rec1m,corfacm),eps2))
+        { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
+      if (++l>lmax) break;
+      ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
+
+      RENORM;
+      }
+    }
+
+  *firstl=l;
+  if (l>lmax)
+    {
+    gen->mdist_crit=abs(gen->m-gen->s);
+    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
+    return;
+    }
+  tp = _mm_mul_pd(rec2p,corfacp); tm = _mm_mul_pd(rec2m,corfacm);
+  res[l].a=_mm_add_pd(tp,tm);
+  res[l].b=_mm_sub_pd(tm,tp);
+
+  while (1)
+    {
+    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
+      break;
+    if (++l>lmax) break;
+    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
+    tp=_mm_mul_pd(rec1p,corfacp); tm=_mm_mul_pd(rec1m,corfacm);
+    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
+    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
+      { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
+    if (++l>lmax) break;
+    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
+    tp=_mm_mul_pd(rec2p,corfacp); tm=_mm_mul_pd(rec2m,corfacm);
+    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
+
+    RENORM;
+    }
+
+  rec1p = _mm_mul_pd(rec1p,corfacp); rec2p = _mm_mul_pd(rec2p,corfacp);
+  rec1m = _mm_mul_pd(rec1m,corfacm); rec2m = _mm_mul_pd(rec2m,corfacm);
+
+  for (;l<lmax-1;l+=2)
+    {
+    ADVANCE(l+1,rec1p,rec1m,rec2p,rec2m)
+    res[l+1].a=_mm_add_pd(rec1p,rec1m); res[l+1].b=_mm_sub_pd(rec1m,rec1p);
+    ADVANCE(l+2,rec2p,rec2m,rec1p,rec1m)
+    res[l+2].a=_mm_add_pd(rec2p,rec2m); res[l+2].b=_mm_sub_pd(rec2m,rec2p);
+    }
+  while (1)
+    {
+    if (++l>lmax) break;
+    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
+    res[l].a=_mm_add_pd(rec1p,rec1m); res[l].b=_mm_sub_pd(rec1m,rec1p);
+    if (++l>lmax) break;
+    ADVANCE(l,rec2p,rec2m,rec1p,rec1m)
+    res[l].a=_mm_add_pd(rec2p,rec2m); res[l].b=_mm_sub_pd(rec2m,rec2p);
+    }
+  }
+*/
+#endif
+
+static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
+  double epsilon)
+  {
+  int m;
+  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
+  const double inv_ln2 = 1.4426950408889634073599246810018921;
+
+  gen->fsmall = ldexp(1.,-large_exponent2);
+  gen->fbig   = ldexp(1., large_exponent2);
+  gen->eps = epsilon;
+  gen->cth_crit = 2.;
+  gen->ith = -1;
+  gen->nth = 0;
+  gen->lmax = l_max;
+  gen->mmax = m_max;
+  gen->smax = s_max;
+  gen->spinrec = spinrec;
+  gen->m_cur = -1;
+  gen->m_crit = gen->mmax+1;
+  gen->firstl = RALLOC(int,gen->smax+1);
+  for (m=0; m<=gen->smax; ++m) gen->firstl[m]=-1;
+  gen->cf = RALLOC(double,maxscale-minscale+1);
+  for (m=0; m<(maxscale-minscale+1); ++m)
+    gen->cf[m] = ldexp(1.,(m+minscale)*large_exponent2);
+  gen->recfac = RALLOC(ylmgen_dbl2,gen->lmax+1);
+  gen->mfac = RALLOC(double,gen->mmax+1);
+  gen->mfac[0] = 1;
+  for (m=1; m<=gen->mmax; ++m)
+    gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
+  for (m=0; m<=gen->mmax; ++m)
+    gen->mfac[m] = inv_ln2*log(inv_sqrt4pi*gen->mfac[m]);
+
+  gen->t1fac = RALLOC(double,gen->lmax+1);
+  for (m=0; m<=gen->lmax; ++m)
+    gen->t1fac[m] = sqrt(4.*(m+1)*(m+1)-1.);
+  gen->t2fac = RALLOC(double,gen->lmax+gen->mmax+1);
+  for (m=0; m<=gen->lmax+gen->mmax; ++m)
+    gen->t2fac[m] = 1./sqrt(m+1.);
+
+  gen->lamfact = RALLOC(double,gen->lmax+1);
+  gen->ylm = RALLOC(double,gen->lmax+1);
+  ALLOC(gen->lambda_wx,ylmgen_dbl2 *,gen->smax+1);
+  for (m=0; m<=gen->smax; ++m)
+    gen->lambda_wx[m]=NULL;
+
+  gen->sylm = RALLOC(sylmgen_d *,gen->smax+1);
+  for (m=0; m<=gen->smax; ++m)
+    gen->sylm[m]=NULL;
+
+  gen->ylm_uptodate = 0;
+  gen->lwx_uptodate = RALLOC(int,gen->smax+1);
+  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
+  gen->recfac_uptodate = 0;
+  gen->lamfact_uptodate = 0;
+
+  gen->th = gen->cth = gen->sth = gen->logsth = NULL;
+
+#ifdef __SSE2__
+  gen->ith1 = gen->ith2 = -1;
+  gen->ylm_sse2 = RALLOC(v2df,gen->lmax+1);
+  ALLOC(gen->lambda_wx_sse2,v2df2 *,gen->smax+1);
+  for (m=0; m<=gen->smax; ++m)
+    gen->lambda_wx_sse2[m]=NULL;
+  gen->ylm_uptodate_sse2 = 0;
+  gen->lwx_uptodate_sse2 = RALLOC(int,gen->smax+1);
+  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
+#endif
+
+  ALLOC(gen->logsum,long double,2*gen->lmax+1);
+  gen->lc05 = gen->ls05 = NULL;
+  ALLOC(gen->flm1,double,2*gen->lmax+1);
+  ALLOC(gen->flm2,double,2*gen->lmax+1);
+  ALLOC(gen->xl,double,gen->lmax+1);
+
+  gen->logsum[0] = 0.;
+  for (m=1; m<2*gen->lmax+1; ++m)
+    gen->logsum[m] = gen->logsum[m-1]+logl((long double)m);
+  for (m=0; m<2*gen->lmax+1; ++m)
+    {
+    gen->flm1[m] = sqrt(1./(m+1.));
+    gen->flm2[m] = sqrt(m/(m+1.));
+    }
+
+  gen->xl[0]=0;
+  for (m=1; m<gen->lmax+1; ++m) gen->xl[m]=1./m;
+  }
+
+static void Ylmgen_destroy (Ylmgen_C *gen)
+  {
+  int m;
+
+  DEALLOC(gen->firstl);
+  DEALLOC(gen->cf);
+  DEALLOC(gen->recfac);
+  DEALLOC(gen->mfac);
+  DEALLOC(gen->t1fac);
+  DEALLOC(gen->t2fac);
+  DEALLOC(gen->lamfact);
+  DEALLOC(gen->ylm);
+  DEALLOC(gen->lwx_uptodate);
+  for (m=0; m<=gen->smax; ++m)
+    DEALLOC(gen->lambda_wx[m]);
+  DEALLOC(gen->lambda_wx);
+  for (m=0; m<=gen->smax; ++m)
+    if (gen->sylm[m])
+      {
+      sylmgen_destroy (gen->sylm[m]);
+      DEALLOC(gen->sylm[m]);
+      }
+  DEALLOC(gen->sylm);
+  DEALLOC(gen->th);
+  DEALLOC(gen->cth);
+  DEALLOC(gen->sth);
+  DEALLOC(gen->logsth);
+  DEALLOC(gen->logsum);
+  DEALLOC(gen->lc05);
+  DEALLOC(gen->ls05);
+  DEALLOC(gen->flm1);
+  DEALLOC(gen->flm2);
+  DEALLOC(gen->xl);
+#ifdef __SSE2__
+  DEALLOC(gen->ylm_sse2);
+  for (m=0; m<=gen->smax; ++m)
+    DEALLOC(gen->lambda_wx_sse2[m]);
+  DEALLOC(gen->lambda_wx_sse2);
+  DEALLOC(gen->lwx_uptodate_sse2);
+#endif
+  }
+
+static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth)
+  {
+  const double inv_ln2 = 1.4426950408889634073599246810018921;
+  int m;
+  DEALLOC(gen->th);
+  DEALLOC(gen->cth);
+  DEALLOC(gen->sth);
+  DEALLOC(gen->logsth);
+  DEALLOC(gen->lc05);
+  DEALLOC(gen->ls05);
+  gen->th = RALLOC(double,nth);
+  gen->cth = RALLOC(double,nth);
+  gen->sth = RALLOC(double,nth);
+  gen->logsth = RALLOC(double,nth);
+  gen->lc05 = RALLOC(long double,nth);
+  gen->ls05 = RALLOC(long double,nth);
+  for (m=0; m<nth; ++m)
+    {
+    const double pi = 3.141592653589793238462643383279502884197;
+    double th=theta[m];
+    UTIL_ASSERT ((th>=0.)&&(th<=pi),"bad theta angle specified");
+    /* tiny adjustments to make sure cos and sin (theta/2) are positive */
+    if (th==0.) th=1e-16;
+    if (ABSAPPROX(th,pi,1e-15)) th=pi-1e-15;
+    gen->th[m] = th;
+    gen->cth[m] = cos(th);
+    gen->sth[m] = sin(th);
+    gen->logsth[m] = inv_ln2*log(gen->sth[m]);
+    gen->lc05[m]=logl(cosl(0.5L*th));
+    gen->ls05[m]=logl(sinl(0.5L*th));
+    }
+
+  gen->nth = nth;
+  gen->ith = -1;
+#ifdef __SSE2__
+  gen->ith1 = gen->ith2 = -1;
+#endif
+  }
+
+static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m)
+  {
+  if ((ith==gen->ith) && (m==gen->m_cur)) return;
+
+  gen->ylm_uptodate = 0;
+  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
+
+  gen->ith = ith;
+
+  if (m!=gen->m_cur)
+    {
+    gen->recfac_uptodate = 0;
+    gen->lamfact_uptodate = 0;
+    gen->m_cur = m;
+    }
+  }
+
+static void Ylmgen_recalc_recfac (Ylmgen_C *gen)
+  {
+  double f_old=1;
+  int l, m;
+
+  if (gen->recfac_uptodate) return;
+  gen->recfac_uptodate = 1;
+
+  m = gen->m_cur;
+  for (l=m; l<=gen->lmax; ++l)
+    {
+    gen->recfac[l][0] = gen->t1fac[l]*gen->t2fac[l+m]*gen->t2fac[l-m];
+    gen->recfac[l][1] = gen->recfac[l][0]/f_old;
+    f_old = gen->recfac[l][0];
+    }
+  }
+
+/*static void Ylmgen_recalc_lamfact (Ylmgen_C *gen)
+  {
+  int l, m;
+
+  if (gen->lamfact_uptodate) return;
+  gen->lamfact_uptodate = 1;
+
+  m = gen->m_cur;
+  gen->lamfact[m] = 0;
+  for (l=m+1; l<=gen->lmax; ++l)
+    gen->lamfact[l] = sqrt((2*l+1.)/(2*l-1.) * (l*l-m*m));
+    }*/
+
+#define RENORMALIZE_SCALAR \
+  do \
+    { \
+    while (fabs(lam_2)>fbig) \
+      { lam_1*=fsmall; lam_2*=fsmall; ++scale; } \
+    corfac = (scale<0) ? 0. : gen->cf[scale]; \
+    } \
+  while(0)
+
+static void Ylmgen_recalc_Ylm (Ylmgen_C *gen)
+  {
+  const double ln2 = 0.6931471805599453094172321214581766;
+
+  double logval,lam_1,lam_2,corfac;
+  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
+  ylmgen_dbl2 *recfac = gen->recfac;
+  int lmax=gen->lmax;
+  int scale,l;
+  int m = gen->m_cur;
+  double cth=gen->cth[gen->ith], sth=gen->sth[gen->ith];
+  double *result = gen->ylm;
+
+  if (gen->ylm_uptodate) return;
+  gen->ylm_uptodate=1;
+
+  if (((m>=gen->m_crit)&&(fabs(cth)>=gen->cth_crit)) || ((m>0)&&(sth==0)))
+    { gen->firstl[0]=gen->lmax+1; return; }
+
+  Ylmgen_recalc_recfac(gen);
+
+  logval = gen->mfac[m];
+  if (m>0) logval += m*gen->logsth[gen->ith];
+  scale = (int) (logval/large_exponent2)-minscale;
+  corfac = (scale<0) ? 0. : gen->cf[scale];
+
+  lam_1 = 0;
+  lam_2 = exp(ln2*(logval-(scale+minscale)*large_exponent2));
+  if (m&1) lam_2 = -lam_2;
+
+  l=m;
+  if (scale<0)
+    {
+    while (1)
+      {
+      if (++l>lmax) break;
+      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
+      if (++l>lmax) break;
+      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
+      if (fabs(lam_2)>fbig)
+        {
+        RENORMALIZE_SCALAR;
+        if (scale>=0) break;
+        }
+      }
+    }
+
+  lam_1*=corfac;
+  lam_2*=corfac;
+
+  if (l<=lmax)
+    {
+    while (1)
+      {
+      if (fabs(lam_2)>eps) break;
+      if (++l>lmax) break;
+      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
+      if (fabs(lam_1)>eps)
+        { double x=lam_1; lam_1=lam_2; lam_2=x; break; }
+      if (++l>lmax) break;
+      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
+      }
+    }
+
+  gen->firstl[0]=l;
+  if (l>lmax)
+    { gen->m_crit=m; gen->cth_crit=fabs(cth); return; }
+
+  for(;l<lmax-3;l+=4)
+    {
+    result[l]=lam_2;
+    lam_1 = cth*lam_2*recfac[l][0] - lam_1*recfac[l][1];
+    result[l+1] = lam_1;
+    lam_2 = cth*lam_1*recfac[l+1][0] - lam_2*recfac[l+1][1];
+    result[l+2]=lam_2;
+    lam_1 = cth*lam_2*recfac[l+2][0] - lam_1*recfac[l+2][1];
+    result[l+3] = lam_1;
+    lam_2 = cth*lam_1*recfac[l+3][0] - lam_2*recfac[l+3][1];
+    }
+
+  while (1)
+    {
+    result[l]=lam_2;
+    if (++l>lmax) break;
+    lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
+    result[l] = lam_1;
+    if (++l>lmax) break;
+    lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
+    }
+  }
+
+
+/*
+static void Ylmgen_recalc_lambda_wx1 (Ylmgen_C *gen)
+  {
+  if (gen->lwx_uptodate[1]) return;
+  Ylmgen_recalc_Ylm(gen);
+  gen->firstl[1] = gen->firstl[0];
+  if (gen->firstl[1]>gen->lmax) return;
+  Ylmgen_recalc_lamfact(gen);
+  gen->lwx_uptodate[1] = 1;
+
+  {
+  double cth=gen->cth[gen->ith];
+  double xsth=1./(gen->sth[gen->ith]);
+  double m=gen->m_cur;
+  double m_on_sth = m*xsth;
+  double lam_lm=0;
+  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[1];
+  int l;
+  double ell;
+  for (ell=l=gen->firstl[1]; l<=gen->lmax; ++l, ell+=1.)
+    {
+    double lam_lm1m=lam_lm;
+    lam_lm=gen->ylm[l];
+    lambda_wx[l][0] = xsth*(gen->lamfact[l]*lam_lm1m - ell*cth*lam_lm);
+    lambda_wx[l][1] = m_on_sth*lam_lm;
+    }
+  }
+  }
+
+static void Ylmgen_recalc_lambda_wx2 (Ylmgen_C *gen)
+  {
+  if (gen->lwx_uptodate[2]) return;
+  Ylmgen_recalc_Ylm(gen);
+  gen->firstl[2] = gen->firstl[0];
+  if (gen->firstl[2]>gen->lmax) return;
+  Ylmgen_recalc_lamfact(gen);
+  gen->lwx_uptodate[2] = 1;
+
+  {
+  double cth=gen->cth[gen->ith];
+  double sth=gen->sth[gen->ith];
+  double m=gen->m_cur;
+  double one_on_s2 = 1./(sth*sth);
+  double two_on_s2 = 2*one_on_s2;
+  double two_c_on_s2 = cth * two_on_s2;
+  double m2 = m*m;
+  double two_m_on_s2 = m*two_on_s2;
+  double lam_lm=0;
+  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[2];
+  int l;
+  double ell;
+  for (ell=l=gen->firstl[2]; l<=gen->lmax; ++l, ell+=1.)
+    {
+    double lam_lm1m=lam_lm;
+    lam_lm=gen->ylm[l];
+    {
+    const double t1  = lam_lm1m*gen->lamfact[l];
+    const double a_w = (m2-ell)*two_on_s2 - ell*(ell-1.);
+    const double a_x = cth*(ell-1.)*lam_lm;
+    lambda_wx[l][0] = a_w*lam_lm + t1*two_c_on_s2;
+    lambda_wx[l][1] = two_m_on_s2 * (t1-a_x);
+    }
+    }
+  }
+  }
+
+void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin)
+  {
+  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
+    "invalid spin in Ylmgen_recalc_lambda_wx");
+
+  if (!gen->lambda_wx[spin])
+    gen->lambda_wx[spin]=RALLOC(ylmgen_dbl2,gen->lmax+1);
+
+  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1(gen); return; }
+  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2(gen); return; }
+
+  if (!gen->sylm[spin])
+    {
+    gen->sylm[spin]=RALLOC(sylmgen_d,1);
+    sylmgen_init(gen->sylm[spin],gen,spin);
+    }
+  if (gen->lwx_uptodate[spin]) return;
+  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
+  sylmgen_recalc(gen->sylm[spin],gen,gen->ith,gen->lambda_wx[spin],
+    &gen->firstl[spin]);
+  gen->lwx_uptodate[spin] = 1;
+  }
+*/
+#ifdef __SSE2__
+
+static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m)
+  {
+  if ((ith1==gen->ith1) && (ith2==gen->ith2) && (m==gen->m_cur)) return;
+
+  gen->ylm_uptodate_sse2 = 0;
+  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
+
+  gen->ith1 = ith1; gen->ith2 = ith2;
+
+  if (m!=gen->m_cur)
+    {
+    gen->recfac_uptodate = gen->lamfact_uptodate = 0;
+    gen->m_cur = m;
+    }
+  }
+
+
+#define RENORMALIZE \
+  do \
+    { \
+    double lam1a, lam1b, lam2a, lam2b, corfaca, corfacb; \
+    read_v2df (lam_1, &lam1a, &lam1b); read_v2df (lam_2, &lam2a, &lam2b); \
+    read_v2df (corfac, &corfaca, &corfacb); \
+    while (fabs(lam2a)>fbig) \
+      { \
+      lam1a*=fsmall; lam2a*=fsmall; ++scale1; \
+      corfaca = (scale1<0) ? 0. : gen->cf[scale1]; \
+      } \
+    while (fabs(lam2b)>fbig) \
+      { \
+      lam1b*=fsmall; lam2b*=fsmall; ++scale2; \
+      corfacb = (scale2<0) ? 0. : gen->cf[scale2]; \
+      } \
+    lam_1=build_v2df(lam1a,lam1b); lam_2=build_v2df(lam2a,lam2b); \
+    corfac=build_v2df(corfaca,corfacb); \
+    } \
+  while(0)
+#define GETPRE(prea,preb,lv) \
+  { \
+  prea=_mm_mul_pd(_mm_set1_pd(recfac[lv][0]),cth); \
+  preb=_mm_set1_pd(recfac[lv][1]); \
+  }
+#define NEXTSTEP(prea,preb,prec,pred,reca,recb,lv) \
+  { \
+  preb = _mm_mul_pd(preb,reca); \
+  prea = _mm_mul_pd(prea,recb); \
+  prec = _mm_set1_pd(recfac[lv][0]); \
+  pred = _mm_set1_pd(recfac[lv][1]); \
+  reca = _mm_sub_pd(prea,preb); \
+  prec = _mm_mul_pd(cth,prec); \
+  }
+
+static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen)
+  {
+  const double ln2 = 0.6931471805599453094172321214581766;
+
+  v2df lam_1,lam_2,corfac;
+  double logval1,logval2;
+  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
+  v2df eps2=build_v2df(eps,eps);
+  v2df fbig2=build_v2df(fbig,fbig);
+  ylmgen_dbl2 *recfac = gen->recfac;
+  int lmax=gen->lmax;
+  int scale1,scale2,l;
+  int m = gen->m_cur;
+  double cth1=gen->cth[gen->ith1], cth2=gen->cth[gen->ith2];
+  v2df cth=build_v2df(cth1,cth2);
+  v2df *result = gen->ylm_sse2;
+  v2df pre0,pre1,pre2,pre3;
+
+  if (gen->ylm_uptodate_sse2) return;
+  gen->ylm_uptodate_sse2=1;
+
+  if ((m>=gen->m_crit)&&(fabs(cth1)>=gen->cth_crit)&&(fabs(cth2)>=gen->cth_crit))
+    { gen->firstl[0]=gen->lmax+1; return; }
+
+  Ylmgen_recalc_recfac(gen);
+
+  logval1 = logval2 = gen->mfac[m];
+  if (m>0) logval1 += m*gen->logsth[gen->ith1];
+  if (m>0) logval2 += m*gen->logsth[gen->ith2];
+  scale1 = (int) (logval1/large_exponent2)-minscale;
+  scale2 = (int) (logval2/large_exponent2)-minscale;
+  corfac = build_v2df((scale1<0) ? 0. : gen->cf[scale1],
+                      (scale2<0) ? 0. : gen->cf[scale2]);
+
+  lam_1 = _mm_setzero_pd();
+  lam_2 = build_v2df(exp(ln2*(logval1-(scale1+minscale)*large_exponent2)),
+                     exp(ln2*(logval2-(scale2+minscale)*large_exponent2)));
+  if (m&1) lam_2 = _mm_xor_pd (lam_2,V2DF_SIGNMASK); /* negate */
+
+  l=m;
+  if ((scale1<0) && (scale2<0))
+    {
+    GETPRE(pre0,pre1,l)
+    while (1)
+      {
+      if (++l>lmax) break;
+      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
+      if (++l>lmax) break;
+      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
+      if (v2df_any_gt(lam_2,fbig2))
+        {
+        RENORMALIZE;
+        if ((scale1>=0) || (scale2>=0)) break;
+        }
+      }
+    }
+
+  if (l<=lmax)
+    {
+    GETPRE(pre0,pre1,l)
+    while (1)
+      {
+      v2df t1;
+      result[l]=t1=_mm_mul_pd(lam_2,corfac);
+      if (v2df_any_gt(t1,eps2))
+        break;
+      if (++l>lmax) break;
+      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
+
+      result[l]=t1=_mm_mul_pd(lam_1,corfac);
+      if (v2df_any_gt(t1,eps2))
+        { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
+      if (++l>lmax) break;
+      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
+
+      if (v2df_any_gt(lam_2,fbig2))
+        RENORMALIZE;
+      }
+    }
+
+  gen->firstl[0]=l;
+  if (l>lmax)
+    {
+    gen->m_crit=m;
+    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
+    return;
+    }
+
+  GETPRE(pre0,pre1,l)
+  while (1)
+    {
+    v2df t1;
+    result[l]=t1=_mm_mul_pd(lam_2,corfac);
+    if (v2df_all_ge(t1,eps2))
+      break;
+    if (++l>lmax) return;
+    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
+
+    result[l]=t1=_mm_mul_pd(lam_1,corfac);
+    if (v2df_all_ge(t1,eps2))
+      { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
+    if (++l>lmax) return;
+    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
+
+    if (v2df_any_gt(lam_2,fbig2))
+      RENORMALIZE;
+    }
+
+  lam_1 = _mm_mul_pd (lam_1,corfac);
+  lam_2 = _mm_mul_pd (lam_2,corfac);
+
+  GETPRE(pre0,pre1,l)
+  for(;l<lmax-2;l+=2)
+    {
+    result[l]=lam_2;
+    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l+1)
+    result[l+1]=lam_1;
+    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l+2)
+    }
+
+  while (1)
+    {
+    result[l]=lam_2;
+    if (++l>lmax) break;
+    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
+    result[l] = lam_1;
+    if (++l>lmax) break;
+    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
+    }
+  }
+
+/*static void Ylmgen_recalc_lambda_wx1_sse2 (Ylmgen_C *gen)
+  {
+  if (gen->lwx_uptodate_sse2[1]) return;
+  Ylmgen_recalc_Ylm_sse2(gen);
+  gen->firstl[1] = gen->firstl[0];
+  if (gen->firstl[1]>gen->lmax) return;
+  Ylmgen_recalc_lamfact(gen);
+  gen->lwx_uptodate_sse2[1] = 1;
+
+  {
+  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
+  v2df xsth=build_v2df(1./gen->sth[gen->ith1],1./gen->sth[gen->ith2]);
+  v2df m=build_v2df(gen->m_cur,gen->m_cur);
+  v2df m_on_sth = _mm_mul_pd(m,xsth);
+  v2df lam_lm=_mm_setzero_pd();
+  v2df2 *lambda_wx = gen->lambda_wx_sse2[1];
+  int l;
+  v2df ell=build_v2df(gen->firstl[1],gen->firstl[1]);
+  v2df uno=_mm_set1_pd(1.);
+  for (l=gen->firstl[1]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
+    {
+    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
+    v2df lam_lm1m=lam_lm;
+    lam_lm=gen->ylm_sse2[l];
+    lambda_wx[l].a = _mm_mul_pd(xsth,_mm_sub_pd(_mm_mul_pd(lamfact,lam_lm1m),
+                     _mm_mul_pd(_mm_mul_pd(ell,cth),lam_lm)));
+    lambda_wx[l].b = _mm_mul_pd(m_on_sth,lam_lm);
+    }
+  }
+  }
+
+static void Ylmgen_recalc_lambda_wx2_sse2 (Ylmgen_C *gen)
+  {
+  if (gen->lwx_uptodate_sse2[2]) return;
+  Ylmgen_recalc_Ylm_sse2(gen);
+  gen->firstl[2] = gen->firstl[0];
+  if (gen->firstl[2]>gen->lmax) return;
+  Ylmgen_recalc_lamfact(gen);
+  gen->lwx_uptodate_sse2[2] = 1;
+
+  {
+  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
+  v2df sth=build_v2df(gen->sth[gen->ith1],gen->sth[gen->ith2]);
+  v2df m=build_v2df(gen->m_cur,gen->m_cur);
+  v2df uno=_mm_set1_pd(1.);
+  v2df one_on_s2 = _mm_div_pd(uno,_mm_mul_pd(sth,sth));
+  v2df two_on_s2 = _mm_mul_pd(_mm_set1_pd(2.),one_on_s2);
+  v2df two_c_on_s2 = _mm_mul_pd(cth,two_on_s2);
+  v2df m2 = _mm_mul_pd(m,m);
+  v2df two_m_on_s2 = _mm_mul_pd(m,two_on_s2);
+  v2df lam_lm=_mm_setzero_pd();
+  v2df2 *lambda_wx = gen->lambda_wx_sse2[2];
+  int l;
+  v2df ell=build_v2df(gen->firstl[2],gen->firstl[2]);
+  for (l=gen->firstl[2]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
+    {
+    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
+    v2df lam_lm1m=lam_lm;
+    lam_lm=gen->ylm_sse2[l];
+    {
+    const v2df t1  = _mm_mul_pd(lam_lm1m,lamfact);
+    const v2df ellm1 = _mm_sub_pd(ell,uno);
+    const v2df a_w = _mm_sub_pd
+      (_mm_mul_pd(_mm_sub_pd(m2,ell),two_on_s2),_mm_mul_pd(ell,ellm1));
+    const v2df a_x = _mm_mul_pd(_mm_mul_pd(cth,ellm1),lam_lm);
+    lambda_wx[l].a =
+      _mm_add_pd(_mm_mul_pd(a_w,lam_lm),_mm_mul_pd(t1,two_c_on_s2));
+    lambda_wx[l].b = _mm_mul_pd(two_m_on_s2,_mm_sub_pd(t1,a_x));
+    }
+    }
+  }
+  }*/
+
+/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin)
+  {
+  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
+    "invalid spin in Ylmgen_recalc_lambda_wx_sse2");
+
+  if (!gen->lambda_wx_sse2[spin])
+    gen->lambda_wx_sse2[spin]=RALLOC(v2df2,gen->lmax+1);
+
+  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1_sse2(gen); return; }
+  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2_sse2(gen); return; }
+
+  if (!gen->sylm[spin])
+    {
+    gen->sylm[spin]=RALLOC(sylmgen_d,1);
+    sylmgen_init(gen->sylm[spin],gen,spin);
+    }
+  if (gen->lwx_uptodate_sse2[spin]) return;
+  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
+  sylmgen_recalc_sse2(gen->sylm[spin],gen,gen->ith1,gen->ith2,
+    gen->lambda_wx_sse2[spin],&gen->firstl[spin]);
+  gen->lwx_uptodate_sse2[spin] = 1;
+  }*/
+
+#endif /* __SSE2__ */
+
+/*
+double *Ylmgen_get_norm (int lmax, int spin, int spinrec)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  double *res=RALLOC(double,lmax+1);
+  int l;
+  double spinsign;
+  / sign convention for H=1 (LensPix paper) /
+#if 1
+  spinsign = (spin>0) ? -1.0 : 1.0;
+#else
+  spinsign = 1.0;
+#endif
+
+  if (spin==0)
+    {
+    for (l=0; l<=lmax; ++l)
+      res[l]=1.;
+    return res;
+    }
+
+  if ((!spinrec) || (spin>=3))
+    {
+    spinsign = (spin&1) ? -spinsign : spinsign;
+    for (l=0; l<=lmax; ++l)
+      res[l] = (l<spin) ? 0. : spinsign*0.5*sqrt((2*l+1)/(4*pi));
+    return res;
+    }
+
+  if (spin==1)
+    {
+    for (l=0; l<=lmax; ++l)
+      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+1.)*l));
+    return res;
+    }
+
+  if (spin==2)
+    {
+    for (l=0; l<=lmax; ++l)
+      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+2.)*(l+1.)*l*(l-1.)));
+    return res;
+    }
+
+  UTIL_FAIL ("error in Ylmgen_get_norm");
+  return NULL;
+  }
+*/
+
+
+
+/*
+New high-level wrapper
+*/
+#include "sharp_legendre_table.h"
+#include <stdio.h>
+
+void sharp_normalized_associated_legendre_table(
+  ptrdiff_t m,
+  ptrdiff_t lmax,
+  ptrdiff_t ntheta,
+  /* contigious 1D array of theta values to compute for,
+     contains ntheta values */
+  double *theta,
+  /* number of columns in out; see below. Should be >= (lmax - m). */
+  ptrdiff_t ncols,
+  /* contigiuos 2D array, in "theta-major ordering". Has `ntheta`
+     rows and `ncols` columns. Indexed as out[itheta * ncols + (l - m)].
+     If `ncols > lmax - m` then those entries are not accessed.
+  */
+  double *out
+) {
+
+    Ylmgen_C ctx;
+    ptrdiff_t itheta, l, lmin;
+
+    Ylmgen_init(&ctx, lmax, lmax, 0, 0, 1e-300);
+    Ylmgen_set_theta(&ctx, theta, ntheta);
+
+    itheta = 0;
+    #ifdef __SSE2__
+    for (; itheta < ntheta - 1; itheta += 2) {
+        Ylmgen_prepare_sse2(&ctx, itheta, itheta + 1, m);
+        Ylmgen_recalc_Ylm_sse2(&ctx);
+        lmin = IMIN(*ctx.firstl, lmax + 1);
+        for (l = m; l < lmin; ++l) {
+            out[itheta * ncols + l - m] = 0;
+            out[(itheta + 1) * ncols + l - m ] = 0;
+        }
+        for (l = IMAX(lmin, m); l <= lmax; ++l) {
+            double v1, v2;
+            read_v2df(ctx.ylm_sse2[l], &v1, &v2);
+            out[itheta * ncols + l - m] = v1;
+            out[(itheta + 1) * ncols + l - m] = v2;
+        }
+    }
+    #endif
+    for (; itheta < ntheta; itheta += 1) {
+        Ylmgen_prepare(&ctx, itheta, m);
+        Ylmgen_recalc_Ylm(&ctx);
+        lmin = IMIN(*ctx.firstl, lmax + 1);
+        for (l = m; l < lmin; ++l) {
+            out[itheta * ncols + l - m] = 0;
+        }
+        for (l = IMAX(lmin, m); l <= lmax; ++l) {
+            out[itheta * ncols + l - m] = ctx.ylm[l];
+        }
+    }
+    Ylmgen_destroy(&ctx);
+}
diff --git a/libsharp/sharp_legendre_table.h b/libsharp/sharp_legendre_table.h
new file mode 100644
index 0000000..ccabf52
--- /dev/null
+++ b/libsharp/sharp_legendre_table.h
@@ -0,0 +1,87 @@
+/*
+ *  This file is part of libsharp.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file sharp_legendre_table.h
+ *  Interface for computing tables of the normalized associated Legendre transform
+ *
+ *  Copyright (C) 2017 Dag Sverre Seljebotn
+ *  \author Dag Sverre Seljebotn
+ *
+ *  Note: This code was mainly copied from libpsht; only a small high-level wrapper added
+ */
+
+#ifndef SHARP_LEGENDRE_TABLE_H
+#define SHARP_LEGENDRE_TABLE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NO_LEGENDRE_TABLE
+
+
+/*! Returns a table of the normalized associated Legendre polynomials. m is a single
+    fixed argument and a table for multiple l and cos(theta) is provided.
+    (Internally, sin(theta) is also used for part of the computation, making theta
+    the most convenient argument.)
+
+    \param m The m-value to compute a table for
+    \param lmax A table will be provided for l = m .. lmax
+    \param ntheta How many theta values to evaluate for
+    \param theta Contiguous 1D array of theta values
+    \param ncols Number of columns in the out-array; should have ncols >= (lmax - m)
+    \param out Contiguous 2D array that will receive the output. Each output entry
+               is assigned to out[itheta * ncols + (l - m)].
+ */
+void sharp_normalized_associated_legendre_table(
+  ptrdiff_t m,
+  ptrdiff_t lmax,
+  ptrdiff_t ntheta,
+  /* contiguous 1D array of theta values to compute for,
+     contains ntheta values */
+  double *theta,
+  /* number of columns in out; see below. Should be >= (lmax - m). */
+  ptrdiff_t ncols,
+  /* contiguous 2D array, in "theta-major ordering". Has `ntheta`
+     rows and `ncols` columns. Indexed as out[itheta * ncols + (l - m)].
+     If `ncols > lmax - m` then those entries are not accessed.
+  */
+  double *out
+);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/libsharp/libsharp.pxd b/python/libsharp/libsharp.pxd
index 9209b2b..bdc296b 100644
--- a/python/libsharp/libsharp.pxd
+++ b/python/libsharp/libsharp.pxd
@@ -59,6 +59,9 @@ cdef extern from "sharp.h":
         sharp_alm_info *alm_info, int ntrans, int flags, double *time,
         unsigned long long *opcnt) nogil
 
+    void sharp_normalized_associated_legendre_table(int m, int lmax, int ntheta,
+        double *theta, int ncols, double *out) nogil
+
 
 cdef extern from "sharp_geomhelpers.h":
     void sharp_make_subset_healpix_geom_info(
@@ -76,4 +79,3 @@ cdef extern from "sharp_almhelpers.h":
     void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
         int nm, const int *ms, sharp_alm_info **alm_info)
 
-
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index 1c783fe..d4bbeb2 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -1,8 +1,9 @@
 import numpy as np
+cimport cython
 
 __all__ = ['legendre_transform', 'legendre_roots', 'sht', 'synthesis', 'adjoint_synthesis',
            'analysis', 'adjoint_analysis', 'healpix_grid', 'triangular_order', 'rectangular_order',
-           'packed_real_order']
+           'packed_real_order', 'normalized_associated_legendre_table']
 
 
 def legendre_transform(x, bl, out=None):
@@ -254,3 +255,17 @@ cdef class packed_real_order(alm_info):
                                                ms=NULL if ms is None else &ms[0],
                                                alm_info=&self.ainfo)
 
+#
+# 
+#
+
+@cython.boundscheck(False)
+def normalized_associated_legendre_table(int lmax, int m, theta):
+    cdef double[::1] theta_ = np.ascontiguousarray(theta, dtype=np.double)
+    out = np.zeros((theta_.shape[0], lmax - m + 1), np.double)
+    cdef double[:, ::1] out_ = out
+    if lmax < m:
+        raise ValueError("lmax < m")
+    with nogil:
+        sharp_normalized_associated_legendre_table(m, lmax, theta_.shape[0], &theta_[0], lmax - m + 1, &out_[0,0])
+    return out
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 321fa9c..0129b29 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -56,4 +56,3 @@ def test_legendre_roots():
     yield check_legendre_roots, 1
     yield check_legendre_roots, 32
     yield check_legendre_roots, 33
-    yield check_legendre_roots, 128
diff --git a/python/libsharp/tests/test_legendre_table.py b/python/libsharp/tests/test_legendre_table.py
new file mode 100644
index 0000000..1d42af5
--- /dev/null
+++ b/python/libsharp/tests/test_legendre_table.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+from numpy.testing import assert_almost_equal
+from nose.tools import eq_, ok_
+
+from libsharp import normalized_associated_legendre_table
+from scipy.special import sph_harm, p_roots
+
+def test_compare_legendre_table_with_scipy():
+    def test(theta, m, lmax):
+        Plm = normalized_associated_legendre_table(lmax, m, theta)
+
+        Plm_p = sph_harm(m, np.arange(m, lmax + 1), 0, theta)[None, :]
+        if not np.allclose(Plm_p, Plm):
+            print Plm_p
+            print Plm
+        return ok_, np.allclose(Plm_p, Plm)
+
+    yield test(np.pi/2, 0, 10)
+    yield test(np.pi/4, 0, 10)
+    yield test(3 * np.pi/4, 0, 10)
+    yield test(np.pi/4, 1, 4)
+    yield test(np.pi/4, 2, 4)
+    yield test(np.pi/4, 50, 50)
+    yield test(np.pi/2, 49, 50)
+
+
+def test_legendre_table_wrapper_logic():
+    # tests the SSE 2 logic in the high-level wrapper by using an odd number of thetas
+    theta = np.asarray([np.pi/2, np.pi/4, 3 * np.pi / 4])
+    m = 3
+    lmax = 10
+    Plm = normalized_associated_legendre_table(lmax, m, theta)
+    assert np.allclose(Plm[1, :], normalized_associated_legendre_table(lmax, m, np.pi/4)[0, :])
+    assert np.allclose(Plm[2, :], normalized_associated_legendre_table(lmax, m, 3 * np.pi/4)[0, :])