/*

This file originated as a concatenation of files from libpsht. Further refactoring
could be carried out to make the code use libsharp conventions instead for SSE etc.;

*/


/*
sse_utils.h
*/
/*
 *  This file is part of libc_utils.
 *
 *  libc_utils is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libc_utils is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libc_utils; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/*
 *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
 *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 *  (DLR).
 */

/*! \file sse_utils.h
 *  SSE/SSE2/SSE3-related functionality
 *
 *  Copyright (C) 2010,2011 Max-Planck-Society
 *  \author Martin Reinecke
 */


#if (defined(__SSE__))

#include <xmmintrin.h>


typedef __m128 v4sf; /* vector of 4 floats (SSE1) */

typedef union {
  float f[4];
  v4sf v;
} V4SF;

static inline v4sf build_v4sf (float a, float b, float c, float d)
  { return _mm_set_ps(d,c,b,a); }
static inline void read_v4sf (v4sf v, float *a, float *b, float *c, float *d)
  {
  V4SF tmp;
  tmp.v = v;
  if (a) *a=tmp.f[0];
  if (b) *b=tmp.f[1];
  if (c) *c=tmp.f[2];
  if (d) *d=tmp.f[3];
  }


#endif

#if (defined(__SSE2__))

#include <emmintrin.h>


typedef __m128d v2df; /* vector of 2 doubles (SSE2) */

typedef union {
  double d[2];
  v2df v;
} V2DF;

typedef struct {
  v2df a,b;
} v2df2;
typedef struct {
  V2DF a,b;
} V2DF2;

#define V2DF_SIGNMASK _mm_set1_pd(-0.0)

static inline v2df build_v2df (double a, double b)
  { return _mm_set_pd(b,a); }
static inline void read_v2df (v2df v, double *a, double *b)
  { _mm_store_sd(a,v); _mm_storeh_pd(b,v); }

static inline int v2df_any_gt (v2df a, v2df b)
  {
  return (_mm_movemask_pd(_mm_cmpgt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))!=0);
  }
static inline int v2df_all_ge (v2df a, v2df b)
  {
  return (_mm_movemask_pd(_mm_cmplt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))==0);
  }
static inline V2DF to_V2DF (v2df x)
  { V2DF X; X.v=x; return X; }
static inline V2DF2 to_V2DF2 (v2df2 x)
  { V2DF2 X; X.a.v=x.a; X.b.v=x.b; return X; }
static inline v2df2 to_v2df2 (V2DF2 X)
  { v2df2 x; x.a=X.a.v; x.b=X.b.v; return x; }
static inline v2df2 zero_v2df2(void)
  { v2df2 x; x.a=x.b=_mm_setzero_pd(); return x; }


#endif

#if (defined(__SSE3__))

#include <pmmintrin.h>

#endif


/*
ylmgen_c.h
*/
/*
 *  This file is part of libpsht.
 *
 *  libpsht is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libpsht is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libpsht; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/*
 *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
 *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 *  (DLR).
 */

/*! \file ylmgen_c.h
 *  Code for efficient calculation of Y_lm(phi=0,theta)
 *
 *  Copyright (C) 2005-2011 Max-Planck-Society
 *  \author Martin Reinecke
 */

typedef double ylmgen_dbl2[2];
typedef double ylmgen_dbl3[3];

typedef struct
  {
  double cth_crit;
  int mdist_crit;
  /* members depending on m and m' */
  int s, m, mlo, mhi, cosPow, sinPow;
  long double prefactor;
  ylmgen_dbl3 *fx;
  int preMinus_p, preMinus_m;
  } sylmgen_d;

typedef struct
  {
  double fsmall, fbig, eps, cth_crit;
  int lmax, mmax, smax, m_cur, ith, nth, m_crit, spinrec;
  /*! The index of the first non-negligible Y_lm value. */
  int *firstl;
  double *cf, *mfac, *t1fac, *t2fac, *th, *cth, *sth, *logsth;
  ylmgen_dbl2 *recfac;
  double *lamfact;
  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
  double *ylm;
  /*! Points to an array of size [0..lmax] containing the lambda_w
      and lambda_x values for spin>0 transforms. */
  ylmgen_dbl2 **lambda_wx;
  long double *logsum, *lc05, *ls05;
  double *flm1, *flm2, *xl;

  sylmgen_d **sylm;

  int *lwx_uptodate;
  int ylm_uptodate;

#ifdef __SSE2__
  int ith1, ith2;
  /*! Points to an array of size [0..lmax] containing the Y_lm values. */
  v2df *ylm_sse2;
  /*! Points to an array of size [0..lmax] containing the lambda_w
      and lambda_x values for spin>0 transforms. */
  v2df2 **lambda_wx_sse2;
  int *lwx_uptodate_sse2;
  int ylm_uptodate_sse2;
#endif

  int recfac_uptodate, lamfact_uptodate;
  } Ylmgen_C;

/*! Creates a generator which will calculate Y_lm(theta,phi=0)
    up to \a l=l_max and \a m=m_max. It may regard Y_lm whose absolute
    magnitude is smaller than \a epsilon as zero. If \a spinrec is nonzero,
    the spin-1 and spin-2 Y_lm will be calculated by recursion from the spin-0
    ones, otherwise Wigner d matrix elements will be used. */
static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
   double epsilon);

/*! Passes am array \a theta of \a nth colatitudes that will be used in
    subsequent calls. The individual angles will be referenced by their
    index in the array, starting with 0.
    \note The array can be freed or reused after the call. */
static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth);

/*! Deallocates a generator previously initialised by Ylmgen_init(). */
static void Ylmgen_destroy (Ylmgen_C *gen);

/*! Prepares the object for the calculation at \a theta and \a m. */
static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m);

/*! Recalculates (if necessary) the Y_lm values. */
static void Ylmgen_recalc_Ylm (Ylmgen_C *gen);
/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
    transforms. */
/*static void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin);*/

#ifdef __SSE2__
/*! Prepares the object for the calculation at \a theta, \a theta2 and \a m. */
static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m);

/*! Recalculates (if necessary) the Y_lm values. */
static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen);
/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
    transforms. */
/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin);*/
#endif

/*! Returns a pointer to an array with lmax+1 entries containing normalisation
    factors that must be applied to Y_lm values computed for \a spin with the
    given \a spinrec flag. The array must be deallocated (using free()) by the
    user. */
/*static double *Ylmgen_get_norm (int lmax, int spin, int spinrec);*/


/*
ylmgen_c.c
*/

/*
 *  This file is part of libpsht.
 *
 *  libpsht is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libpsht is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libpsht; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/*
 *  libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
 *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 *  (DLR).
 */

/*
 *  Code for efficient calculation of Y_lm(theta,phi=0)
 *
 *  Copyright (C) 2005-2011 Max-Planck-Society
 *  Author: Martin Reinecke
 */

#include <math.h>
#include <stdlib.h>
#include "c_utils.h"

enum { large_exponent2=90, minscale=-4, maxscale=11 };

/*static void sylmgen_init (sylmgen_d *gen, const Ylmgen_C *ygen, int spin)
  {
  int i;
  UTIL_ASSERT(spin>=1,"incorrect spin");
  gen->s=spin;
  gen->m=gen->mlo=gen->mhi=-1234567890;
  ALLOC(gen->fx,ylmgen_dbl3,ygen->lmax+2);

  for (i=0; i<ygen->lmax+2; ++i)
    gen->fx[i][0]=gen->fx[i][1]=gen->fx[i][2]=0.;

  gen->cth_crit = 2.;
  gen->mdist_crit = ygen->lmax+1;
  }*/

static void sylmgen_destroy (sylmgen_d *gen)
  { DEALLOC(gen->fx); }

/*static void sylmgen_prepare (sylmgen_d *gen, const Ylmgen_C *ygen, int m_)
  {
  int mlo_, mhi_, ms_similar, l;

  if (m_==gen->m) return;
  UTIL_ASSERT(m_>=0,"incorrect m");

  mlo_=m_; mhi_=gen->s;
  if (mhi_<mlo_) SWAP(mhi_,mlo_,int);
  ms_similar = ((gen->mhi==mhi_) && (gen->mlo==mlo_));

  gen->m=m_;
  gen->mlo = mlo_; gen->mhi = mhi_;

  if (!ms_similar)
    {
    for (l=gen->mhi; l<ygen->lmax; ++l)
      {
      double t = ygen->flm1[l+gen->m]*ygen->flm1[l-gen->m]
                *ygen->flm1[l+gen->s]*ygen->flm1[l-gen->s];
      double lt = 2*l+1;
      double l1 = l+1;
      gen->fx[l+1][0]=l1*lt*t;
      gen->fx[l+1][1]=gen->m*gen->s*ygen->xl[l]*ygen->xl[l+1];
      t = ygen->flm2[l+gen->m]*ygen->flm2[l-gen->m]
         *ygen->flm2[l+gen->s]*ygen->flm2[l-gen->s];
      gen->fx[l+1][2]=t*l1*ygen->xl[l];
      }
    gen->prefactor = 0.5L*(ygen->logsum[2*gen->mhi]
      -ygen->logsum[gen->mhi+gen->mlo]-ygen->logsum[gen->mhi-gen->mlo]);
    }

  gen->preMinus_p = gen->preMinus_m = 0;
  if (gen->mhi==gen->m)
    {
    gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s;
    gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1);
    }
  else
    {
    gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m;
    gen->preMinus_m = ((gen->mhi+gen->m)&1);
    }
    }*/

/*static void sylmgen_recalc (sylmgen_d *gen, const Ylmgen_C *ygen, int ith,
  ylmgen_dbl2 *res, int *firstl)
  {
  const double ln2     = 0.6931471805599453094172321214581766;
  const double inv_ln2 = 1.4426950408889634073599246810018921;
  int l=gen->mhi;
  int lmax = ygen->lmax;
  ylmgen_dbl3 *fy = gen->fx;
  const double fsmall = ygen->fsmall, fbig = ygen->fbig, eps = ygen->eps;
  const double cth = ygen->cth[ith];
  long double logvalp = inv_ln2*(gen->prefactor
    + ygen->lc05[ith]*gen->cosPow + ygen->ls05[ith]*gen->sinPow);
  long double logvalm = inv_ln2*(gen->prefactor
    + ygen->lc05[ith]*gen->sinPow + ygen->ls05[ith]*gen->cosPow);
  int scalep = (int)(logvalp/large_exponent2)-minscale;
  int scalem = (int)(logvalm/large_exponent2)-minscale;
  double rec1p=0., rec1m=0.;
  double rec2p = exp(ln2*(double)(logvalp-(scalep+minscale)*large_exponent2));
  double rec2m = exp(ln2*(double)(logvalm-(scalem+minscale)*large_exponent2));
  double corfacp,corfacm;
  double tp,tm;

  if ((abs(gen->m-gen->s)>=gen->mdist_crit)&&(fabs(cth)>=gen->cth_crit))
    { *firstl=ygen->lmax+1; return; }

  if (gen->preMinus_p)
    rec2p=-rec2p;
  if (gen->preMinus_m)
    rec2m=-rec2m;
  if (gen->s&1)
    rec2p=-rec2p;

  / iterate until we reach the realm of IEEE numbers /
  while((scalem<0)&&(scalep<0))
    {
    if (++l>lmax) break;
    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
    if (++l>lmax) break;
    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;

    while (fabs(rec2p)>fbig)
      { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
    while (fabs(rec2m)>fbig)
      { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
    }

  corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
  corfacm = (scalem<0) ? 0. : ygen->cf[scalem];

  if (l<=lmax)
    {
    while (1)
      {
      if ((fabs(rec2p*corfacp)>eps) || (fabs(rec2m*corfacm)>eps))
        break;
      if (++l>lmax) break;
      rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
      rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
      if ((fabs(rec1p*corfacp)>eps) || (fabs(rec1m*corfacm)>eps))
        { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
      if (++l>lmax) break;
      rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
      rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;

      if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
        {
        while (fabs(rec2p)>fbig)
          { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
        while (fabs(rec2m)>fbig)
          { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
        corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
        corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
        }
      }
    }

  *firstl=l;
  if (l>lmax)
    {
    gen->mdist_crit=abs(gen->m-gen->s);
    gen->cth_crit= fabs(cth);
    return;
    }

  tp = rec2p*corfacp; tm = rec2m*corfacm;
  res[l][0]=tp+tm;
  res[l][1]=tm-tp;

  while (1)
    {
    if ((fabs(tp)>eps) && (fabs(tm)>eps))
      break;
    if (++l>lmax) break;
    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
    tp=rec1p*corfacp; tm=rec1m*corfacm;
    res[l][0]=tp+tm; res[l][1]=tm-tp;
    if ((fabs(tp)>eps) && (fabs(tm)>eps))
      { SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
    if (++l>lmax) break;
    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
    tp=rec2p*corfacp; tm=rec2m*corfacm;
    res[l][0]=tp+tm; res[l][1]=tm-tp;

    if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
      {
      while (fabs(rec2p)>fbig)
        { rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
      while (fabs(rec2m)>fbig)
        { rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
      corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
      corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
      }
    }

  rec1p *= corfacp; rec2p *= corfacp;
  rec1m *= corfacm; rec2m *= corfacm;

  for (;l<lmax-1;l+=2)
    {
    rec1p = (cth - fy[l+1][1])*fy[l+1][0]*rec2p - fy[l+1][2]*rec1p;
    rec1m = (cth + fy[l+1][1])*fy[l+1][0]*rec2m - fy[l+1][2]*rec1m;
    res[l+1][0] = rec1p+rec1m; res[l+1][1] = rec1m-rec1p;
    rec2p = (cth - fy[l+2][1])*fy[l+2][0]*rec1p - fy[l+2][2]*rec2p;
    rec2m = (cth + fy[l+2][1])*fy[l+2][0]*rec1m - fy[l+2][2]*rec2m;
    res[l+2][0] = rec2p+rec2m; res[l+2][1] = rec2m-rec2p;
    }
  while (1)
    {
    if (++l>lmax) break;
    rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
    rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
    res[l][0] = rec1p+rec1m; res[l][1] = rec1m-rec1p;
    if (++l>lmax) break;
    rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
    rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
    res[l][0] = rec2p+rec2m; res[l][1] = rec2m-rec2p;
    }
    }*/

#ifdef __SSE2__

#define ADVANCE(L,ap,am,bp,bm) \
  { \
  v2df f0=_mm_set1_pd(fy[L][0]), \
       f1=_mm_set1_pd(fy[L][1]), \
       f2=_mm_set1_pd(fy[L][2]); \
  ap = _mm_sub_pd( \
       _mm_mul_pd(_mm_sub_pd(cth,f1),_mm_mul_pd(f0,bp)), \
       _mm_mul_pd(f2,ap)); \
  am = _mm_sub_pd( \
       _mm_mul_pd(_mm_add_pd(cth,f1),_mm_mul_pd(f0,bm)), \
       _mm_mul_pd(f2,am)); \
  }

#define RENORMX(r1,r2,corf,sca,scb) \
  do \
    { \
    double rec1a, rec1b, rec2a, rec2b, corfaca, corfacb; \
    read_v2df (r1, &rec1a, &rec1b); read_v2df (r2, &rec2a, &rec2b); \
    read_v2df (corf, &corfaca, &corfacb); \
    while (fabs(rec2a)>fbig) \
      { \
      rec1a*=fsmall; rec2a*=fsmall; ++sca; \
      corfaca = (sca<0) ? 0. : ygen->cf[sca]; \
      } \
    while (fabs(rec2b)>fbig) \
      { \
      rec1b*=fsmall; rec2b*=fsmall; ++scb; \
      corfacb = (scb<0) ? 0. : ygen->cf[scb]; \
      } \
    r1=build_v2df(rec1a,rec1b); r2=build_v2df(rec2a,rec2b); \
    corf=build_v2df(corfaca,corfacb); \
    } \
  while(0)

#define RENORM \
  RENORMX(rec1p,rec2p,corfacp,scale1p,scale2p); \
  RENORMX(rec1m,rec2m,corfacm,scale1m,scale2m)

/*static void sylmgen_recalc_sse2 (sylmgen_d *gen, const Ylmgen_C *ygen,
  int ith1, int ith2, v2df2 *res, int *firstl)
  {
  const double ln2     = 0.6931471805599453094172321214581766;
  const double inv_ln2 = 1.4426950408889634073599246810018921;
  int l=gen->mhi;
  int lmax = ygen->lmax;
  ylmgen_dbl3 *fy = gen->fx;
  const double fbig=ygen->fbig, fsmall=ygen->fsmall;
  v2df eps2    = build_v2df(ygen->eps,ygen->eps);
  const double cth1=ygen->cth[ith1], cth2=ygen->cth[ith2];
  v2df cth = build_v2df(cth1,cth2);
  long double
    logval1p = inv_ln2*(gen->prefactor
             + ygen->lc05[ith1]*gen->cosPow + ygen->ls05[ith1]*gen->sinPow),
    logval2p = inv_ln2*(gen->prefactor
             + ygen->lc05[ith2]*gen->cosPow + ygen->ls05[ith2]*gen->sinPow),
    logval1m = inv_ln2*(gen->prefactor
             + ygen->lc05[ith1]*gen->sinPow + ygen->ls05[ith1]*gen->cosPow),
    logval2m = inv_ln2*(gen->prefactor
             + ygen->lc05[ith2]*gen->sinPow + ygen->ls05[ith2]*gen->cosPow);

  int scale1p = (int)(logval1p/large_exponent2)-minscale,
      scale2p = (int)(logval2p/large_exponent2)-minscale,
      scale1m = (int)(logval1m/large_exponent2)-minscale,
      scale2m = (int)(logval2m/large_exponent2)-minscale;

  v2df rec1p =_mm_setzero_pd(), rec1m=_mm_setzero_pd();
  v2df rec2p = build_v2df(
    exp(ln2*(double)(logval1p-(scale1p+minscale)*large_exponent2)),
    exp(ln2*(double)(logval2p-(scale2p+minscale)*large_exponent2)));
  v2df rec2m = build_v2df(
    exp(ln2*(double)(logval1m-(scale1m+minscale)*large_exponent2)),
    exp(ln2*(double)(logval2m-(scale2m+minscale)*large_exponent2)));
  v2df corfacp=build_v2df((scale1p<0) ? 0. : ygen->cf[scale1p],
                          (scale2p<0) ? 0. : ygen->cf[scale2p]),
       corfacm=build_v2df((scale1m<0) ? 0. : ygen->cf[scale1m],
                          (scale2m<0) ? 0. : ygen->cf[scale2m]);
  v2df tp,tm;

  if ((abs(gen->m-gen->s)>=gen->mdist_crit)
     &&(fabs(ygen->cth[ith1])>=gen->cth_crit)
     &&(fabs(ygen->cth[ith2])>=gen->cth_crit))
    { *firstl=ygen->lmax+1; return; }

  if (gen->preMinus_p)
    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
  if (gen->preMinus_m)
    rec2m = _mm_xor_pd (rec2m,V2DF_SIGNMASK); / negate /
  if (gen->s&1)
    rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /

  / iterate until we reach the realm of IEEE numbers /
  while((scale1m<0)&&(scale1p<0)&&(scale2m<0)&&(scale2p<0))
    {
    if (++l>lmax) break;
    ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
    if (++l>lmax) break;
    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)

    RENORM;
    }

  if (l<=lmax)
    {
    while (1)
      {
      if (v2df_any_gt(_mm_mul_pd(rec2p,corfacp),eps2) ||
          v2df_any_gt(_mm_mul_pd(rec2m,corfacm),eps2))
        break;
      if (++l>lmax) break;
      ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
      if (v2df_any_gt(_mm_mul_pd(rec1p,corfacp),eps2) ||
          v2df_any_gt(_mm_mul_pd(rec1m,corfacm),eps2))
        { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
      if (++l>lmax) break;
      ADVANCE (l,rec2p,rec2m,rec1p,rec1m)

      RENORM;
      }
    }

  *firstl=l;
  if (l>lmax)
    {
    gen->mdist_crit=abs(gen->m-gen->s);
    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
    return;
    }
  tp = _mm_mul_pd(rec2p,corfacp); tm = _mm_mul_pd(rec2m,corfacm);
  res[l].a=_mm_add_pd(tp,tm);
  res[l].b=_mm_sub_pd(tm,tp);

  while (1)
    {
    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
      break;
    if (++l>lmax) break;
    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
    tp=_mm_mul_pd(rec1p,corfacp); tm=_mm_mul_pd(rec1m,corfacm);
    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
    if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
      { SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
    if (++l>lmax) break;
    ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
    tp=_mm_mul_pd(rec2p,corfacp); tm=_mm_mul_pd(rec2m,corfacm);
    res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);

    RENORM;
    }

  rec1p = _mm_mul_pd(rec1p,corfacp); rec2p = _mm_mul_pd(rec2p,corfacp);
  rec1m = _mm_mul_pd(rec1m,corfacm); rec2m = _mm_mul_pd(rec2m,corfacm);

  for (;l<lmax-1;l+=2)
    {
    ADVANCE(l+1,rec1p,rec1m,rec2p,rec2m)
    res[l+1].a=_mm_add_pd(rec1p,rec1m); res[l+1].b=_mm_sub_pd(rec1m,rec1p);
    ADVANCE(l+2,rec2p,rec2m,rec1p,rec1m)
    res[l+2].a=_mm_add_pd(rec2p,rec2m); res[l+2].b=_mm_sub_pd(rec2m,rec2p);
    }
  while (1)
    {
    if (++l>lmax) break;
    ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
    res[l].a=_mm_add_pd(rec1p,rec1m); res[l].b=_mm_sub_pd(rec1m,rec1p);
    if (++l>lmax) break;
    ADVANCE(l,rec2p,rec2m,rec1p,rec1m)
    res[l].a=_mm_add_pd(rec2p,rec2m); res[l].b=_mm_sub_pd(rec2m,rec2p);
    }
  }
*/
#endif

static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
  double epsilon)
  {
  int m;
  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
  const double inv_ln2 = 1.4426950408889634073599246810018921;

  gen->fsmall = ldexp(1.,-large_exponent2);
  gen->fbig   = ldexp(1., large_exponent2);
  gen->eps = epsilon;
  gen->cth_crit = 2.;
  gen->ith = -1;
  gen->nth = 0;
  gen->lmax = l_max;
  gen->mmax = m_max;
  gen->smax = s_max;
  gen->spinrec = spinrec;
  gen->m_cur = -1;
  gen->m_crit = gen->mmax+1;
  gen->firstl = RALLOC(int,gen->smax+1);
  for (m=0; m<=gen->smax; ++m) gen->firstl[m]=-1;
  gen->cf = RALLOC(double,maxscale-minscale+1);
  for (m=0; m<(maxscale-minscale+1); ++m)
    gen->cf[m] = ldexp(1.,(m+minscale)*large_exponent2);
  gen->recfac = RALLOC(ylmgen_dbl2,gen->lmax+1);
  gen->mfac = RALLOC(double,gen->mmax+1);
  gen->mfac[0] = 1;
  for (m=1; m<=gen->mmax; ++m)
    gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
  for (m=0; m<=gen->mmax; ++m)
    gen->mfac[m] = inv_ln2*log(inv_sqrt4pi*gen->mfac[m]);

  gen->t1fac = RALLOC(double,gen->lmax+1);
  for (m=0; m<=gen->lmax; ++m)
    gen->t1fac[m] = sqrt(4.*(m+1)*(m+1)-1.);
  gen->t2fac = RALLOC(double,gen->lmax+gen->mmax+1);
  for (m=0; m<=gen->lmax+gen->mmax; ++m)
    gen->t2fac[m] = 1./sqrt(m+1.);

  gen->lamfact = RALLOC(double,gen->lmax+1);
  gen->ylm = RALLOC(double,gen->lmax+1);
  ALLOC(gen->lambda_wx,ylmgen_dbl2 *,gen->smax+1);
  for (m=0; m<=gen->smax; ++m)
    gen->lambda_wx[m]=NULL;

  gen->sylm = RALLOC(sylmgen_d *,gen->smax+1);
  for (m=0; m<=gen->smax; ++m)
    gen->sylm[m]=NULL;

  gen->ylm_uptodate = 0;
  gen->lwx_uptodate = RALLOC(int,gen->smax+1);
  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
  gen->recfac_uptodate = 0;
  gen->lamfact_uptodate = 0;

  gen->th = gen->cth = gen->sth = gen->logsth = NULL;

#ifdef __SSE2__
  gen->ith1 = gen->ith2 = -1;
  gen->ylm_sse2 = RALLOC(v2df,gen->lmax+1);
  ALLOC(gen->lambda_wx_sse2,v2df2 *,gen->smax+1);
  for (m=0; m<=gen->smax; ++m)
    gen->lambda_wx_sse2[m]=NULL;
  gen->ylm_uptodate_sse2 = 0;
  gen->lwx_uptodate_sse2 = RALLOC(int,gen->smax+1);
  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
#endif

  ALLOC(gen->logsum,long double,2*gen->lmax+1);
  gen->lc05 = gen->ls05 = NULL;
  ALLOC(gen->flm1,double,2*gen->lmax+1);
  ALLOC(gen->flm2,double,2*gen->lmax+1);
  ALLOC(gen->xl,double,gen->lmax+1);

  gen->logsum[0] = 0.;
  for (m=1; m<2*gen->lmax+1; ++m)
    gen->logsum[m] = gen->logsum[m-1]+logl((long double)m);
  for (m=0; m<2*gen->lmax+1; ++m)
    {
    gen->flm1[m] = sqrt(1./(m+1.));
    gen->flm2[m] = sqrt(m/(m+1.));
    }

  gen->xl[0]=0;
  for (m=1; m<gen->lmax+1; ++m) gen->xl[m]=1./m;
  }

static void Ylmgen_destroy (Ylmgen_C *gen)
  {
  int m;

  DEALLOC(gen->firstl);
  DEALLOC(gen->cf);
  DEALLOC(gen->recfac);
  DEALLOC(gen->mfac);
  DEALLOC(gen->t1fac);
  DEALLOC(gen->t2fac);
  DEALLOC(gen->lamfact);
  DEALLOC(gen->ylm);
  DEALLOC(gen->lwx_uptodate);
  for (m=0; m<=gen->smax; ++m)
    DEALLOC(gen->lambda_wx[m]);
  DEALLOC(gen->lambda_wx);
  for (m=0; m<=gen->smax; ++m)
    if (gen->sylm[m])
      {
      sylmgen_destroy (gen->sylm[m]);
      DEALLOC(gen->sylm[m]);
      }
  DEALLOC(gen->sylm);
  DEALLOC(gen->th);
  DEALLOC(gen->cth);
  DEALLOC(gen->sth);
  DEALLOC(gen->logsth);
  DEALLOC(gen->logsum);
  DEALLOC(gen->lc05);
  DEALLOC(gen->ls05);
  DEALLOC(gen->flm1);
  DEALLOC(gen->flm2);
  DEALLOC(gen->xl);
#ifdef __SSE2__
  DEALLOC(gen->ylm_sse2);
  for (m=0; m<=gen->smax; ++m)
    DEALLOC(gen->lambda_wx_sse2[m]);
  DEALLOC(gen->lambda_wx_sse2);
  DEALLOC(gen->lwx_uptodate_sse2);
#endif
  }

static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth)
  {
  const double inv_ln2 = 1.4426950408889634073599246810018921;
  int m;
  DEALLOC(gen->th);
  DEALLOC(gen->cth);
  DEALLOC(gen->sth);
  DEALLOC(gen->logsth);
  DEALLOC(gen->lc05);
  DEALLOC(gen->ls05);
  gen->th = RALLOC(double,nth);
  gen->cth = RALLOC(double,nth);
  gen->sth = RALLOC(double,nth);
  gen->logsth = RALLOC(double,nth);
  gen->lc05 = RALLOC(long double,nth);
  gen->ls05 = RALLOC(long double,nth);
  for (m=0; m<nth; ++m)
    {
    const double pi = 3.141592653589793238462643383279502884197;
    double th=theta[m];
    UTIL_ASSERT ((th>=0.)&&(th<=pi),"bad theta angle specified");
    /* tiny adjustments to make sure cos and sin (theta/2) are positive */
    if (th==0.) th=1e-16;
    if (ABSAPPROX(th,pi,1e-15)) th=pi-1e-15;
    gen->th[m] = th;
    gen->cth[m] = cos(th);
    gen->sth[m] = sin(th);
    gen->logsth[m] = inv_ln2*log(gen->sth[m]);
    gen->lc05[m]=logl(cosl(0.5L*th));
    gen->ls05[m]=logl(sinl(0.5L*th));
    }

  gen->nth = nth;
  gen->ith = -1;
#ifdef __SSE2__
  gen->ith1 = gen->ith2 = -1;
#endif
  }

static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m)
  {
  if ((ith==gen->ith) && (m==gen->m_cur)) return;

  gen->ylm_uptodate = 0;
  SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);

  gen->ith = ith;

  if (m!=gen->m_cur)
    {
    gen->recfac_uptodate = 0;
    gen->lamfact_uptodate = 0;
    gen->m_cur = m;
    }
  }

static void Ylmgen_recalc_recfac (Ylmgen_C *gen)
  {
  double f_old=1;
  int l, m;

  if (gen->recfac_uptodate) return;
  gen->recfac_uptodate = 1;

  m = gen->m_cur;
  for (l=m; l<=gen->lmax; ++l)
    {
    gen->recfac[l][0] = gen->t1fac[l]*gen->t2fac[l+m]*gen->t2fac[l-m];
    gen->recfac[l][1] = gen->recfac[l][0]/f_old;
    f_old = gen->recfac[l][0];
    }
  }

/*static void Ylmgen_recalc_lamfact (Ylmgen_C *gen)
  {
  int l, m;

  if (gen->lamfact_uptodate) return;
  gen->lamfact_uptodate = 1;

  m = gen->m_cur;
  gen->lamfact[m] = 0;
  for (l=m+1; l<=gen->lmax; ++l)
    gen->lamfact[l] = sqrt((2*l+1.)/(2*l-1.) * (l*l-m*m));
    }*/

#define RENORMALIZE_SCALAR \
  do \
    { \
    while (fabs(lam_2)>fbig) \
      { lam_1*=fsmall; lam_2*=fsmall; ++scale; } \
    corfac = (scale<0) ? 0. : gen->cf[scale]; \
    } \
  while(0)

static void Ylmgen_recalc_Ylm (Ylmgen_C *gen)
  {
  const double ln2 = 0.6931471805599453094172321214581766;

  double logval,lam_1,lam_2,corfac;
  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
  ylmgen_dbl2 *recfac = gen->recfac;
  int lmax=gen->lmax;
  int scale,l;
  int m = gen->m_cur;
  double cth=gen->cth[gen->ith], sth=gen->sth[gen->ith];
  double *result = gen->ylm;

  if (gen->ylm_uptodate) return;
  gen->ylm_uptodate=1;

  if (((m>=gen->m_crit)&&(fabs(cth)>=gen->cth_crit)) || ((m>0)&&(sth==0)))
    { gen->firstl[0]=gen->lmax+1; return; }

  Ylmgen_recalc_recfac(gen);

  logval = gen->mfac[m];
  if (m>0) logval += m*gen->logsth[gen->ith];
  scale = (int) (logval/large_exponent2)-minscale;
  corfac = (scale<0) ? 0. : gen->cf[scale];

  lam_1 = 0;
  lam_2 = exp(ln2*(logval-(scale+minscale)*large_exponent2));
  if (m&1) lam_2 = -lam_2;

  l=m;
  if (scale<0)
    {
    while (1)
      {
      if (++l>lmax) break;
      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
      if (++l>lmax) break;
      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
      if (fabs(lam_2)>fbig)
        {
        RENORMALIZE_SCALAR;
        if (scale>=0) break;
        }
      }
    }

  lam_1*=corfac;
  lam_2*=corfac;

  if (l<=lmax)
    {
    while (1)
      {
      if (fabs(lam_2)>eps) break;
      if (++l>lmax) break;
      lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
      if (fabs(lam_1)>eps)
        { double x=lam_1; lam_1=lam_2; lam_2=x; break; }
      if (++l>lmax) break;
      lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
      }
    }

  gen->firstl[0]=l;
  if (l>lmax)
    { gen->m_crit=m; gen->cth_crit=fabs(cth); return; }

  for(;l<lmax-3;l+=4)
    {
    result[l]=lam_2;
    lam_1 = cth*lam_2*recfac[l][0] - lam_1*recfac[l][1];
    result[l+1] = lam_1;
    lam_2 = cth*lam_1*recfac[l+1][0] - lam_2*recfac[l+1][1];
    result[l+2]=lam_2;
    lam_1 = cth*lam_2*recfac[l+2][0] - lam_1*recfac[l+2][1];
    result[l+3] = lam_1;
    lam_2 = cth*lam_1*recfac[l+3][0] - lam_2*recfac[l+3][1];
    }

  while (1)
    {
    result[l]=lam_2;
    if (++l>lmax) break;
    lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
    result[l] = lam_1;
    if (++l>lmax) break;
    lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
    }
  }


/*
static void Ylmgen_recalc_lambda_wx1 (Ylmgen_C *gen)
  {
  if (gen->lwx_uptodate[1]) return;
  Ylmgen_recalc_Ylm(gen);
  gen->firstl[1] = gen->firstl[0];
  if (gen->firstl[1]>gen->lmax) return;
  Ylmgen_recalc_lamfact(gen);
  gen->lwx_uptodate[1] = 1;

  {
  double cth=gen->cth[gen->ith];
  double xsth=1./(gen->sth[gen->ith]);
  double m=gen->m_cur;
  double m_on_sth = m*xsth;
  double lam_lm=0;
  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[1];
  int l;
  double ell;
  for (ell=l=gen->firstl[1]; l<=gen->lmax; ++l, ell+=1.)
    {
    double lam_lm1m=lam_lm;
    lam_lm=gen->ylm[l];
    lambda_wx[l][0] = xsth*(gen->lamfact[l]*lam_lm1m - ell*cth*lam_lm);
    lambda_wx[l][1] = m_on_sth*lam_lm;
    }
  }
  }

static void Ylmgen_recalc_lambda_wx2 (Ylmgen_C *gen)
  {
  if (gen->lwx_uptodate[2]) return;
  Ylmgen_recalc_Ylm(gen);
  gen->firstl[2] = gen->firstl[0];
  if (gen->firstl[2]>gen->lmax) return;
  Ylmgen_recalc_lamfact(gen);
  gen->lwx_uptodate[2] = 1;

  {
  double cth=gen->cth[gen->ith];
  double sth=gen->sth[gen->ith];
  double m=gen->m_cur;
  double one_on_s2 = 1./(sth*sth);
  double two_on_s2 = 2*one_on_s2;
  double two_c_on_s2 = cth * two_on_s2;
  double m2 = m*m;
  double two_m_on_s2 = m*two_on_s2;
  double lam_lm=0;
  ylmgen_dbl2 *lambda_wx = gen->lambda_wx[2];
  int l;
  double ell;
  for (ell=l=gen->firstl[2]; l<=gen->lmax; ++l, ell+=1.)
    {
    double lam_lm1m=lam_lm;
    lam_lm=gen->ylm[l];
    {
    const double t1  = lam_lm1m*gen->lamfact[l];
    const double a_w = (m2-ell)*two_on_s2 - ell*(ell-1.);
    const double a_x = cth*(ell-1.)*lam_lm;
    lambda_wx[l][0] = a_w*lam_lm + t1*two_c_on_s2;
    lambda_wx[l][1] = two_m_on_s2 * (t1-a_x);
    }
    }
  }
  }

void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin)
  {
  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
    "invalid spin in Ylmgen_recalc_lambda_wx");

  if (!gen->lambda_wx[spin])
    gen->lambda_wx[spin]=RALLOC(ylmgen_dbl2,gen->lmax+1);

  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1(gen); return; }
  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2(gen); return; }

  if (!gen->sylm[spin])
    {
    gen->sylm[spin]=RALLOC(sylmgen_d,1);
    sylmgen_init(gen->sylm[spin],gen,spin);
    }
  if (gen->lwx_uptodate[spin]) return;
  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
  sylmgen_recalc(gen->sylm[spin],gen,gen->ith,gen->lambda_wx[spin],
    &gen->firstl[spin]);
  gen->lwx_uptodate[spin] = 1;
  }
*/
#ifdef __SSE2__

static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m)
  {
  if ((ith1==gen->ith1) && (ith2==gen->ith2) && (m==gen->m_cur)) return;

  gen->ylm_uptodate_sse2 = 0;
  SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);

  gen->ith1 = ith1; gen->ith2 = ith2;

  if (m!=gen->m_cur)
    {
    gen->recfac_uptodate = gen->lamfact_uptodate = 0;
    gen->m_cur = m;
    }
  }


#define RENORMALIZE \
  do \
    { \
    double lam1a, lam1b, lam2a, lam2b, corfaca, corfacb; \
    read_v2df (lam_1, &lam1a, &lam1b); read_v2df (lam_2, &lam2a, &lam2b); \
    read_v2df (corfac, &corfaca, &corfacb); \
    while (fabs(lam2a)>fbig) \
      { \
      lam1a*=fsmall; lam2a*=fsmall; ++scale1; \
      corfaca = (scale1<0) ? 0. : gen->cf[scale1]; \
      } \
    while (fabs(lam2b)>fbig) \
      { \
      lam1b*=fsmall; lam2b*=fsmall; ++scale2; \
      corfacb = (scale2<0) ? 0. : gen->cf[scale2]; \
      } \
    lam_1=build_v2df(lam1a,lam1b); lam_2=build_v2df(lam2a,lam2b); \
    corfac=build_v2df(corfaca,corfacb); \
    } \
  while(0)
#define GETPRE(prea,preb,lv) \
  { \
  prea=_mm_mul_pd(_mm_set1_pd(recfac[lv][0]),cth); \
  preb=_mm_set1_pd(recfac[lv][1]); \
  }
#define NEXTSTEP(prea,preb,prec,pred,reca,recb,lv) \
  { \
  preb = _mm_mul_pd(preb,reca); \
  prea = _mm_mul_pd(prea,recb); \
  prec = _mm_set1_pd(recfac[lv][0]); \
  pred = _mm_set1_pd(recfac[lv][1]); \
  reca = _mm_sub_pd(prea,preb); \
  prec = _mm_mul_pd(cth,prec); \
  }

static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen)
  {
  const double ln2 = 0.6931471805599453094172321214581766;

  v2df lam_1,lam_2,corfac;
  double logval1,logval2;
  double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
  v2df eps2=build_v2df(eps,eps);
  v2df fbig2=build_v2df(fbig,fbig);
  ylmgen_dbl2 *recfac = gen->recfac;
  int lmax=gen->lmax;
  int scale1,scale2,l;
  int m = gen->m_cur;
  double cth1=gen->cth[gen->ith1], cth2=gen->cth[gen->ith2];
  v2df cth=build_v2df(cth1,cth2);
  v2df *result = gen->ylm_sse2;
  v2df pre0,pre1,pre2,pre3;

  if (gen->ylm_uptodate_sse2) return;
  gen->ylm_uptodate_sse2=1;

  if ((m>=gen->m_crit)&&(fabs(cth1)>=gen->cth_crit)&&(fabs(cth2)>=gen->cth_crit))
    { gen->firstl[0]=gen->lmax+1; return; }

  Ylmgen_recalc_recfac(gen);

  logval1 = logval2 = gen->mfac[m];
  if (m>0) logval1 += m*gen->logsth[gen->ith1];
  if (m>0) logval2 += m*gen->logsth[gen->ith2];
  scale1 = (int) (logval1/large_exponent2)-minscale;
  scale2 = (int) (logval2/large_exponent2)-minscale;
  corfac = build_v2df((scale1<0) ? 0. : gen->cf[scale1],
                      (scale2<0) ? 0. : gen->cf[scale2]);

  lam_1 = _mm_setzero_pd();
  lam_2 = build_v2df(exp(ln2*(logval1-(scale1+minscale)*large_exponent2)),
                     exp(ln2*(logval2-(scale2+minscale)*large_exponent2)));
  if (m&1) lam_2 = _mm_xor_pd (lam_2,V2DF_SIGNMASK); /* negate */

  l=m;
  if ((scale1<0) && (scale2<0))
    {
    GETPRE(pre0,pre1,l)
    while (1)
      {
      if (++l>lmax) break;
      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
      if (++l>lmax) break;
      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
      if (v2df_any_gt(lam_2,fbig2))
        {
        RENORMALIZE;
        if ((scale1>=0) || (scale2>=0)) break;
        }
      }
    }

  if (l<=lmax)
    {
    GETPRE(pre0,pre1,l)
    while (1)
      {
      v2df t1;
      result[l]=t1=_mm_mul_pd(lam_2,corfac);
      if (v2df_any_gt(t1,eps2))
        break;
      if (++l>lmax) break;
      NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)

      result[l]=t1=_mm_mul_pd(lam_1,corfac);
      if (v2df_any_gt(t1,eps2))
        { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
      if (++l>lmax) break;
      NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)

      if (v2df_any_gt(lam_2,fbig2))
        RENORMALIZE;
      }
    }

  gen->firstl[0]=l;
  if (l>lmax)
    {
    gen->m_crit=m;
    gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
    return;
    }

  GETPRE(pre0,pre1,l)
  while (1)
    {
    v2df t1;
    result[l]=t1=_mm_mul_pd(lam_2,corfac);
    if (v2df_all_ge(t1,eps2))
      break;
    if (++l>lmax) return;
    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)

    result[l]=t1=_mm_mul_pd(lam_1,corfac);
    if (v2df_all_ge(t1,eps2))
      { v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
    if (++l>lmax) return;
    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)

    if (v2df_any_gt(lam_2,fbig2))
      RENORMALIZE;
    }

  lam_1 = _mm_mul_pd (lam_1,corfac);
  lam_2 = _mm_mul_pd (lam_2,corfac);

  GETPRE(pre0,pre1,l)
  for(;l<lmax-2;l+=2)
    {
    result[l]=lam_2;
    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l+1)
    result[l+1]=lam_1;
    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l+2)
    }

  while (1)
    {
    result[l]=lam_2;
    if (++l>lmax) break;
    NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
    result[l] = lam_1;
    if (++l>lmax) break;
    NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
    }
  }

/*static void Ylmgen_recalc_lambda_wx1_sse2 (Ylmgen_C *gen)
  {
  if (gen->lwx_uptodate_sse2[1]) return;
  Ylmgen_recalc_Ylm_sse2(gen);
  gen->firstl[1] = gen->firstl[0];
  if (gen->firstl[1]>gen->lmax) return;
  Ylmgen_recalc_lamfact(gen);
  gen->lwx_uptodate_sse2[1] = 1;

  {
  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
  v2df xsth=build_v2df(1./gen->sth[gen->ith1],1./gen->sth[gen->ith2]);
  v2df m=build_v2df(gen->m_cur,gen->m_cur);
  v2df m_on_sth = _mm_mul_pd(m,xsth);
  v2df lam_lm=_mm_setzero_pd();
  v2df2 *lambda_wx = gen->lambda_wx_sse2[1];
  int l;
  v2df ell=build_v2df(gen->firstl[1],gen->firstl[1]);
  v2df uno=_mm_set1_pd(1.);
  for (l=gen->firstl[1]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
    {
    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
    v2df lam_lm1m=lam_lm;
    lam_lm=gen->ylm_sse2[l];
    lambda_wx[l].a = _mm_mul_pd(xsth,_mm_sub_pd(_mm_mul_pd(lamfact,lam_lm1m),
                     _mm_mul_pd(_mm_mul_pd(ell,cth),lam_lm)));
    lambda_wx[l].b = _mm_mul_pd(m_on_sth,lam_lm);
    }
  }
  }

static void Ylmgen_recalc_lambda_wx2_sse2 (Ylmgen_C *gen)
  {
  if (gen->lwx_uptodate_sse2[2]) return;
  Ylmgen_recalc_Ylm_sse2(gen);
  gen->firstl[2] = gen->firstl[0];
  if (gen->firstl[2]>gen->lmax) return;
  Ylmgen_recalc_lamfact(gen);
  gen->lwx_uptodate_sse2[2] = 1;

  {
  v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
  v2df sth=build_v2df(gen->sth[gen->ith1],gen->sth[gen->ith2]);
  v2df m=build_v2df(gen->m_cur,gen->m_cur);
  v2df uno=_mm_set1_pd(1.);
  v2df one_on_s2 = _mm_div_pd(uno,_mm_mul_pd(sth,sth));
  v2df two_on_s2 = _mm_mul_pd(_mm_set1_pd(2.),one_on_s2);
  v2df two_c_on_s2 = _mm_mul_pd(cth,two_on_s2);
  v2df m2 = _mm_mul_pd(m,m);
  v2df two_m_on_s2 = _mm_mul_pd(m,two_on_s2);
  v2df lam_lm=_mm_setzero_pd();
  v2df2 *lambda_wx = gen->lambda_wx_sse2[2];
  int l;
  v2df ell=build_v2df(gen->firstl[2],gen->firstl[2]);
  for (l=gen->firstl[2]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
    {
    v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
    v2df lam_lm1m=lam_lm;
    lam_lm=gen->ylm_sse2[l];
    {
    const v2df t1  = _mm_mul_pd(lam_lm1m,lamfact);
    const v2df ellm1 = _mm_sub_pd(ell,uno);
    const v2df a_w = _mm_sub_pd
      (_mm_mul_pd(_mm_sub_pd(m2,ell),two_on_s2),_mm_mul_pd(ell,ellm1));
    const v2df a_x = _mm_mul_pd(_mm_mul_pd(cth,ellm1),lam_lm);
    lambda_wx[l].a =
      _mm_add_pd(_mm_mul_pd(a_w,lam_lm),_mm_mul_pd(t1,two_c_on_s2));
    lambda_wx[l].b = _mm_mul_pd(two_m_on_s2,_mm_sub_pd(t1,a_x));
    }
    }
  }
  }*/

/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin)
  {
  UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
    "invalid spin in Ylmgen_recalc_lambda_wx_sse2");

  if (!gen->lambda_wx_sse2[spin])
    gen->lambda_wx_sse2[spin]=RALLOC(v2df2,gen->lmax+1);

  if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1_sse2(gen); return; }
  if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2_sse2(gen); return; }

  if (!gen->sylm[spin])
    {
    gen->sylm[spin]=RALLOC(sylmgen_d,1);
    sylmgen_init(gen->sylm[spin],gen,spin);
    }
  if (gen->lwx_uptodate_sse2[spin]) return;
  sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
  sylmgen_recalc_sse2(gen->sylm[spin],gen,gen->ith1,gen->ith2,
    gen->lambda_wx_sse2[spin],&gen->firstl[spin]);
  gen->lwx_uptodate_sse2[spin] = 1;
  }*/

#endif /* __SSE2__ */

/*
double *Ylmgen_get_norm (int lmax, int spin, int spinrec)
  {
  const double pi = 3.141592653589793238462643383279502884197;
  double *res=RALLOC(double,lmax+1);
  int l;
  double spinsign;
  / sign convention for H=1 (LensPix paper) /
#if 1
  spinsign = (spin>0) ? -1.0 : 1.0;
#else
  spinsign = 1.0;
#endif

  if (spin==0)
    {
    for (l=0; l<=lmax; ++l)
      res[l]=1.;
    return res;
    }

  if ((!spinrec) || (spin>=3))
    {
    spinsign = (spin&1) ? -spinsign : spinsign;
    for (l=0; l<=lmax; ++l)
      res[l] = (l<spin) ? 0. : spinsign*0.5*sqrt((2*l+1)/(4*pi));
    return res;
    }

  if (spin==1)
    {
    for (l=0; l<=lmax; ++l)
      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+1.)*l));
    return res;
    }

  if (spin==2)
    {
    for (l=0; l<=lmax; ++l)
      res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+2.)*(l+1.)*l*(l-1.)));
    return res;
    }

  UTIL_FAIL ("error in Ylmgen_get_norm");
  return NULL;
  }
*/


/*
New high-level wrapper
*/
#include "sharp_legendre_table.h"
#include <stdio.h>

void sharp_normalized_associated_legendre_table(
  ptrdiff_t m,
  ptrdiff_t lmax,
  ptrdiff_t ntheta,
  /* contigious 1D array of theta values to compute for,
     contains ntheta values */
  double *theta,
  /* number of columns in out; see below. Should be >= (lmax - m). */
  ptrdiff_t ncols,
  /* contigiuos 2D array, in "theta-major ordering". Has `ntheta`
     rows and `ncols` columns. Indexed as out[itheta * ncols + (l - m)].
     If `ncols > lmax - m` then those entries are not accessed.
  */
  double *out
) {

    Ylmgen_C ctx;
    ptrdiff_t itheta, l, lmin;

    Ylmgen_init(&ctx, lmax, lmax, 0, 0, 1e-300);
    Ylmgen_set_theta(&ctx, theta, ntheta);

    itheta = 0;
    #ifdef __SSE2__
    for (; itheta < ntheta - 1; itheta += 2) {
        Ylmgen_prepare_sse2(&ctx, itheta, itheta + 1, m);
        Ylmgen_recalc_Ylm_sse2(&ctx);
        lmin = IMIN(*ctx.firstl, lmax + 1);
        for (l = m; l < lmin; ++l) {
            out[itheta * ncols + l - m] = 0;
            out[(itheta + 1) * ncols + l - m ] = 0;
        }
        for (l = IMAX(lmin, m); l <= lmax; ++l) {
            double v1, v2;
            read_v2df(ctx.ylm_sse2[l], &v1, &v2);
            out[itheta * ncols + l - m] = v1;
            out[(itheta + 1) * ncols + l - m] = v2;
        }
    }
    #endif
    for (; itheta < ntheta; itheta += 1) {
        Ylmgen_prepare(&ctx, itheta, m);
        Ylmgen_recalc_Ylm(&ctx);
        lmin = IMIN(*ctx.firstl, lmax + 1);
        for (l = m; l < lmin; ++l) {
            out[itheta * ncols + l - m] = 0;
        }
        for (l = IMAX(lmin, m); l <= lmax; ++l) {
            out[itheta * ncols + l - m] = ctx.ylm[l];
        }
    }
    Ylmgen_destroy(&ctx);
}