1467 lines
42 KiB
C
1467 lines
42 KiB
C
/*
|
|
|
|
This file originated as a concatenation of files from libpsht. Further refactoring
|
|
could be carried out to make the code use libsharp conventions instead for SSE etc.;
|
|
|
|
*/
|
|
|
|
|
|
/*
|
|
sse_utils.h
|
|
*/
|
|
/*
|
|
* This file is part of libc_utils.
|
|
*
|
|
* libc_utils is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* libc_utils is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with libc_utils; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
/*
|
|
* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
|
|
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
|
* (DLR).
|
|
*/
|
|
|
|
/*! \file sse_utils.h
|
|
* SSE/SSE2/SSE3-related functionality
|
|
*
|
|
* Copyright (C) 2010,2011 Max-Planck-Society
|
|
* \author Martin Reinecke
|
|
*/
|
|
|
|
|
|
#if (defined(__SSE__))
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
|
|
typedef __m128 v4sf; /* vector of 4 floats (SSE1) */
|
|
|
|
typedef union {
|
|
float f[4];
|
|
v4sf v;
|
|
} V4SF;
|
|
|
|
static inline v4sf build_v4sf (float a, float b, float c, float d)
|
|
{ return _mm_set_ps(d,c,b,a); }
|
|
static inline void read_v4sf (v4sf v, float *a, float *b, float *c, float *d)
|
|
{
|
|
V4SF tmp;
|
|
tmp.v = v;
|
|
if (a) *a=tmp.f[0];
|
|
if (b) *b=tmp.f[1];
|
|
if (c) *c=tmp.f[2];
|
|
if (d) *d=tmp.f[3];
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
#if (defined(__SSE2__))
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
|
typedef __m128d v2df; /* vector of 2 doubles (SSE2) */
|
|
|
|
typedef union {
|
|
double d[2];
|
|
v2df v;
|
|
} V2DF;
|
|
|
|
typedef struct {
|
|
v2df a,b;
|
|
} v2df2;
|
|
typedef struct {
|
|
V2DF a,b;
|
|
} V2DF2;
|
|
|
|
#define V2DF_SIGNMASK _mm_set1_pd(-0.0)
|
|
|
|
static inline v2df build_v2df (double a, double b)
|
|
{ return _mm_set_pd(b,a); }
|
|
static inline void read_v2df (v2df v, double *a, double *b)
|
|
{ _mm_store_sd(a,v); _mm_storeh_pd(b,v); }
|
|
|
|
static inline int v2df_any_gt (v2df a, v2df b)
|
|
{
|
|
return (_mm_movemask_pd(_mm_cmpgt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))!=0);
|
|
}
|
|
static inline int v2df_all_ge (v2df a, v2df b)
|
|
{
|
|
return (_mm_movemask_pd(_mm_cmplt_pd(_mm_andnot_pd(V2DF_SIGNMASK,a),b))==0);
|
|
}
|
|
static inline V2DF to_V2DF (v2df x)
|
|
{ V2DF X; X.v=x; return X; }
|
|
static inline V2DF2 to_V2DF2 (v2df2 x)
|
|
{ V2DF2 X; X.a.v=x.a; X.b.v=x.b; return X; }
|
|
static inline v2df2 to_v2df2 (V2DF2 X)
|
|
{ v2df2 x; x.a=X.a.v; x.b=X.b.v; return x; }
|
|
static inline v2df2 zero_v2df2(void)
|
|
{ v2df2 x; x.a=x.b=_mm_setzero_pd(); return x; }
|
|
|
|
|
|
#endif
|
|
|
|
#if (defined(__SSE3__))
|
|
|
|
#include <pmmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
ylmgen_c.h
|
|
*/
|
|
/*
|
|
* This file is part of libpsht.
|
|
*
|
|
* libpsht is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* libpsht is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with libpsht; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
/*
|
|
* libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
|
|
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
|
* (DLR).
|
|
*/
|
|
|
|
/*! \file ylmgen_c.h
|
|
* Code for efficient calculation of Y_lm(phi=0,theta)
|
|
*
|
|
* Copyright (C) 2005-2011 Max-Planck-Society
|
|
* \author Martin Reinecke
|
|
*/
|
|
|
|
typedef double ylmgen_dbl2[2];
|
|
typedef double ylmgen_dbl3[3];
|
|
|
|
typedef struct
|
|
{
|
|
double cth_crit;
|
|
int mdist_crit;
|
|
/* members depending on m and m' */
|
|
int s, m, mlo, mhi, cosPow, sinPow;
|
|
long double prefactor;
|
|
ylmgen_dbl3 *fx;
|
|
int preMinus_p, preMinus_m;
|
|
} sylmgen_d;
|
|
|
|
typedef struct
|
|
{
|
|
double fsmall, fbig, eps, cth_crit;
|
|
int lmax, mmax, smax, m_cur, ith, nth, m_crit, spinrec;
|
|
/*! The index of the first non-negligible Y_lm value. */
|
|
int *firstl;
|
|
double *cf, *mfac, *t1fac, *t2fac, *th, *cth, *sth, *logsth;
|
|
ylmgen_dbl2 *recfac;
|
|
double *lamfact;
|
|
/*! Points to an array of size [0..lmax] containing the Y_lm values. */
|
|
double *ylm;
|
|
/*! Points to an array of size [0..lmax] containing the lambda_w
|
|
and lambda_x values for spin>0 transforms. */
|
|
ylmgen_dbl2 **lambda_wx;
|
|
long double *logsum, *lc05, *ls05;
|
|
double *flm1, *flm2, *xl;
|
|
|
|
sylmgen_d **sylm;
|
|
|
|
int *lwx_uptodate;
|
|
int ylm_uptodate;
|
|
|
|
#ifdef __SSE2__
|
|
int ith1, ith2;
|
|
/*! Points to an array of size [0..lmax] containing the Y_lm values. */
|
|
v2df *ylm_sse2;
|
|
/*! Points to an array of size [0..lmax] containing the lambda_w
|
|
and lambda_x values for spin>0 transforms. */
|
|
v2df2 **lambda_wx_sse2;
|
|
int *lwx_uptodate_sse2;
|
|
int ylm_uptodate_sse2;
|
|
#endif
|
|
|
|
int recfac_uptodate, lamfact_uptodate;
|
|
} Ylmgen_C;
|
|
|
|
/*! Creates a generator which will calculate Y_lm(theta,phi=0)
|
|
up to \a l=l_max and \a m=m_max. It may regard Y_lm whose absolute
|
|
magnitude is smaller than \a epsilon as zero. If \a spinrec is nonzero,
|
|
the spin-1 and spin-2 Y_lm will be calculated by recursion from the spin-0
|
|
ones, otherwise Wigner d matrix elements will be used. */
|
|
static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
|
|
double epsilon);
|
|
|
|
/*! Passes am array \a theta of \a nth colatitudes that will be used in
|
|
subsequent calls. The individual angles will be referenced by their
|
|
index in the array, starting with 0.
|
|
\note The array can be freed or reused after the call. */
|
|
static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth);
|
|
|
|
/*! Deallocates a generator previously initialised by Ylmgen_init(). */
|
|
static void Ylmgen_destroy (Ylmgen_C *gen);
|
|
|
|
/*! Prepares the object for the calculation at \a theta and \a m. */
|
|
static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m);
|
|
|
|
/*! Recalculates (if necessary) the Y_lm values. */
|
|
static void Ylmgen_recalc_Ylm (Ylmgen_C *gen);
|
|
/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
|
|
transforms. */
|
|
/*static void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin);*/
|
|
|
|
#ifdef __SSE2__
|
|
/*! Prepares the object for the calculation at \a theta, \a theta2 and \a m. */
|
|
static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m);
|
|
|
|
/*! Recalculates (if necessary) the Y_lm values. */
|
|
static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen);
|
|
/*! Recalculates (if necessary) the lambda_w and lambda_x values for spin >0
|
|
transforms. */
|
|
/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin);*/
|
|
#endif
|
|
|
|
/*! Returns a pointer to an array with lmax+1 entries containing normalisation
|
|
factors that must be applied to Y_lm values computed for \a spin with the
|
|
given \a spinrec flag. The array must be deallocated (using free()) by the
|
|
user. */
|
|
/*static double *Ylmgen_get_norm (int lmax, int spin, int spinrec);*/
|
|
|
|
|
|
/*
|
|
ylmgen_c.c
|
|
*/
|
|
|
|
/*
|
|
* This file is part of libpsht.
|
|
*
|
|
* libpsht is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* libpsht is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with libpsht; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
/*
|
|
* libpsht is being developed at the Max-Planck-Institut fuer Astrophysik
|
|
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
|
* (DLR).
|
|
*/
|
|
|
|
/*
|
|
* Code for efficient calculation of Y_lm(theta,phi=0)
|
|
*
|
|
* Copyright (C) 2005-2011 Max-Planck-Society
|
|
* Author: Martin Reinecke
|
|
*/
|
|
|
|
#include <math.h>
|
|
#include <stdlib.h>
|
|
#include "c_utils.h"
|
|
|
|
enum { large_exponent2=90, minscale=-4, maxscale=11 };
|
|
|
|
/*static void sylmgen_init (sylmgen_d *gen, const Ylmgen_C *ygen, int spin)
|
|
{
|
|
int i;
|
|
UTIL_ASSERT(spin>=1,"incorrect spin");
|
|
gen->s=spin;
|
|
gen->m=gen->mlo=gen->mhi=-1234567890;
|
|
ALLOC(gen->fx,ylmgen_dbl3,ygen->lmax+2);
|
|
|
|
for (i=0; i<ygen->lmax+2; ++i)
|
|
gen->fx[i][0]=gen->fx[i][1]=gen->fx[i][2]=0.;
|
|
|
|
gen->cth_crit = 2.;
|
|
gen->mdist_crit = ygen->lmax+1;
|
|
}*/
|
|
|
|
static void sylmgen_destroy (sylmgen_d *gen)
|
|
{ DEALLOC(gen->fx); }
|
|
|
|
/*static void sylmgen_prepare (sylmgen_d *gen, const Ylmgen_C *ygen, int m_)
|
|
{
|
|
int mlo_, mhi_, ms_similar, l;
|
|
|
|
if (m_==gen->m) return;
|
|
UTIL_ASSERT(m_>=0,"incorrect m");
|
|
|
|
mlo_=m_; mhi_=gen->s;
|
|
if (mhi_<mlo_) SWAP(mhi_,mlo_,int);
|
|
ms_similar = ((gen->mhi==mhi_) && (gen->mlo==mlo_));
|
|
|
|
gen->m=m_;
|
|
gen->mlo = mlo_; gen->mhi = mhi_;
|
|
|
|
if (!ms_similar)
|
|
{
|
|
for (l=gen->mhi; l<ygen->lmax; ++l)
|
|
{
|
|
double t = ygen->flm1[l+gen->m]*ygen->flm1[l-gen->m]
|
|
*ygen->flm1[l+gen->s]*ygen->flm1[l-gen->s];
|
|
double lt = 2*l+1;
|
|
double l1 = l+1;
|
|
gen->fx[l+1][0]=l1*lt*t;
|
|
gen->fx[l+1][1]=gen->m*gen->s*ygen->xl[l]*ygen->xl[l+1];
|
|
t = ygen->flm2[l+gen->m]*ygen->flm2[l-gen->m]
|
|
*ygen->flm2[l+gen->s]*ygen->flm2[l-gen->s];
|
|
gen->fx[l+1][2]=t*l1*ygen->xl[l];
|
|
}
|
|
gen->prefactor = 0.5L*(ygen->logsum[2*gen->mhi]
|
|
-ygen->logsum[gen->mhi+gen->mlo]-ygen->logsum[gen->mhi-gen->mlo]);
|
|
}
|
|
|
|
gen->preMinus_p = gen->preMinus_m = 0;
|
|
if (gen->mhi==gen->m)
|
|
{
|
|
gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s;
|
|
gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1);
|
|
}
|
|
else
|
|
{
|
|
gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m;
|
|
gen->preMinus_m = ((gen->mhi+gen->m)&1);
|
|
}
|
|
}*/
|
|
|
|
/*static void sylmgen_recalc (sylmgen_d *gen, const Ylmgen_C *ygen, int ith,
|
|
ylmgen_dbl2 *res, int *firstl)
|
|
{
|
|
const double ln2 = 0.6931471805599453094172321214581766;
|
|
const double inv_ln2 = 1.4426950408889634073599246810018921;
|
|
int l=gen->mhi;
|
|
int lmax = ygen->lmax;
|
|
ylmgen_dbl3 *fy = gen->fx;
|
|
const double fsmall = ygen->fsmall, fbig = ygen->fbig, eps = ygen->eps;
|
|
const double cth = ygen->cth[ith];
|
|
long double logvalp = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith]*gen->cosPow + ygen->ls05[ith]*gen->sinPow);
|
|
long double logvalm = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith]*gen->sinPow + ygen->ls05[ith]*gen->cosPow);
|
|
int scalep = (int)(logvalp/large_exponent2)-minscale;
|
|
int scalem = (int)(logvalm/large_exponent2)-minscale;
|
|
double rec1p=0., rec1m=0.;
|
|
double rec2p = exp(ln2*(double)(logvalp-(scalep+minscale)*large_exponent2));
|
|
double rec2m = exp(ln2*(double)(logvalm-(scalem+minscale)*large_exponent2));
|
|
double corfacp,corfacm;
|
|
double tp,tm;
|
|
|
|
if ((abs(gen->m-gen->s)>=gen->mdist_crit)&&(fabs(cth)>=gen->cth_crit))
|
|
{ *firstl=ygen->lmax+1; return; }
|
|
|
|
if (gen->preMinus_p)
|
|
rec2p=-rec2p;
|
|
if (gen->preMinus_m)
|
|
rec2m=-rec2m;
|
|
if (gen->s&1)
|
|
rec2p=-rec2p;
|
|
|
|
/ iterate until we reach the realm of IEEE numbers /
|
|
while((scalem<0)&&(scalep<0))
|
|
{
|
|
if (++l>lmax) break;
|
|
rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
|
|
rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
|
|
if (++l>lmax) break;
|
|
rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
|
|
rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
|
|
|
|
while (fabs(rec2p)>fbig)
|
|
{ rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
|
|
while (fabs(rec2m)>fbig)
|
|
{ rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
|
|
}
|
|
|
|
corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
|
|
corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
|
|
|
|
if (l<=lmax)
|
|
{
|
|
while (1)
|
|
{
|
|
if ((fabs(rec2p*corfacp)>eps) || (fabs(rec2m*corfacm)>eps))
|
|
break;
|
|
if (++l>lmax) break;
|
|
rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
|
|
rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
|
|
if ((fabs(rec1p*corfacp)>eps) || (fabs(rec1m*corfacm)>eps))
|
|
{ SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
|
|
if (++l>lmax) break;
|
|
rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
|
|
rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
|
|
|
|
if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
|
|
{
|
|
while (fabs(rec2p)>fbig)
|
|
{ rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
|
|
while (fabs(rec2m)>fbig)
|
|
{ rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
|
|
corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
|
|
corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
|
|
}
|
|
}
|
|
}
|
|
|
|
*firstl=l;
|
|
if (l>lmax)
|
|
{
|
|
gen->mdist_crit=abs(gen->m-gen->s);
|
|
gen->cth_crit= fabs(cth);
|
|
return;
|
|
}
|
|
|
|
tp = rec2p*corfacp; tm = rec2m*corfacm;
|
|
res[l][0]=tp+tm;
|
|
res[l][1]=tm-tp;
|
|
|
|
while (1)
|
|
{
|
|
if ((fabs(tp)>eps) && (fabs(tm)>eps))
|
|
break;
|
|
if (++l>lmax) break;
|
|
rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
|
|
rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
|
|
tp=rec1p*corfacp; tm=rec1m*corfacm;
|
|
res[l][0]=tp+tm; res[l][1]=tm-tp;
|
|
if ((fabs(tp)>eps) && (fabs(tm)>eps))
|
|
{ SWAP(rec1p,rec2p,double); SWAP(rec1m,rec2m,double); break; }
|
|
if (++l>lmax) break;
|
|
rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
|
|
rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
|
|
tp=rec2p*corfacp; tm=rec2m*corfacm;
|
|
res[l][0]=tp+tm; res[l][1]=tm-tp;
|
|
|
|
if ((fabs(rec2p)>fbig) || (fabs(rec2m)>fbig))
|
|
{
|
|
while (fabs(rec2p)>fbig)
|
|
{ rec1p *= fsmall; rec2p *= fsmall; ++scalep; }
|
|
while (fabs(rec2m)>fbig)
|
|
{ rec1m *= fsmall; rec2m *= fsmall; ++scalem; }
|
|
corfacp = (scalep<0) ? 0. : ygen->cf[scalep];
|
|
corfacm = (scalem<0) ? 0. : ygen->cf[scalem];
|
|
}
|
|
}
|
|
|
|
rec1p *= corfacp; rec2p *= corfacp;
|
|
rec1m *= corfacm; rec2m *= corfacm;
|
|
|
|
for (;l<lmax-1;l+=2)
|
|
{
|
|
rec1p = (cth - fy[l+1][1])*fy[l+1][0]*rec2p - fy[l+1][2]*rec1p;
|
|
rec1m = (cth + fy[l+1][1])*fy[l+1][0]*rec2m - fy[l+1][2]*rec1m;
|
|
res[l+1][0] = rec1p+rec1m; res[l+1][1] = rec1m-rec1p;
|
|
rec2p = (cth - fy[l+2][1])*fy[l+2][0]*rec1p - fy[l+2][2]*rec2p;
|
|
rec2m = (cth + fy[l+2][1])*fy[l+2][0]*rec1m - fy[l+2][2]*rec2m;
|
|
res[l+2][0] = rec2p+rec2m; res[l+2][1] = rec2m-rec2p;
|
|
}
|
|
while (1)
|
|
{
|
|
if (++l>lmax) break;
|
|
rec1p = (cth - fy[l][1])*fy[l][0]*rec2p - fy[l][2]*rec1p;
|
|
rec1m = (cth + fy[l][1])*fy[l][0]*rec2m - fy[l][2]*rec1m;
|
|
res[l][0] = rec1p+rec1m; res[l][1] = rec1m-rec1p;
|
|
if (++l>lmax) break;
|
|
rec2p = (cth - fy[l][1])*fy[l][0]*rec1p - fy[l][2]*rec2p;
|
|
rec2m = (cth + fy[l][1])*fy[l][0]*rec1m - fy[l][2]*rec2m;
|
|
res[l][0] = rec2p+rec2m; res[l][1] = rec2m-rec2p;
|
|
}
|
|
}*/
|
|
|
|
#ifdef __SSE2__
|
|
|
|
#define ADVANCE(L,ap,am,bp,bm) \
|
|
{ \
|
|
v2df f0=_mm_set1_pd(fy[L][0]), \
|
|
f1=_mm_set1_pd(fy[L][1]), \
|
|
f2=_mm_set1_pd(fy[L][2]); \
|
|
ap = _mm_sub_pd( \
|
|
_mm_mul_pd(_mm_sub_pd(cth,f1),_mm_mul_pd(f0,bp)), \
|
|
_mm_mul_pd(f2,ap)); \
|
|
am = _mm_sub_pd( \
|
|
_mm_mul_pd(_mm_add_pd(cth,f1),_mm_mul_pd(f0,bm)), \
|
|
_mm_mul_pd(f2,am)); \
|
|
}
|
|
|
|
#define RENORMX(r1,r2,corf,sca,scb) \
|
|
do \
|
|
{ \
|
|
double rec1a, rec1b, rec2a, rec2b, corfaca, corfacb; \
|
|
read_v2df (r1, &rec1a, &rec1b); read_v2df (r2, &rec2a, &rec2b); \
|
|
read_v2df (corf, &corfaca, &corfacb); \
|
|
while (fabs(rec2a)>fbig) \
|
|
{ \
|
|
rec1a*=fsmall; rec2a*=fsmall; ++sca; \
|
|
corfaca = (sca<0) ? 0. : ygen->cf[sca]; \
|
|
} \
|
|
while (fabs(rec2b)>fbig) \
|
|
{ \
|
|
rec1b*=fsmall; rec2b*=fsmall; ++scb; \
|
|
corfacb = (scb<0) ? 0. : ygen->cf[scb]; \
|
|
} \
|
|
r1=build_v2df(rec1a,rec1b); r2=build_v2df(rec2a,rec2b); \
|
|
corf=build_v2df(corfaca,corfacb); \
|
|
} \
|
|
while(0)
|
|
|
|
#define RENORM \
|
|
RENORMX(rec1p,rec2p,corfacp,scale1p,scale2p); \
|
|
RENORMX(rec1m,rec2m,corfacm,scale1m,scale2m)
|
|
|
|
/*static void sylmgen_recalc_sse2 (sylmgen_d *gen, const Ylmgen_C *ygen,
|
|
int ith1, int ith2, v2df2 *res, int *firstl)
|
|
{
|
|
const double ln2 = 0.6931471805599453094172321214581766;
|
|
const double inv_ln2 = 1.4426950408889634073599246810018921;
|
|
int l=gen->mhi;
|
|
int lmax = ygen->lmax;
|
|
ylmgen_dbl3 *fy = gen->fx;
|
|
const double fbig=ygen->fbig, fsmall=ygen->fsmall;
|
|
v2df eps2 = build_v2df(ygen->eps,ygen->eps);
|
|
const double cth1=ygen->cth[ith1], cth2=ygen->cth[ith2];
|
|
v2df cth = build_v2df(cth1,cth2);
|
|
long double
|
|
logval1p = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith1]*gen->cosPow + ygen->ls05[ith1]*gen->sinPow),
|
|
logval2p = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith2]*gen->cosPow + ygen->ls05[ith2]*gen->sinPow),
|
|
logval1m = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith1]*gen->sinPow + ygen->ls05[ith1]*gen->cosPow),
|
|
logval2m = inv_ln2*(gen->prefactor
|
|
+ ygen->lc05[ith2]*gen->sinPow + ygen->ls05[ith2]*gen->cosPow);
|
|
|
|
int scale1p = (int)(logval1p/large_exponent2)-minscale,
|
|
scale2p = (int)(logval2p/large_exponent2)-minscale,
|
|
scale1m = (int)(logval1m/large_exponent2)-minscale,
|
|
scale2m = (int)(logval2m/large_exponent2)-minscale;
|
|
|
|
v2df rec1p =_mm_setzero_pd(), rec1m=_mm_setzero_pd();
|
|
v2df rec2p = build_v2df(
|
|
exp(ln2*(double)(logval1p-(scale1p+minscale)*large_exponent2)),
|
|
exp(ln2*(double)(logval2p-(scale2p+minscale)*large_exponent2)));
|
|
v2df rec2m = build_v2df(
|
|
exp(ln2*(double)(logval1m-(scale1m+minscale)*large_exponent2)),
|
|
exp(ln2*(double)(logval2m-(scale2m+minscale)*large_exponent2)));
|
|
v2df corfacp=build_v2df((scale1p<0) ? 0. : ygen->cf[scale1p],
|
|
(scale2p<0) ? 0. : ygen->cf[scale2p]),
|
|
corfacm=build_v2df((scale1m<0) ? 0. : ygen->cf[scale1m],
|
|
(scale2m<0) ? 0. : ygen->cf[scale2m]);
|
|
v2df tp,tm;
|
|
|
|
if ((abs(gen->m-gen->s)>=gen->mdist_crit)
|
|
&&(fabs(ygen->cth[ith1])>=gen->cth_crit)
|
|
&&(fabs(ygen->cth[ith2])>=gen->cth_crit))
|
|
{ *firstl=ygen->lmax+1; return; }
|
|
|
|
if (gen->preMinus_p)
|
|
rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
|
|
if (gen->preMinus_m)
|
|
rec2m = _mm_xor_pd (rec2m,V2DF_SIGNMASK); / negate /
|
|
if (gen->s&1)
|
|
rec2p = _mm_xor_pd (rec2p,V2DF_SIGNMASK); / negate /
|
|
|
|
/ iterate until we reach the realm of IEEE numbers /
|
|
while((scale1m<0)&&(scale1p<0)&&(scale2m<0)&&(scale2p<0))
|
|
{
|
|
if (++l>lmax) break;
|
|
ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
|
|
if (++l>lmax) break;
|
|
ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
|
|
|
|
RENORM;
|
|
}
|
|
|
|
if (l<=lmax)
|
|
{
|
|
while (1)
|
|
{
|
|
if (v2df_any_gt(_mm_mul_pd(rec2p,corfacp),eps2) ||
|
|
v2df_any_gt(_mm_mul_pd(rec2m,corfacm),eps2))
|
|
break;
|
|
if (++l>lmax) break;
|
|
ADVANCE (l,rec1p,rec1m,rec2p,rec2m)
|
|
if (v2df_any_gt(_mm_mul_pd(rec1p,corfacp),eps2) ||
|
|
v2df_any_gt(_mm_mul_pd(rec1m,corfacm),eps2))
|
|
{ SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
|
|
if (++l>lmax) break;
|
|
ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
|
|
|
|
RENORM;
|
|
}
|
|
}
|
|
|
|
*firstl=l;
|
|
if (l>lmax)
|
|
{
|
|
gen->mdist_crit=abs(gen->m-gen->s);
|
|
gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
|
|
return;
|
|
}
|
|
tp = _mm_mul_pd(rec2p,corfacp); tm = _mm_mul_pd(rec2m,corfacm);
|
|
res[l].a=_mm_add_pd(tp,tm);
|
|
res[l].b=_mm_sub_pd(tm,tp);
|
|
|
|
while (1)
|
|
{
|
|
if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
|
|
break;
|
|
if (++l>lmax) break;
|
|
ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
|
|
tp=_mm_mul_pd(rec1p,corfacp); tm=_mm_mul_pd(rec1m,corfacm);
|
|
res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
|
|
if (v2df_all_ge(tp,eps2) && v2df_all_ge(tm,eps2))
|
|
{ SWAP(rec1p,rec2p,v2df); SWAP(rec1m,rec2m,v2df); break; }
|
|
if (++l>lmax) break;
|
|
ADVANCE (l,rec2p,rec2m,rec1p,rec1m)
|
|
tp=_mm_mul_pd(rec2p,corfacp); tm=_mm_mul_pd(rec2m,corfacm);
|
|
res[l].a=_mm_add_pd(tp,tm); res[l].b=_mm_sub_pd(tm,tp);
|
|
|
|
RENORM;
|
|
}
|
|
|
|
rec1p = _mm_mul_pd(rec1p,corfacp); rec2p = _mm_mul_pd(rec2p,corfacp);
|
|
rec1m = _mm_mul_pd(rec1m,corfacm); rec2m = _mm_mul_pd(rec2m,corfacm);
|
|
|
|
for (;l<lmax-1;l+=2)
|
|
{
|
|
ADVANCE(l+1,rec1p,rec1m,rec2p,rec2m)
|
|
res[l+1].a=_mm_add_pd(rec1p,rec1m); res[l+1].b=_mm_sub_pd(rec1m,rec1p);
|
|
ADVANCE(l+2,rec2p,rec2m,rec1p,rec1m)
|
|
res[l+2].a=_mm_add_pd(rec2p,rec2m); res[l+2].b=_mm_sub_pd(rec2m,rec2p);
|
|
}
|
|
while (1)
|
|
{
|
|
if (++l>lmax) break;
|
|
ADVANCE(l,rec1p,rec1m,rec2p,rec2m)
|
|
res[l].a=_mm_add_pd(rec1p,rec1m); res[l].b=_mm_sub_pd(rec1m,rec1p);
|
|
if (++l>lmax) break;
|
|
ADVANCE(l,rec2p,rec2m,rec1p,rec1m)
|
|
res[l].a=_mm_add_pd(rec2p,rec2m); res[l].b=_mm_sub_pd(rec2m,rec2p);
|
|
}
|
|
}
|
|
*/
|
|
#endif
|
|
|
|
static void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int s_max, int spinrec,
|
|
double epsilon)
|
|
{
|
|
int m;
|
|
const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
|
|
const double inv_ln2 = 1.4426950408889634073599246810018921;
|
|
|
|
gen->fsmall = ldexp(1.,-large_exponent2);
|
|
gen->fbig = ldexp(1., large_exponent2);
|
|
gen->eps = epsilon;
|
|
gen->cth_crit = 2.;
|
|
gen->ith = -1;
|
|
gen->nth = 0;
|
|
gen->lmax = l_max;
|
|
gen->mmax = m_max;
|
|
gen->smax = s_max;
|
|
gen->spinrec = spinrec;
|
|
gen->m_cur = -1;
|
|
gen->m_crit = gen->mmax+1;
|
|
gen->firstl = RALLOC(int,gen->smax+1);
|
|
for (m=0; m<=gen->smax; ++m) gen->firstl[m]=-1;
|
|
gen->cf = RALLOC(double,maxscale-minscale+1);
|
|
for (m=0; m<(maxscale-minscale+1); ++m)
|
|
gen->cf[m] = ldexp(1.,(m+minscale)*large_exponent2);
|
|
gen->recfac = RALLOC(ylmgen_dbl2,gen->lmax+1);
|
|
gen->mfac = RALLOC(double,gen->mmax+1);
|
|
gen->mfac[0] = 1;
|
|
for (m=1; m<=gen->mmax; ++m)
|
|
gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
|
|
for (m=0; m<=gen->mmax; ++m)
|
|
gen->mfac[m] = inv_ln2*log(inv_sqrt4pi*gen->mfac[m]);
|
|
|
|
gen->t1fac = RALLOC(double,gen->lmax+1);
|
|
for (m=0; m<=gen->lmax; ++m)
|
|
gen->t1fac[m] = sqrt(4.*(m+1)*(m+1)-1.);
|
|
gen->t2fac = RALLOC(double,gen->lmax+gen->mmax+1);
|
|
for (m=0; m<=gen->lmax+gen->mmax; ++m)
|
|
gen->t2fac[m] = 1./sqrt(m+1.);
|
|
|
|
gen->lamfact = RALLOC(double,gen->lmax+1);
|
|
gen->ylm = RALLOC(double,gen->lmax+1);
|
|
ALLOC(gen->lambda_wx,ylmgen_dbl2 *,gen->smax+1);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
gen->lambda_wx[m]=NULL;
|
|
|
|
gen->sylm = RALLOC(sylmgen_d *,gen->smax+1);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
gen->sylm[m]=NULL;
|
|
|
|
gen->ylm_uptodate = 0;
|
|
gen->lwx_uptodate = RALLOC(int,gen->smax+1);
|
|
SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
|
|
gen->recfac_uptodate = 0;
|
|
gen->lamfact_uptodate = 0;
|
|
|
|
gen->th = gen->cth = gen->sth = gen->logsth = NULL;
|
|
|
|
#ifdef __SSE2__
|
|
gen->ith1 = gen->ith2 = -1;
|
|
gen->ylm_sse2 = RALLOC(v2df,gen->lmax+1);
|
|
ALLOC(gen->lambda_wx_sse2,v2df2 *,gen->smax+1);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
gen->lambda_wx_sse2[m]=NULL;
|
|
gen->ylm_uptodate_sse2 = 0;
|
|
gen->lwx_uptodate_sse2 = RALLOC(int,gen->smax+1);
|
|
SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
|
|
#endif
|
|
|
|
ALLOC(gen->logsum,long double,2*gen->lmax+1);
|
|
gen->lc05 = gen->ls05 = NULL;
|
|
ALLOC(gen->flm1,double,2*gen->lmax+1);
|
|
ALLOC(gen->flm2,double,2*gen->lmax+1);
|
|
ALLOC(gen->xl,double,gen->lmax+1);
|
|
|
|
gen->logsum[0] = 0.;
|
|
for (m=1; m<2*gen->lmax+1; ++m)
|
|
gen->logsum[m] = gen->logsum[m-1]+logl((long double)m);
|
|
for (m=0; m<2*gen->lmax+1; ++m)
|
|
{
|
|
gen->flm1[m] = sqrt(1./(m+1.));
|
|
gen->flm2[m] = sqrt(m/(m+1.));
|
|
}
|
|
|
|
gen->xl[0]=0;
|
|
for (m=1; m<gen->lmax+1; ++m) gen->xl[m]=1./m;
|
|
}
|
|
|
|
static void Ylmgen_destroy (Ylmgen_C *gen)
|
|
{
|
|
int m;
|
|
|
|
DEALLOC(gen->firstl);
|
|
DEALLOC(gen->cf);
|
|
DEALLOC(gen->recfac);
|
|
DEALLOC(gen->mfac);
|
|
DEALLOC(gen->t1fac);
|
|
DEALLOC(gen->t2fac);
|
|
DEALLOC(gen->lamfact);
|
|
DEALLOC(gen->ylm);
|
|
DEALLOC(gen->lwx_uptodate);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
DEALLOC(gen->lambda_wx[m]);
|
|
DEALLOC(gen->lambda_wx);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
if (gen->sylm[m])
|
|
{
|
|
sylmgen_destroy (gen->sylm[m]);
|
|
DEALLOC(gen->sylm[m]);
|
|
}
|
|
DEALLOC(gen->sylm);
|
|
DEALLOC(gen->th);
|
|
DEALLOC(gen->cth);
|
|
DEALLOC(gen->sth);
|
|
DEALLOC(gen->logsth);
|
|
DEALLOC(gen->logsum);
|
|
DEALLOC(gen->lc05);
|
|
DEALLOC(gen->ls05);
|
|
DEALLOC(gen->flm1);
|
|
DEALLOC(gen->flm2);
|
|
DEALLOC(gen->xl);
|
|
#ifdef __SSE2__
|
|
DEALLOC(gen->ylm_sse2);
|
|
for (m=0; m<=gen->smax; ++m)
|
|
DEALLOC(gen->lambda_wx_sse2[m]);
|
|
DEALLOC(gen->lambda_wx_sse2);
|
|
DEALLOC(gen->lwx_uptodate_sse2);
|
|
#endif
|
|
}
|
|
|
|
static void Ylmgen_set_theta (Ylmgen_C *gen, const double *theta, int nth)
|
|
{
|
|
const double inv_ln2 = 1.4426950408889634073599246810018921;
|
|
int m;
|
|
DEALLOC(gen->th);
|
|
DEALLOC(gen->cth);
|
|
DEALLOC(gen->sth);
|
|
DEALLOC(gen->logsth);
|
|
DEALLOC(gen->lc05);
|
|
DEALLOC(gen->ls05);
|
|
gen->th = RALLOC(double,nth);
|
|
gen->cth = RALLOC(double,nth);
|
|
gen->sth = RALLOC(double,nth);
|
|
gen->logsth = RALLOC(double,nth);
|
|
gen->lc05 = RALLOC(long double,nth);
|
|
gen->ls05 = RALLOC(long double,nth);
|
|
for (m=0; m<nth; ++m)
|
|
{
|
|
const double pi = 3.141592653589793238462643383279502884197;
|
|
double th=theta[m];
|
|
UTIL_ASSERT ((th>=0.)&&(th<=pi),"bad theta angle specified");
|
|
/* tiny adjustments to make sure cos and sin (theta/2) are positive */
|
|
if (th==0.) th=1e-16;
|
|
if (ABSAPPROX(th,pi,1e-15)) th=pi-1e-15;
|
|
gen->th[m] = th;
|
|
gen->cth[m] = cos(th);
|
|
gen->sth[m] = sin(th);
|
|
gen->logsth[m] = inv_ln2*log(gen->sth[m]);
|
|
gen->lc05[m]=logl(cosl(0.5L*th));
|
|
gen->ls05[m]=logl(sinl(0.5L*th));
|
|
}
|
|
|
|
gen->nth = nth;
|
|
gen->ith = -1;
|
|
#ifdef __SSE2__
|
|
gen->ith1 = gen->ith2 = -1;
|
|
#endif
|
|
}
|
|
|
|
static void Ylmgen_prepare (Ylmgen_C *gen, int ith, int m)
|
|
{
|
|
if ((ith==gen->ith) && (m==gen->m_cur)) return;
|
|
|
|
gen->ylm_uptodate = 0;
|
|
SET_ARRAY(gen->lwx_uptodate,0,gen->smax+1,0);
|
|
|
|
gen->ith = ith;
|
|
|
|
if (m!=gen->m_cur)
|
|
{
|
|
gen->recfac_uptodate = 0;
|
|
gen->lamfact_uptodate = 0;
|
|
gen->m_cur = m;
|
|
}
|
|
}
|
|
|
|
static void Ylmgen_recalc_recfac (Ylmgen_C *gen)
|
|
{
|
|
double f_old=1;
|
|
int l, m;
|
|
|
|
if (gen->recfac_uptodate) return;
|
|
gen->recfac_uptodate = 1;
|
|
|
|
m = gen->m_cur;
|
|
for (l=m; l<=gen->lmax; ++l)
|
|
{
|
|
gen->recfac[l][0] = gen->t1fac[l]*gen->t2fac[l+m]*gen->t2fac[l-m];
|
|
gen->recfac[l][1] = gen->recfac[l][0]/f_old;
|
|
f_old = gen->recfac[l][0];
|
|
}
|
|
}
|
|
|
|
/*static void Ylmgen_recalc_lamfact (Ylmgen_C *gen)
|
|
{
|
|
int l, m;
|
|
|
|
if (gen->lamfact_uptodate) return;
|
|
gen->lamfact_uptodate = 1;
|
|
|
|
m = gen->m_cur;
|
|
gen->lamfact[m] = 0;
|
|
for (l=m+1; l<=gen->lmax; ++l)
|
|
gen->lamfact[l] = sqrt((2*l+1.)/(2*l-1.) * (l*l-m*m));
|
|
}*/
|
|
|
|
#define RENORMALIZE_SCALAR \
|
|
do \
|
|
{ \
|
|
while (fabs(lam_2)>fbig) \
|
|
{ lam_1*=fsmall; lam_2*=fsmall; ++scale; } \
|
|
corfac = (scale<0) ? 0. : gen->cf[scale]; \
|
|
} \
|
|
while(0)
|
|
|
|
static void Ylmgen_recalc_Ylm (Ylmgen_C *gen)
|
|
{
|
|
const double ln2 = 0.6931471805599453094172321214581766;
|
|
|
|
double logval,lam_1,lam_2,corfac;
|
|
double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
|
|
ylmgen_dbl2 *recfac = gen->recfac;
|
|
int lmax=gen->lmax;
|
|
int scale,l;
|
|
int m = gen->m_cur;
|
|
double cth=gen->cth[gen->ith], sth=gen->sth[gen->ith];
|
|
double *result = gen->ylm;
|
|
|
|
if (gen->ylm_uptodate) return;
|
|
gen->ylm_uptodate=1;
|
|
|
|
if (((m>=gen->m_crit)&&(fabs(cth)>=gen->cth_crit)) || ((m>0)&&(sth==0)))
|
|
{ gen->firstl[0]=gen->lmax+1; return; }
|
|
|
|
Ylmgen_recalc_recfac(gen);
|
|
|
|
logval = gen->mfac[m];
|
|
if (m>0) logval += m*gen->logsth[gen->ith];
|
|
scale = (int) (logval/large_exponent2)-minscale;
|
|
corfac = (scale<0) ? 0. : gen->cf[scale];
|
|
|
|
lam_1 = 0;
|
|
lam_2 = exp(ln2*(logval-(scale+minscale)*large_exponent2));
|
|
if (m&1) lam_2 = -lam_2;
|
|
|
|
l=m;
|
|
if (scale<0)
|
|
{
|
|
while (1)
|
|
{
|
|
if (++l>lmax) break;
|
|
lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
|
|
if (++l>lmax) break;
|
|
lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
|
|
if (fabs(lam_2)>fbig)
|
|
{
|
|
RENORMALIZE_SCALAR;
|
|
if (scale>=0) break;
|
|
}
|
|
}
|
|
}
|
|
|
|
lam_1*=corfac;
|
|
lam_2*=corfac;
|
|
|
|
if (l<=lmax)
|
|
{
|
|
while (1)
|
|
{
|
|
if (fabs(lam_2)>eps) break;
|
|
if (++l>lmax) break;
|
|
lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
|
|
if (fabs(lam_1)>eps)
|
|
{ double x=lam_1; lam_1=lam_2; lam_2=x; break; }
|
|
if (++l>lmax) break;
|
|
lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
|
|
}
|
|
}
|
|
|
|
gen->firstl[0]=l;
|
|
if (l>lmax)
|
|
{ gen->m_crit=m; gen->cth_crit=fabs(cth); return; }
|
|
|
|
for(;l<lmax-3;l+=4)
|
|
{
|
|
result[l]=lam_2;
|
|
lam_1 = cth*lam_2*recfac[l][0] - lam_1*recfac[l][1];
|
|
result[l+1] = lam_1;
|
|
lam_2 = cth*lam_1*recfac[l+1][0] - lam_2*recfac[l+1][1];
|
|
result[l+2]=lam_2;
|
|
lam_1 = cth*lam_2*recfac[l+2][0] - lam_1*recfac[l+2][1];
|
|
result[l+3] = lam_1;
|
|
lam_2 = cth*lam_1*recfac[l+3][0] - lam_2*recfac[l+3][1];
|
|
}
|
|
|
|
while (1)
|
|
{
|
|
result[l]=lam_2;
|
|
if (++l>lmax) break;
|
|
lam_1 = cth*lam_2*recfac[l-1][0] - lam_1*recfac[l-1][1];
|
|
result[l] = lam_1;
|
|
if (++l>lmax) break;
|
|
lam_2 = cth*lam_1*recfac[l-1][0] - lam_2*recfac[l-1][1];
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
static void Ylmgen_recalc_lambda_wx1 (Ylmgen_C *gen)
|
|
{
|
|
if (gen->lwx_uptodate[1]) return;
|
|
Ylmgen_recalc_Ylm(gen);
|
|
gen->firstl[1] = gen->firstl[0];
|
|
if (gen->firstl[1]>gen->lmax) return;
|
|
Ylmgen_recalc_lamfact(gen);
|
|
gen->lwx_uptodate[1] = 1;
|
|
|
|
{
|
|
double cth=gen->cth[gen->ith];
|
|
double xsth=1./(gen->sth[gen->ith]);
|
|
double m=gen->m_cur;
|
|
double m_on_sth = m*xsth;
|
|
double lam_lm=0;
|
|
ylmgen_dbl2 *lambda_wx = gen->lambda_wx[1];
|
|
int l;
|
|
double ell;
|
|
for (ell=l=gen->firstl[1]; l<=gen->lmax; ++l, ell+=1.)
|
|
{
|
|
double lam_lm1m=lam_lm;
|
|
lam_lm=gen->ylm[l];
|
|
lambda_wx[l][0] = xsth*(gen->lamfact[l]*lam_lm1m - ell*cth*lam_lm);
|
|
lambda_wx[l][1] = m_on_sth*lam_lm;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void Ylmgen_recalc_lambda_wx2 (Ylmgen_C *gen)
|
|
{
|
|
if (gen->lwx_uptodate[2]) return;
|
|
Ylmgen_recalc_Ylm(gen);
|
|
gen->firstl[2] = gen->firstl[0];
|
|
if (gen->firstl[2]>gen->lmax) return;
|
|
Ylmgen_recalc_lamfact(gen);
|
|
gen->lwx_uptodate[2] = 1;
|
|
|
|
{
|
|
double cth=gen->cth[gen->ith];
|
|
double sth=gen->sth[gen->ith];
|
|
double m=gen->m_cur;
|
|
double one_on_s2 = 1./(sth*sth);
|
|
double two_on_s2 = 2*one_on_s2;
|
|
double two_c_on_s2 = cth * two_on_s2;
|
|
double m2 = m*m;
|
|
double two_m_on_s2 = m*two_on_s2;
|
|
double lam_lm=0;
|
|
ylmgen_dbl2 *lambda_wx = gen->lambda_wx[2];
|
|
int l;
|
|
double ell;
|
|
for (ell=l=gen->firstl[2]; l<=gen->lmax; ++l, ell+=1.)
|
|
{
|
|
double lam_lm1m=lam_lm;
|
|
lam_lm=gen->ylm[l];
|
|
{
|
|
const double t1 = lam_lm1m*gen->lamfact[l];
|
|
const double a_w = (m2-ell)*two_on_s2 - ell*(ell-1.);
|
|
const double a_x = cth*(ell-1.)*lam_lm;
|
|
lambda_wx[l][0] = a_w*lam_lm + t1*two_c_on_s2;
|
|
lambda_wx[l][1] = two_m_on_s2 * (t1-a_x);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Ylmgen_recalc_lambda_wx (Ylmgen_C *gen, int spin)
|
|
{
|
|
UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
|
|
"invalid spin in Ylmgen_recalc_lambda_wx");
|
|
|
|
if (!gen->lambda_wx[spin])
|
|
gen->lambda_wx[spin]=RALLOC(ylmgen_dbl2,gen->lmax+1);
|
|
|
|
if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1(gen); return; }
|
|
if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2(gen); return; }
|
|
|
|
if (!gen->sylm[spin])
|
|
{
|
|
gen->sylm[spin]=RALLOC(sylmgen_d,1);
|
|
sylmgen_init(gen->sylm[spin],gen,spin);
|
|
}
|
|
if (gen->lwx_uptodate[spin]) return;
|
|
sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
|
|
sylmgen_recalc(gen->sylm[spin],gen,gen->ith,gen->lambda_wx[spin],
|
|
&gen->firstl[spin]);
|
|
gen->lwx_uptodate[spin] = 1;
|
|
}
|
|
*/
|
|
#ifdef __SSE2__
|
|
|
|
static void Ylmgen_prepare_sse2 (Ylmgen_C *gen, int ith1, int ith2, int m)
|
|
{
|
|
if ((ith1==gen->ith1) && (ith2==gen->ith2) && (m==gen->m_cur)) return;
|
|
|
|
gen->ylm_uptodate_sse2 = 0;
|
|
SET_ARRAY(gen->lwx_uptodate_sse2,0,gen->smax+1,0);
|
|
|
|
gen->ith1 = ith1; gen->ith2 = ith2;
|
|
|
|
if (m!=gen->m_cur)
|
|
{
|
|
gen->recfac_uptodate = gen->lamfact_uptodate = 0;
|
|
gen->m_cur = m;
|
|
}
|
|
}
|
|
|
|
|
|
#define RENORMALIZE \
|
|
do \
|
|
{ \
|
|
double lam1a, lam1b, lam2a, lam2b, corfaca, corfacb; \
|
|
read_v2df (lam_1, &lam1a, &lam1b); read_v2df (lam_2, &lam2a, &lam2b); \
|
|
read_v2df (corfac, &corfaca, &corfacb); \
|
|
while (fabs(lam2a)>fbig) \
|
|
{ \
|
|
lam1a*=fsmall; lam2a*=fsmall; ++scale1; \
|
|
corfaca = (scale1<0) ? 0. : gen->cf[scale1]; \
|
|
} \
|
|
while (fabs(lam2b)>fbig) \
|
|
{ \
|
|
lam1b*=fsmall; lam2b*=fsmall; ++scale2; \
|
|
corfacb = (scale2<0) ? 0. : gen->cf[scale2]; \
|
|
} \
|
|
lam_1=build_v2df(lam1a,lam1b); lam_2=build_v2df(lam2a,lam2b); \
|
|
corfac=build_v2df(corfaca,corfacb); \
|
|
} \
|
|
while(0)
|
|
#define GETPRE(prea,preb,lv) \
|
|
{ \
|
|
prea=_mm_mul_pd(_mm_set1_pd(recfac[lv][0]),cth); \
|
|
preb=_mm_set1_pd(recfac[lv][1]); \
|
|
}
|
|
#define NEXTSTEP(prea,preb,prec,pred,reca,recb,lv) \
|
|
{ \
|
|
preb = _mm_mul_pd(preb,reca); \
|
|
prea = _mm_mul_pd(prea,recb); \
|
|
prec = _mm_set1_pd(recfac[lv][0]); \
|
|
pred = _mm_set1_pd(recfac[lv][1]); \
|
|
reca = _mm_sub_pd(prea,preb); \
|
|
prec = _mm_mul_pd(cth,prec); \
|
|
}
|
|
|
|
static void Ylmgen_recalc_Ylm_sse2 (Ylmgen_C *gen)
|
|
{
|
|
const double ln2 = 0.6931471805599453094172321214581766;
|
|
|
|
v2df lam_1,lam_2,corfac;
|
|
double logval1,logval2;
|
|
double eps=gen->eps, fbig=gen->fbig, fsmall=gen->fsmall;
|
|
v2df eps2=build_v2df(eps,eps);
|
|
v2df fbig2=build_v2df(fbig,fbig);
|
|
ylmgen_dbl2 *recfac = gen->recfac;
|
|
int lmax=gen->lmax;
|
|
int scale1,scale2,l;
|
|
int m = gen->m_cur;
|
|
double cth1=gen->cth[gen->ith1], cth2=gen->cth[gen->ith2];
|
|
v2df cth=build_v2df(cth1,cth2);
|
|
v2df *result = gen->ylm_sse2;
|
|
v2df pre0,pre1,pre2,pre3;
|
|
|
|
if (gen->ylm_uptodate_sse2) return;
|
|
gen->ylm_uptodate_sse2=1;
|
|
|
|
if ((m>=gen->m_crit)&&(fabs(cth1)>=gen->cth_crit)&&(fabs(cth2)>=gen->cth_crit))
|
|
{ gen->firstl[0]=gen->lmax+1; return; }
|
|
|
|
Ylmgen_recalc_recfac(gen);
|
|
|
|
logval1 = logval2 = gen->mfac[m];
|
|
if (m>0) logval1 += m*gen->logsth[gen->ith1];
|
|
if (m>0) logval2 += m*gen->logsth[gen->ith2];
|
|
scale1 = (int) (logval1/large_exponent2)-minscale;
|
|
scale2 = (int) (logval2/large_exponent2)-minscale;
|
|
corfac = build_v2df((scale1<0) ? 0. : gen->cf[scale1],
|
|
(scale2<0) ? 0. : gen->cf[scale2]);
|
|
|
|
lam_1 = _mm_setzero_pd();
|
|
lam_2 = build_v2df(exp(ln2*(logval1-(scale1+minscale)*large_exponent2)),
|
|
exp(ln2*(logval2-(scale2+minscale)*large_exponent2)));
|
|
if (m&1) lam_2 = _mm_xor_pd (lam_2,V2DF_SIGNMASK); /* negate */
|
|
|
|
l=m;
|
|
if ((scale1<0) && (scale2<0))
|
|
{
|
|
GETPRE(pre0,pre1,l)
|
|
while (1)
|
|
{
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
|
|
if (v2df_any_gt(lam_2,fbig2))
|
|
{
|
|
RENORMALIZE;
|
|
if ((scale1>=0) || (scale2>=0)) break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (l<=lmax)
|
|
{
|
|
GETPRE(pre0,pre1,l)
|
|
while (1)
|
|
{
|
|
v2df t1;
|
|
result[l]=t1=_mm_mul_pd(lam_2,corfac);
|
|
if (v2df_any_gt(t1,eps2))
|
|
break;
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
|
|
|
|
result[l]=t1=_mm_mul_pd(lam_1,corfac);
|
|
if (v2df_any_gt(t1,eps2))
|
|
{ v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
|
|
|
|
if (v2df_any_gt(lam_2,fbig2))
|
|
RENORMALIZE;
|
|
}
|
|
}
|
|
|
|
gen->firstl[0]=l;
|
|
if (l>lmax)
|
|
{
|
|
gen->m_crit=m;
|
|
gen->cth_crit= (fabs(cth1)<fabs(cth2)) ? fabs(cth1) : fabs(cth2);
|
|
return;
|
|
}
|
|
|
|
GETPRE(pre0,pre1,l)
|
|
while (1)
|
|
{
|
|
v2df t1;
|
|
result[l]=t1=_mm_mul_pd(lam_2,corfac);
|
|
if (v2df_all_ge(t1,eps2))
|
|
break;
|
|
if (++l>lmax) return;
|
|
NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
|
|
|
|
result[l]=t1=_mm_mul_pd(lam_1,corfac);
|
|
if (v2df_all_ge(t1,eps2))
|
|
{ v2df tmp=lam_1;lam_1=lam_2;lam_2=tmp; break; }
|
|
if (++l>lmax) return;
|
|
NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
|
|
|
|
if (v2df_any_gt(lam_2,fbig2))
|
|
RENORMALIZE;
|
|
}
|
|
|
|
lam_1 = _mm_mul_pd (lam_1,corfac);
|
|
lam_2 = _mm_mul_pd (lam_2,corfac);
|
|
|
|
GETPRE(pre0,pre1,l)
|
|
for(;l<lmax-2;l+=2)
|
|
{
|
|
result[l]=lam_2;
|
|
NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l+1)
|
|
result[l+1]=lam_1;
|
|
NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l+2)
|
|
}
|
|
|
|
while (1)
|
|
{
|
|
result[l]=lam_2;
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre0,pre1,pre2,pre3,lam_1,lam_2,l)
|
|
result[l] = lam_1;
|
|
if (++l>lmax) break;
|
|
NEXTSTEP(pre2,pre3,pre0,pre1,lam_2,lam_1,l)
|
|
}
|
|
}
|
|
|
|
/*static void Ylmgen_recalc_lambda_wx1_sse2 (Ylmgen_C *gen)
|
|
{
|
|
if (gen->lwx_uptodate_sse2[1]) return;
|
|
Ylmgen_recalc_Ylm_sse2(gen);
|
|
gen->firstl[1] = gen->firstl[0];
|
|
if (gen->firstl[1]>gen->lmax) return;
|
|
Ylmgen_recalc_lamfact(gen);
|
|
gen->lwx_uptodate_sse2[1] = 1;
|
|
|
|
{
|
|
v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
|
|
v2df xsth=build_v2df(1./gen->sth[gen->ith1],1./gen->sth[gen->ith2]);
|
|
v2df m=build_v2df(gen->m_cur,gen->m_cur);
|
|
v2df m_on_sth = _mm_mul_pd(m,xsth);
|
|
v2df lam_lm=_mm_setzero_pd();
|
|
v2df2 *lambda_wx = gen->lambda_wx_sse2[1];
|
|
int l;
|
|
v2df ell=build_v2df(gen->firstl[1],gen->firstl[1]);
|
|
v2df uno=_mm_set1_pd(1.);
|
|
for (l=gen->firstl[1]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
|
|
{
|
|
v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
|
|
v2df lam_lm1m=lam_lm;
|
|
lam_lm=gen->ylm_sse2[l];
|
|
lambda_wx[l].a = _mm_mul_pd(xsth,_mm_sub_pd(_mm_mul_pd(lamfact,lam_lm1m),
|
|
_mm_mul_pd(_mm_mul_pd(ell,cth),lam_lm)));
|
|
lambda_wx[l].b = _mm_mul_pd(m_on_sth,lam_lm);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void Ylmgen_recalc_lambda_wx2_sse2 (Ylmgen_C *gen)
|
|
{
|
|
if (gen->lwx_uptodate_sse2[2]) return;
|
|
Ylmgen_recalc_Ylm_sse2(gen);
|
|
gen->firstl[2] = gen->firstl[0];
|
|
if (gen->firstl[2]>gen->lmax) return;
|
|
Ylmgen_recalc_lamfact(gen);
|
|
gen->lwx_uptodate_sse2[2] = 1;
|
|
|
|
{
|
|
v2df cth=build_v2df(gen->cth[gen->ith1],gen->cth[gen->ith2]);
|
|
v2df sth=build_v2df(gen->sth[gen->ith1],gen->sth[gen->ith2]);
|
|
v2df m=build_v2df(gen->m_cur,gen->m_cur);
|
|
v2df uno=_mm_set1_pd(1.);
|
|
v2df one_on_s2 = _mm_div_pd(uno,_mm_mul_pd(sth,sth));
|
|
v2df two_on_s2 = _mm_mul_pd(_mm_set1_pd(2.),one_on_s2);
|
|
v2df two_c_on_s2 = _mm_mul_pd(cth,two_on_s2);
|
|
v2df m2 = _mm_mul_pd(m,m);
|
|
v2df two_m_on_s2 = _mm_mul_pd(m,two_on_s2);
|
|
v2df lam_lm=_mm_setzero_pd();
|
|
v2df2 *lambda_wx = gen->lambda_wx_sse2[2];
|
|
int l;
|
|
v2df ell=build_v2df(gen->firstl[2],gen->firstl[2]);
|
|
for (l=gen->firstl[2]; l<=gen->lmax; ++l, ell=_mm_add_pd(ell,uno))
|
|
{
|
|
v2df lamfact=_mm_load1_pd(&gen->lamfact[l]);
|
|
v2df lam_lm1m=lam_lm;
|
|
lam_lm=gen->ylm_sse2[l];
|
|
{
|
|
const v2df t1 = _mm_mul_pd(lam_lm1m,lamfact);
|
|
const v2df ellm1 = _mm_sub_pd(ell,uno);
|
|
const v2df a_w = _mm_sub_pd
|
|
(_mm_mul_pd(_mm_sub_pd(m2,ell),two_on_s2),_mm_mul_pd(ell,ellm1));
|
|
const v2df a_x = _mm_mul_pd(_mm_mul_pd(cth,ellm1),lam_lm);
|
|
lambda_wx[l].a =
|
|
_mm_add_pd(_mm_mul_pd(a_w,lam_lm),_mm_mul_pd(t1,two_c_on_s2));
|
|
lambda_wx[l].b = _mm_mul_pd(two_m_on_s2,_mm_sub_pd(t1,a_x));
|
|
}
|
|
}
|
|
}
|
|
}*/
|
|
|
|
/*static void Ylmgen_recalc_lambda_wx_sse2 (Ylmgen_C *gen, int spin)
|
|
{
|
|
UTIL_ASSERT ((spin>0) && (spin<=gen->smax),
|
|
"invalid spin in Ylmgen_recalc_lambda_wx_sse2");
|
|
|
|
if (!gen->lambda_wx_sse2[spin])
|
|
gen->lambda_wx_sse2[spin]=RALLOC(v2df2,gen->lmax+1);
|
|
|
|
if (gen->spinrec && spin==1) { Ylmgen_recalc_lambda_wx1_sse2(gen); return; }
|
|
if (gen->spinrec && spin==2) { Ylmgen_recalc_lambda_wx2_sse2(gen); return; }
|
|
|
|
if (!gen->sylm[spin])
|
|
{
|
|
gen->sylm[spin]=RALLOC(sylmgen_d,1);
|
|
sylmgen_init(gen->sylm[spin],gen,spin);
|
|
}
|
|
if (gen->lwx_uptodate_sse2[spin]) return;
|
|
sylmgen_prepare(gen->sylm[spin],gen,gen->m_cur);
|
|
sylmgen_recalc_sse2(gen->sylm[spin],gen,gen->ith1,gen->ith2,
|
|
gen->lambda_wx_sse2[spin],&gen->firstl[spin]);
|
|
gen->lwx_uptodate_sse2[spin] = 1;
|
|
}*/
|
|
|
|
#endif /* __SSE2__ */
|
|
|
|
/*
|
|
double *Ylmgen_get_norm (int lmax, int spin, int spinrec)
|
|
{
|
|
const double pi = 3.141592653589793238462643383279502884197;
|
|
double *res=RALLOC(double,lmax+1);
|
|
int l;
|
|
double spinsign;
|
|
/ sign convention for H=1 (LensPix paper) /
|
|
#if 1
|
|
spinsign = (spin>0) ? -1.0 : 1.0;
|
|
#else
|
|
spinsign = 1.0;
|
|
#endif
|
|
|
|
if (spin==0)
|
|
{
|
|
for (l=0; l<=lmax; ++l)
|
|
res[l]=1.;
|
|
return res;
|
|
}
|
|
|
|
if ((!spinrec) || (spin>=3))
|
|
{
|
|
spinsign = (spin&1) ? -spinsign : spinsign;
|
|
for (l=0; l<=lmax; ++l)
|
|
res[l] = (l<spin) ? 0. : spinsign*0.5*sqrt((2*l+1)/(4*pi));
|
|
return res;
|
|
}
|
|
|
|
if (spin==1)
|
|
{
|
|
for (l=0; l<=lmax; ++l)
|
|
res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+1.)*l));
|
|
return res;
|
|
}
|
|
|
|
if (spin==2)
|
|
{
|
|
for (l=0; l<=lmax; ++l)
|
|
res[l] = (l<spin) ? 0. : spinsign*sqrt(1./((l+2.)*(l+1.)*l*(l-1.)));
|
|
return res;
|
|
}
|
|
|
|
UTIL_FAIL ("error in Ylmgen_get_norm");
|
|
return NULL;
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
New high-level wrapper
|
|
*/
|
|
#include "sharp_legendre_table.h"
|
|
#include <stdio.h>
|
|
|
|
void sharp_normalized_associated_legendre_table(
|
|
ptrdiff_t m,
|
|
int spin,
|
|
ptrdiff_t lmax,
|
|
ptrdiff_t ntheta,
|
|
double *theta,
|
|
ptrdiff_t theta_stride,
|
|
ptrdiff_t l_stride,
|
|
ptrdiff_t spin_stride,
|
|
double *out
|
|
) {
|
|
if (spin != 0) UTIL_FAIL ("sharp_normalized_associated_legendre_table: only spin=0 has been implemented so far");
|
|
|
|
Ylmgen_C ctx;
|
|
ptrdiff_t itheta, l, lmin;
|
|
|
|
Ylmgen_init(&ctx, lmax, lmax, 0, 0, 1e-300);
|
|
Ylmgen_set_theta(&ctx, theta, ntheta);
|
|
|
|
itheta = 0;
|
|
#ifdef __SSE2__
|
|
for (; itheta < ntheta - 1; itheta += 2) {
|
|
Ylmgen_prepare_sse2(&ctx, itheta, itheta + 1, m);
|
|
Ylmgen_recalc_Ylm_sse2(&ctx);
|
|
lmin = IMIN(*ctx.firstl, lmax + 1);
|
|
for (l = m; l < lmin; ++l) {
|
|
out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
|
|
out[(itheta + 1) * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
|
|
}
|
|
for (l = IMAX(lmin, m); l <= lmax; ++l) {
|
|
double v1, v2;
|
|
read_v2df(ctx.ylm_sse2[l], &v1, &v2);
|
|
out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = v1;
|
|
out[(itheta + 1) * theta_stride + (l - m) * l_stride + spin * spin_stride] = v2;
|
|
}
|
|
}
|
|
#endif
|
|
for (; itheta < ntheta; itheta += 1) {
|
|
Ylmgen_prepare(&ctx, itheta, m);
|
|
Ylmgen_recalc_Ylm(&ctx);
|
|
lmin = IMIN(*ctx.firstl, lmax + 1);
|
|
for (l = m; l < lmin; ++l) {
|
|
out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = 0;
|
|
}
|
|
for (l = IMAX(lmin, m); l <= lmax; ++l) {
|
|
out[itheta * theta_stride + (l - m) * l_stride + spin * spin_stride] = ctx.ylm[l];
|
|
}
|
|
}
|
|
Ylmgen_destroy(&ctx);
|
|
}
|