From d96a30180bc94a5e7b983eeba36729684822be48 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 19 Oct 2012 15:01:34 +0200 Subject: [PATCH] more work on number scaling adjust sharp_acctest to new (flipped) Gauss grid --- libsharp/sharp_acctest.c | 10 +++++ libsharp/sharp_core_inc.c | 74 ++++++++++++++++++++----------------- libsharp/sharp_core_inc2.c | 32 ++++++++-------- libsharp/sharp_core_inc3.c | 32 ++++++++-------- libsharp/sharp_vecsupport.h | 6 +++ libsharp/sharp_ylmgen_c.c | 13 +++++-- libsharp/sharp_ylmgen_c.h | 5 ++- 7 files changed, 102 insertions(+), 70 deletions(-) diff --git a/libsharp/sharp_acctest.c b/libsharp/sharp_acctest.c index ddf3587..95a3bac 100644 --- a/libsharp/sharp_acctest.c +++ b/libsharp/sharp_acctest.c @@ -96,6 +96,16 @@ static void check_sign_scale(void) ptrdiff_t npix=(ptrdiff_t)nrings*ppring; sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + /* flip theta to emulate the "old" Gaussian grid geometry */ + for (int i=0; inpairs; ++i) + { + const double pi=3.141592653589793238462643383279502884197; + tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth; + tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth; + tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta; + tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta; + } + sharp_alm_info *alms; sharp_make_triangular_alm_info(lmax,mmax,1,&alms); ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index 4870f4f..e87c817 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -70,11 +70,37 @@ static inline Tb Y(Tbprod)(Tb a, Tb b) static inline void Y(Tbmuleq)(Tb * restrict a, Tb b) { for (int i=0; iv[i],b.v[i]); } +static inline void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, + double maxval) + { + const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); + const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); + for (int i=0;iv[i]),vfmax); + while (vanyTrue(mask)) + { + vmuleq(val->v[i],vblend(mask,vfsmall,vone)); + vaddeq(scale->v[i],vblend(mask,vone,vzero)); + mask = vgt(vabs(val->v[i]),vfmax); + } + mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); + while (vanyTrue(mask)) + { + vmuleq(val->v[i],vblend(mask,vfbig,vone)); + vsubeq(scale->v[i],vblend(mask,vone,vzero)); + mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); + } + } + } + static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd, Tb * restrict ress) { Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); + Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); + do { if (npow&1) @@ -83,19 +109,15 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd, { vmuleq(res.v[i],val.v[i]); vaddeq(scale.v[i],scaleint.v[i]); - Tv mask=vlt(vabs(res.v[i]),vload(sharp_fsmall)); - vmuleq(res.v[i],vblend(mask,vload(sharp_fbig),vone)); - vsubeq(scale.v[i],vblend(mask,vone,vzero)); } + Y(Tbnormalize)(&res,&scale,sharp_fbighalf); } for (int i=0; i>=1); @@ -121,30 +143,6 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, return did_scale; } -static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale) - { - const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); - for (int i=0;iv[i]),vload(sharp_ftol)); - while (vanyTrue(mask)) - { - vmuleq(val->v[i],vblend(mask,vfsmall,vone)); - vaddeq(scale->v[i],vblend(mask,vone,vzero)); - mask = vgt(vabs(val->v[i]),vload(sharp_ftol)); - } - mask = vlt(vabs(val->v[i]),vload(sharp_fsmall*sharp_ftol)); - mask = vand(mask,vne(val->v[i],vzero)); - while (vanyTrue(mask)) - { - vmuleq(val->v[i],vblend(mask,vfbig,vone)); - vsubeq(scale->v[i],vblend(mask,vone,vzero)); - mask = vlt(vabs(val->v[i]),vload(sharp_fsmall*sharp_ftol)); - mask = vand(mask,vne(val->v[i],vzero)); - } - } - } - static inline int Y(TballLt)(Tb a,double b) { Tv vb=vload(b); @@ -161,6 +159,14 @@ static inline int Y(TballGt)(Tb a,double b) res=vand(res,vgt(a.v[i],vb)); return vallTrue(res); } +static inline int Y(TballGe)(Tb a,double b) + { + Tv vb=vload(b); + Tv res=vge(a.v[0],vb); + for (int i=1; im&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Y(normalize)(&lam_2,&scale); + Y(Tbnormalize)(&lam_2,&scale,sharp_ftol); int below_limit = Y(TballLt)(scale,sharp_limscale); while (below_limit) @@ -243,7 +249,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_, rec2m.v[i]=vmul(prefac,csp.v[i]); scalem.v[i]=vadd(prescale,csps.v[i]); } - Y(normalize)(&rec2m,&scalem); Y(normalize)(&rec2p,&scalep); + Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf); + Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf); for (int i=0; is&1) rec2p.v[i]=vneg(rec2p.v[i]); } - Y(normalize)(&rec2m,&scalem); Y(normalize)(&rec2p,&scalep); + Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol); + Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol); int l=gen->mhi; diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index e749c78..7672eff 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -183,7 +183,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, Y(getCorfac)(scale,&corfac,gen->cf); const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scale,sharp_minscale); + int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { for (int j=0; jcf); - full_ieee = Y(TballGt)(scale,sharp_minscale); + full_ieee = Y(TballGe)(scale,sharp_minscale); } } if (l>lmax) { *done=1; return; } @@ -242,7 +242,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth, Tb corfac; Y(getCorfac)(scale,&corfac,gen->cf); dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scale,sharp_minscale); + int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { for (int j=0; jcf); - full_ieee = Y(TballGt)(scale,sharp_minscale); + full_ieee = Y(TballGe)(scale,sharp_minscale); } } @@ -456,8 +456,8 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Z(saddstep)(p1, p2, @@ -472,8 +472,8 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } @@ -502,8 +502,8 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); @@ -518,8 +518,8 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } @@ -599,8 +599,8 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), @@ -615,8 +615,8 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } diff --git a/libsharp/sharp_core_inc3.c b/libsharp/sharp_core_inc3.c index ccf5e5d..e74bd51 100644 --- a/libsharp/sharp_core_inc3.c +++ b/libsharp/sharp_core_inc3.c @@ -173,7 +173,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, Y(getCorfac)(scale,&corfac,gen->cf); const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scale,sharp_minscale); + int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { for (int j=0; jcf); - full_ieee = Y(TballGt)(scale,sharp_minscale); + full_ieee = Y(TballGe)(scale,sharp_minscale); } } if (l>lmax) { *done=1; return; } @@ -232,7 +232,7 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth, Tb corfac; Y(getCorfac)(scale,&corfac,gen->cf); dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scale,sharp_minscale); + int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { for (int j=0; jcf); - full_ieee = Y(TballGt)(scale,sharp_minscale); + full_ieee = Y(TballGe)(scale,sharp_minscale); } } @@ -443,8 +443,8 @@ static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), @@ -459,8 +459,8 @@ static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } @@ -489,8 +489,8 @@ static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); @@ -505,8 +505,8 @@ static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } @@ -586,8 +586,8 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), @@ -602,8 +602,8 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, { Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGt)(scalep,sharp_minscale) - && Y(TballGt)(scalem,sharp_minscale); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); } } diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index fcfd253..54f23df 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -57,8 +57,10 @@ typedef double Tv; #define vsqrt(a) sqrt(a) #define vlt(a,b) (((a)<(b))?1.:0.) #define vgt(a,b) (((a)>(b))?1.:0.) +#define vge(a,b) (((a)>=(b))?1.:0.) #define vne(a,b) (((a)!=(b))?1.:0.) #define vand(a,b) ((((a)*(b))!=0.)?1.:0.) +#define vor(a,b) ((((a)+(b))!=0.)?1.:0.) static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } @@ -102,8 +104,10 @@ typedef __m128d Tv; #define vsqrt(a) _mm_sqrt_pd(a) #define vlt(a,b) _mm_cmplt_pd(a,b) #define vgt(a,b) _mm_cmpgt_pd(a,b) +#define vge(a,b) _mm_cmpge_pd(a,b) #define vne(a,b) _mm_cmpneq_pd(a,b) #define vand(a,b) _mm_and_pd(a,b) +#define vor(a,b) _mm_or_pd(a,b) #define vmin(a,b) _mm_min_pd(a,b) #define vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) @@ -153,8 +157,10 @@ typedef __m256d Tv; #define vsqrt(a) _mm256_sqrt_pd(a) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ) +#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) #define vand(a,b) _mm256_and_pd(a,b) +#define vor(a,b) _mm256_or_pd(a,b) #define vmin(a,b) _mm256_min_pd(a,b) #define vmax(a,b) _mm256_max_pd(a,b) #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c index 3614fd8..5f0f1a1 100644 --- a/libsharp/sharp_ylmgen_c.c +++ b/libsharp/sharp_ylmgen_c.c @@ -34,6 +34,13 @@ #include "sharp_ylmgen_c.h" #include "c_utils.h" +static inline void normalize (double *val, int *scale, double xfmax) + { + while (fabs(*val)>xfmax) { *val*=sharp_fsmall; ++*scale; } + if (*val!=0.) + while (fabs(*val)1.) { fac[m]*=sharp_fsmall; ++facscale[m]; } + normalize(&fac[m],&facscale[m],sharp_fbighalf); } for (int m=0; m<=gen->mmax; ++m) { @@ -100,10 +107,10 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin) if (mhi1.0) { tfac*=sharp_fsmall; ++tscale; } + normalize(&tfac,&tscale,sharp_fbighalf); tfac/=fac[mhi-mlo]; tscale-=facscale[mhi-mlo]; - if (tfac>1.0) { tfac*=sharp_fsmall; ++tscale; } + normalize(&tfac,&tscale,sharp_fbighalf); gen->prefac[m]=tfac; gen->fscale[m]=tscale; } diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h index e935ce2..3328f76 100644 --- a/libsharp/sharp_ylmgen_c.h +++ b/libsharp/sharp_ylmgen_c.h @@ -36,9 +36,10 @@ extern "C" { #endif -enum { sharp_minscale=-1, sharp_limscale=1, sharp_maxscale=1 }; -static const double sharp_fbig=0x1p+450,sharp_fsmall=0x1p-450; +enum { sharp_minscale=0, sharp_limscale=1, sharp_maxscale=1 }; +static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800; static const double sharp_ftol=0x1p-60; +static const double sharp_fbighalf=0x1p+400; typedef struct { double f[2]; } sharp_ylmgen_dbl2; typedef struct { double f[3]; } sharp_ylmgen_dbl3;