sharpification

This commit is contained in:
Martin Reinecke 2012-07-05 16:14:28 +02:00
parent 84c0a7de18
commit 82dc2a541f
14 changed files with 145 additions and 123 deletions

View file

@ -32,7 +32,7 @@ compile_all: $(all_cbin) hdrcopy
autotune: sharp_bench autotune: sharp_bench
$(BINDIR)/sharp_bench $(BINDIR)/sharp_bench
mv oracle.inc $(SRCROOT)/libsharp mv sharp_oracle.inc $(SRCROOT)/libsharp
$(MAKE) $(MAKE)
hdrclean: hdrclean:

View file

@ -61,11 +61,11 @@
<ul> <ul>
<li>scalar a_lm to map</li> <li>scalar a_lm to map</li>
<li>scalar map to a_lm</li> <li>scalar map to a_lm</li>
<!-- <li>polarised a_lm to map</li> <!-- <li>polarised a_lm to map</li>
<li>polarised map to a_lm</li> --> <li>polarised map to a_lm</li> !-->
<li>spin a_lm to map</li> <li>spin a_lm to map</li>
<li>spin map to a_lm</li> <li>spin map to a_lm</li>
<!-- <li>scalar a_lm to maps of first derivatives</li> --> <li>scalar a_lm to maps of first derivatives</li>
</ul> </ul>
SHARP supports shared-memory parallelisation via OpenMP; this feature will SHARP supports shared-memory parallelisation via OpenMP; this feature will

View file

@ -1,9 +0,0 @@
static const int maxtr = 6;
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,1}},
{{4,2,-1},{2,1,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}}
};

View file

@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c $(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc $(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
$(LIB_$(PKG)): $(LIBOBJ) $(LIB_$(PKG)): $(LIBOBJ)

View file

@ -464,7 +464,7 @@ void sharp_execute_job (sharp_job *job)
int lmax = job->ainfo->lmax, int lmax = job->ainfo->lmax,
mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm); mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
job->norm_l = Ylmgen_get_norm (lmax, job->spin); job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
/* clear output arrays if requested */ /* clear output arrays if requested */
init_output (job); init_output (job);
@ -501,8 +501,8 @@ void sharp_execute_job (sharp_job *job)
{ {
sharp_job ljob = *job; sharp_job ljob = *job;
ljob.opcnt=0; ljob.opcnt=0;
Ylmgen_C generator; sharp_Ylmgen_C generator;
Ylmgen_init (&generator,lmax,mmax,ljob.spin); sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax); alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1) #pragma omp for schedule(dynamic,1)
@ -517,7 +517,7 @@ void sharp_execute_job (sharp_job *job)
almtmp2alm (&ljob, lmax, mi); almtmp2alm (&ljob, lmax, mi);
} }
Ylmgen_destroy(&generator); sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob); dealloc_almtmp(&ljob);
#pragma omp critical #pragma omp critical
@ -589,7 +589,7 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
if (type==ALM2MAP_DERIV1) spin=1; if (type==ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms"); UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
#include "oracle.inc" #include "sharp_oracle.inc"
return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type]; return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
} }

View file

@ -98,7 +98,7 @@ int main(void)
sharp_module_startup("sharp_bench",1,1,"",1); sharp_module_startup("sharp_bench",1,1,"",1);
printf("Benchmarking SHTs.\n\n"); printf("Benchmarking SHTs.\n\n");
FILE *fp=fopen("oracle.inc","w"); FILE *fp=fopen("sharp_oracle.inc","w");
UTIL_ASSERT(fp, "failed to open oracle file for writing"); UTIL_ASSERT(fp, "failed to open oracle file for writing");
fprintf(fp,"static const int maxtr = 6;\n"); fprintf(fp,"static const int maxtr = 6;\n");
fprintf(fp,"static const int nv_opt[6][2][3] = {\n"); fprintf(fp,"static const int nv_opt[6][2][3] = {\n");

View file

@ -73,7 +73,8 @@ typedef complex double dcmplx;
#undef nvec #undef nvec
void inner_loop (sharp_job *job, const int *ispair,const double *cth, void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx) const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx)
{ {
int njobs=job->ntrans; int njobs=job->ntrans;
if (njobs<=MAXJOB_SPECIAL) if (njobs<=MAXJOB_SPECIAL)

View file

@ -40,7 +40,8 @@ extern "C" {
#endif #endif
void inner_loop (sharp_job *job, const int *ispair,const double *cth, void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx); const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -83,8 +83,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
{ {
vmuleq(res.v[i],val.v[i]); vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]); vaddeq(scale.v[i],scaleint.v[i]);
Tv mask=vlt(vabs(res.v[i]),vload(fsmall)); Tv mask=vlt(vabs(res.v[i]),vload(sharp_fsmall));
vmuleq(res.v[i],vblend(mask,vload(fbig),vone)); vmuleq(res.v[i],vblend(mask,vload(sharp_fbig),vone));
vsubeq(scale.v[i],vblend(mask,vone,vzero)); vsubeq(scale.v[i],vblend(mask,vone,vzero));
} }
} }
@ -92,8 +92,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
{ {
vmuleq(val.v[i],val.v[i]); vmuleq(val.v[i],val.v[i]);
vaddeq(scaleint.v[i],scaleint.v[i]); vaddeq(scaleint.v[i],scaleint.v[i]);
Tv mask = vlt(vabs(val.v[i]),vload(fsmall)); Tv mask = vlt(vabs(val.v[i]),vload(sharp_fsmall));
vmuleq(val.v[i],vblend(mask,vload(fbig),vone)); vmuleq(val.v[i],vblend(mask,vload(sharp_fbig),vone));
vsubeq(scaleint.v[i],vblend(mask,vone,vzero)); vsubeq(scaleint.v[i],vblend(mask,vone,vzero));
} }
} }
@ -113,7 +113,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
if (vanyTrue(mask)) if (vanyTrue(mask))
{ {
did_scale=1; did_scale=1;
Tv fact = vblend(mask,vload(fsmall),vone); Tv fact = vblend(mask,vload(sharp_fsmall),vone);
vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact); vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
vaddeq(scale->v[i],vblend(mask,vone,vzero)); vaddeq(scale->v[i],vblend(mask,vone,vzero));
} }
@ -123,7 +123,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale) static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale)
{ {
const Tv vfsmall=vload(fsmall), vfbig=vload(fbig); const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
for (int i=0;i<nvec; ++i) for (int i=0;i<nvec; ++i)
{ {
Tv mask = vgt(vabs(val->v[i]),vone); Tv mask = vgt(vabs(val->v[i]),vone);
@ -168,13 +168,14 @@ static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
Y(Tbu) sc, corf; Y(Tbu) sc, corf;
sc.b=scale; sc.b=scale;
for (int i=0; i<VLEN*nvec; ++i) for (int i=0; i<VLEN*nvec; ++i)
corf.s[i] = (sc.s[i]<minscale) ? 0. : cf[(int)(sc.s[i])-minscale]; corf.s[i] = (sc.s[i]<sharp_minscale) ?
0. : cf[(int)(sc.s[i])-sharp_minscale];
*corfac=corf.b; *corfac=corf.b;
} }
static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_, Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const Ylmgen_C * restrict gen) const sharp_Ylmgen_C * restrict gen)
{ {
int l=gen->m; int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale; Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
@ -182,7 +183,7 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(normalize)(&lam_2,&scale); Y(normalize)(&lam_2,&scale);
int below_limit = Y(TballLt)(scale,limscale); int below_limit = Y(TballLt)(scale,sharp_limscale);
while (below_limit) while (below_limit)
{ {
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@ -193,14 +194,15 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,limscale); below_limit = Y(TballLt)(scale,sharp_limscale);
l+=2; l+=2;
} }
*l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale; *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
} }
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm, static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
Tb * restrict ryp, Tb * restrict rym, const Tb cth, const ylmgen_dbl3 fx) Tb * restrict ryp, Tb * restrict rym, const Tb cth,
const sharp_ylmgen_dbl3 fx)
{ {
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]); Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
@ -214,9 +216,9 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_, static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_, Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen) Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{ {
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb cth2, sth2; Tb cth2, sth2;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
@ -251,14 +253,16 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
int l=gen->mhi; int l=gen->mhi;
int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale); int below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
while (below_limit) while (below_limit)
{ {
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]); Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]); Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale); below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
l+=2; l+=2;
} }

View file

@ -40,7 +40,7 @@ typedef union
static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1, static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax) int l, int lmax)
{ {
#if (njobs>1) #if (njobs>1)
@ -126,7 +126,7 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1, static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax) const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -168,8 +168,9 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
} }
} }
static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen, static void Z(calc_alm2map) (const Tb cth, const Tb sth,
sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done) const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, int *done)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1,lam_2,scale;
@ -180,9 +181,9 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
Tb corfac; Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
const ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale); int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
@ -216,7 +217,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
{ {
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale); full_ieee = Y(TballGt)(scale,sharp_minscale);
} }
} }
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
@ -226,7 +227,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
} }
static void Z(calc_map2alm) (const Tb cth, const Tb sth, static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, int *done) const Z(Tbrij) * restrict p2, int *done)
{ {
int lmax=gen->lmax; int lmax=gen->lmax;
@ -237,11 +238,11 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac; Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale); int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
@ -277,7 +278,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
{ {
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale); full_ieee = Y(TballGt)(scale,sharp_minscale);
} }
} }
@ -371,7 +372,8 @@ static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1, static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax) const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -408,7 +410,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1, static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax) const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
{ {
while (l<lmax) while (l<lmax)
{ {
@ -438,7 +440,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]); Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
} }
static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen, static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done) sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
@ -449,12 +451,13 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; } { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Z(saddstep)(p1, p2, Z(saddstep)(p1, p2,
@ -469,7 +472,8 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -482,7 +486,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
} }
static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen, static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2, sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
int *done) int *done)
{ {
@ -493,12 +497,13 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -513,7 +518,8 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -522,8 +528,9 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
} }
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py, static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm) Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm,
const dcmplx * restrict alm)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
@ -545,7 +552,7 @@ static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py
static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1, static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax) int lmax)
{ {
while (l<lmax) while (l<lmax)
@ -576,7 +583,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]); Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
} }
static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen, static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done) sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
@ -587,12 +594,13 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; } { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -607,7 +615,8 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -623,12 +632,12 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Z(inner_loop) (sharp_job *job, const int *ispair, static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen, const double *cth_, const double *sth_, int llim, int ulim,
int mi, const int *idx) sharp_Ylmgen_C *gen, int mi, const int *idx)
{ {
const int nval=nvec*VLEN; const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi]; const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m); sharp_Ylmgen_prepare (gen, m);
switch (job->type) switch (job->type)
{ {

View file

@ -31,7 +31,7 @@
static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax, int njobs) int l, int lmax, int njobs)
{ {
while (l<lmax-2) while (l<lmax-2)
@ -115,7 +115,7 @@ static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax, const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
int njobs) int njobs)
{ {
while (l<lmax) while (l<lmax)
@ -158,9 +158,9 @@ static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
} }
} }
static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen, static void Y(calc_alm2map) (const Tb cth, const Tb sth,
sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, int njobs, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
int *done) Y(Tbri) * restrict p2, int njobs, int *done)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1,lam_2,scale;
@ -171,9 +171,9 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
Tb corfac; Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
const ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale); int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
@ -207,7 +207,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
{ {
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale); full_ieee = Y(TballGt)(scale,sharp_minscale);
} }
} }
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
@ -217,7 +217,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
} }
static void Y(calc_map2alm) (const Tb cth, const Tb sth, static void Y(calc_map2alm) (const Tb cth, const Tb sth,
const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, int njobs, int *done) const Y(Tbri) * restrict p2, int njobs, int *done)
{ {
int lmax=gen->lmax; int lmax=gen->lmax;
@ -228,11 +228,11 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac; Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale); int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
@ -268,7 +268,7 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
{ {
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale); full_ieee = Y(TballGt)(scale,sharp_minscale);
} }
} }
@ -362,7 +362,7 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs) int lmax, int njobs)
{ {
while (l<lmax) while (l<lmax)
@ -395,7 +395,7 @@ static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax, const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
int njobs) int njobs)
{ {
while (l<lmax) while (l<lmax)
@ -426,7 +426,7 @@ static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs); Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
} }
static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen, static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs, sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done) int *done)
{ {
@ -438,12 +438,13 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; } { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -458,7 +459,8 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -471,7 +473,7 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
lmax, njobs); lmax, njobs);
} }
static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen, static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2, sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
int njobs, int *done) int njobs, int *done)
{ {
@ -482,12 +484,13 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
if (l>lmax) { *done=1; return; } if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -502,7 +505,8 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -534,7 +538,7 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs) int lmax, int njobs)
{ {
while (l<lmax) while (l<lmax)
@ -565,7 +569,7 @@ static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs); Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
} }
static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen, static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs, sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done) int *done)
{ {
@ -577,12 +581,13 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; } { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx; const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm; Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -597,7 +602,8 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ {
Y(getCorfac)(scalep,&corfacp,gen->cf); Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf); Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
} }
} }
@ -614,12 +620,12 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Y(inner_loop) (sharp_job *job, const int *ispair, static void Y(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen, const double *cth_, const double *sth_, int llim, int ulim,
int mi, const int *idx, int njobs) sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
{ {
const int nval=nvec*VLEN; const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi]; const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m); sharp_Ylmgen_prepare (gen, m);
switch (job->type) switch (job->type)
{ {

View file

@ -0,0 +1,9 @@
static const int maxtr = 6;
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,1}},
{{5,2,-1},{2,1,1}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}}
};

View file

@ -34,7 +34,7 @@
#include "sharp_ylmgen_c.h" #include "sharp_ylmgen_c.h"
#include "c_utils.h" #include "c_utils.h"
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin) void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
{ {
const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220; const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
@ -42,18 +42,19 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
gen->mmax = m_max; gen->mmax = m_max;
UTIL_ASSERT(spin>=0,"incorrect spin"); UTIL_ASSERT(spin>=0,"incorrect spin");
gen->s = spin; gen->s = spin;
UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale"); UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
gen->cf=RALLOC(double,maxscale-minscale+1); "bad value for min/maxscale");
gen->cf[-minscale]=1.; gen->cf=RALLOC(double,sharp_maxscale-sharp_minscale+1);
for (int m=-minscale-1; m>=0; --m) gen->cf[-sharp_minscale]=1.;
gen->cf[m]=gen->cf[m+1]*fsmall; for (int m=-sharp_minscale-1; m>=0; --m)
for (int m=-minscale+1; m<(maxscale-minscale+1); ++m) gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
gen->cf[m]=gen->cf[m-1]*fbig; for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
gen->cf[m]=gen->cf[m-1]*sharp_fbig;
gen->m = -1; gen->m = -1;
if (spin==0) if (spin==0)
{ {
gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1); gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
gen->mfac = RALLOC(double,gen->mmax+1); gen->mfac = RALLOC(double,gen->mmax+1);
gen->mfac[0] = inv_sqrt4pi; gen->mfac[0] = inv_sqrt4pi;
for (int m=1; m<=gen->mmax; ++m) for (int m=1; m<=gen->mmax; ++m)
@ -69,7 +70,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
else else
{ {
gen->m=gen->mlo=gen->mhi=-1234567890; gen->m=gen->mlo=gen->mhi=-1234567890;
ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2); ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
for (int m=0; m<gen->lmax+2; ++m) for (int m=0; m<gen->lmax+2; ++m)
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.; gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
ALLOC(gen->inv,double,gen->lmax+1); ALLOC(gen->inv,double,gen->lmax+1);
@ -91,7 +92,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
{ {
fac[m]=fac[m-1]*sqrt(m); fac[m]=fac[m-1]*sqrt(m);
facscale[m]=facscale[m-1]; facscale[m]=facscale[m-1];
if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; } if (fac[m]>1.) { fac[m]*=sharp_fsmall; ++facscale[m]; }
} }
for (int m=0; m<=gen->mmax; ++m) for (int m=0; m<=gen->mmax; ++m)
{ {
@ -105,7 +106,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
} }
} }
void Ylmgen_destroy (Ylmgen_C *gen) void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
{ {
DEALLOC(gen->cf); DEALLOC(gen->cf);
if (gen->s==0) if (gen->s==0)
@ -126,7 +127,7 @@ void Ylmgen_destroy (Ylmgen_C *gen)
} }
} }
void Ylmgen_prepare (Ylmgen_C *gen, int m) void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
{ {
if (m==gen->m) return; if (m==gen->m) return;
UTIL_ASSERT(m>=0,"incorrect m"); UTIL_ASSERT(m>=0,"incorrect m");
@ -181,7 +182,7 @@ void Ylmgen_prepare (Ylmgen_C *gen, int m)
} }
} }
double *Ylmgen_get_norm (int lmax, int spin) double *sharp_Ylmgen_get_norm (int lmax, int spin)
{ {
const double pi = 3.141592653589793238462643383279502884197; const double pi = 3.141592653589793238462643383279502884197;
double *res=RALLOC(double,lmax+1); double *res=RALLOC(double,lmax+1);

View file

@ -36,11 +36,11 @@
extern "C" { extern "C" {
#endif #endif
enum { minscale=-8, limscale=-3, maxscale=5 }; enum { sharp_minscale=-8, sharp_limscale=-3, sharp_maxscale=5 };
static const double fbig=0x1p+90,fsmall=0x1p-90; static const double sharp_fbig=0x1p+90,sharp_fsmall=0x1p-90;
typedef struct { double f[2]; } ylmgen_dbl2; typedef struct { double f[2]; } sharp_ylmgen_dbl2;
typedef struct { double f[3]; } ylmgen_dbl3; typedef struct { double f[3]; } sharp_ylmgen_dbl3;
typedef struct typedef struct
{ {
@ -53,13 +53,13 @@ typedef struct
/* used if s==0 */ /* used if s==0 */
double *mfac; double *mfac;
ylmgen_dbl2 *rf; sharp_ylmgen_dbl2 *rf;
/* used if s!=0 */ /* used if s!=0 */
int sinPow, cosPow, preMinus_p, preMinus_m; int sinPow, cosPow, preMinus_p, preMinus_m;
double *prefac; double *prefac;
int *fscale; int *fscale;
ylmgen_dbl3 *fx; sharp_ylmgen_dbl3 *fx;
/* internal usage only */ /* internal usage only */
/* used if s==0 */ /* used if s==0 */
@ -68,22 +68,22 @@ typedef struct
/* used if s!=0 */ /* used if s!=0 */
double *flm1, *flm2, *inv; double *flm1, *flm2, *inv;
int mlo, mhi; int mlo, mhi;
} Ylmgen_C; } sharp_Ylmgen_C;
/*! Creates a generator which will calculate helper data for Y_lm calculation /*! Creates a generator which will calculate helper data for Y_lm calculation
up to \a l=l_max and \a m=m_max. */ up to \a l=l_max and \a m=m_max. */
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin); void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin);
/*! Deallocates a generator previously initialised by Ylmgen_init(). */ /*! Deallocates a generator previously initialised by Ylmgen_init(). */
void Ylmgen_destroy (Ylmgen_C *gen); void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen);
/*! Prepares the object for the calculation at \a m. */ /*! Prepares the object for the calculation at \a m. */
void Ylmgen_prepare (Ylmgen_C *gen, int m); void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m);
/*! Returns a pointer to an array with \a lmax+1 entries containing /*! Returns a pointer to an array with \a lmax+1 entries containing
normalisation factors that must be applied to Y_lm values computed for normalisation factors that must be applied to Y_lm values computed for
\a spin. The array must be deallocated (using free()) by the user. */ \a spin. The array must be deallocated (using free()) by the user. */
double *Ylmgen_get_norm (int lmax, int spin); double *sharp_Ylmgen_get_norm (int lmax, int spin);
#ifdef __cplusplus #ifdef __cplusplus
} }