sharpification

This commit is contained in:
Martin Reinecke 2012-07-05 16:14:28 +02:00
parent 84c0a7de18
commit 82dc2a541f
14 changed files with 145 additions and 123 deletions

View file

@ -32,7 +32,7 @@ compile_all: $(all_cbin) hdrcopy
autotune: sharp_bench
$(BINDIR)/sharp_bench
mv oracle.inc $(SRCROOT)/libsharp
mv sharp_oracle.inc $(SRCROOT)/libsharp
$(MAKE)
hdrclean:

View file

@ -61,11 +61,11 @@
<ul>
<li>scalar a_lm to map</li>
<li>scalar map to a_lm</li>
<!-- <li>polarised a_lm to map</li>
<li>polarised map to a_lm</li> -->
<!-- <li>polarised a_lm to map</li>
<li>polarised map to a_lm</li> !-->
<li>spin a_lm to map</li>
<li>spin map to a_lm</li>
<!-- <li>scalar a_lm to maps of first derivatives</li> -->
<li>scalar a_lm to maps of first derivatives</li>
</ul>
SHARP supports shared-memory parallelisation via OpenMP; this feature will

View file

@ -1,9 +0,0 @@
static const int maxtr = 6;
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,1}},
{{4,2,-1},{2,1,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}}
};

View file

@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
$(LIB_$(PKG)): $(LIBOBJ)

View file

@ -464,7 +464,7 @@ void sharp_execute_job (sharp_job *job)
int lmax = job->ainfo->lmax,
mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
job->norm_l = Ylmgen_get_norm (lmax, job->spin);
job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
/* clear output arrays if requested */
init_output (job);
@ -501,8 +501,8 @@ void sharp_execute_job (sharp_job *job)
{
sharp_job ljob = *job;
ljob.opcnt=0;
Ylmgen_C generator;
Ylmgen_init (&generator,lmax,mmax,ljob.spin);
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
@ -517,7 +517,7 @@ void sharp_execute_job (sharp_job *job)
almtmp2alm (&ljob, lmax, mi);
}
Ylmgen_destroy(&generator);
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
#pragma omp critical
@ -589,7 +589,7 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
if (type==ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
#include "oracle.inc"
#include "sharp_oracle.inc"
return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
}

View file

@ -98,7 +98,7 @@ int main(void)
sharp_module_startup("sharp_bench",1,1,"",1);
printf("Benchmarking SHTs.\n\n");
FILE *fp=fopen("oracle.inc","w");
FILE *fp=fopen("sharp_oracle.inc","w");
UTIL_ASSERT(fp, "failed to open oracle file for writing");
fprintf(fp,"static const int maxtr = 6;\n");
fprintf(fp,"static const int nv_opt[6][2][3] = {\n");

View file

@ -73,7 +73,8 @@ typedef complex double dcmplx;
#undef nvec
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx)
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx)
{
int njobs=job->ntrans;
if (njobs<=MAXJOB_SPECIAL)

View file

@ -40,7 +40,8 @@ extern "C" {
#endif
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx);
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx);
#ifdef __cplusplus
}

View file

@ -83,8 +83,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
{
vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]);
Tv mask=vlt(vabs(res.v[i]),vload(fsmall));
vmuleq(res.v[i],vblend(mask,vload(fbig),vone));
Tv mask=vlt(vabs(res.v[i]),vload(sharp_fsmall));
vmuleq(res.v[i],vblend(mask,vload(sharp_fbig),vone));
vsubeq(scale.v[i],vblend(mask,vone,vzero));
}
}
@ -92,8 +92,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
{
vmuleq(val.v[i],val.v[i]);
vaddeq(scaleint.v[i],scaleint.v[i]);
Tv mask = vlt(vabs(val.v[i]),vload(fsmall));
vmuleq(val.v[i],vblend(mask,vload(fbig),vone));
Tv mask = vlt(vabs(val.v[i]),vload(sharp_fsmall));
vmuleq(val.v[i],vblend(mask,vload(sharp_fbig),vone));
vsubeq(scaleint.v[i],vblend(mask,vone,vzero));
}
}
@ -113,7 +113,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
if (vanyTrue(mask))
{
did_scale=1;
Tv fact = vblend(mask,vload(fsmall),vone);
Tv fact = vblend(mask,vload(sharp_fsmall),vone);
vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
vaddeq(scale->v[i],vblend(mask,vone,vzero));
}
@ -123,7 +123,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale)
{
const Tv vfsmall=vload(fsmall), vfbig=vload(fbig);
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
for (int i=0;i<nvec; ++i)
{
Tv mask = vgt(vabs(val->v[i]),vone);
@ -168,13 +168,14 @@ static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
Y(Tbu) sc, corf;
sc.b=scale;
for (int i=0; i<VLEN*nvec; ++i)
corf.s[i] = (sc.s[i]<minscale) ? 0. : cf[(int)(sc.s[i])-minscale];
corf.s[i] = (sc.s[i]<sharp_minscale) ?
0. : cf[(int)(sc.s[i])-sharp_minscale];
*corfac=corf.b;
}
static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const Ylmgen_C * restrict gen)
const sharp_Ylmgen_C * restrict gen)
{
int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
@ -182,7 +183,7 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(normalize)(&lam_2,&scale);
int below_limit = Y(TballLt)(scale,limscale);
int below_limit = Y(TballLt)(scale,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@ -193,14 +194,15 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,limscale);
below_limit = Y(TballLt)(scale,sharp_limscale);
l+=2;
}
*l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
}
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
Tb * restrict ryp, Tb * restrict rym, const Tb cth, const ylmgen_dbl3 fx)
Tb * restrict ryp, Tb * restrict rym, const Tb cth,
const sharp_ylmgen_dbl3 fx)
{
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
for (int i=0; i<nvec; ++i)
@ -214,9 +216,9 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen)
Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb cth2, sth2;
for (int i=0; i<nvec; ++i)
{
@ -251,14 +253,16 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
int l=gen->mhi;
int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
int below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
l+=2;
}

View file

@ -40,7 +40,7 @@ typedef union
static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax)
{
#if (njobs>1)
@ -126,7 +126,7 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
{
while (l<lmax)
{
@ -168,8 +168,9 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
}
}
static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done)
static void Z(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, int *done)
{
int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale;
@ -180,9 +181,9 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
const ylmgen_dbl2 * restrict rf = gen->rf;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale);
int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
@ -216,7 +217,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale);
full_ieee = Y(TballGt)(scale,sharp_minscale);
}
}
if (l>lmax) { *done=1; return; }
@ -226,7 +227,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
}
static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, int *done)
{
int lmax=gen->lmax;
@ -237,11 +238,11 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const ylmgen_dbl2 * restrict rf = gen->rf;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale);
int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
@ -277,7 +278,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale);
full_ieee = Y(TballGt)(scale,sharp_minscale);
}
}
@ -371,7 +372,8 @@ static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax)
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
while (l<lmax)
{
@ -408,7 +410,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
{
while (l<lmax)
{
@ -438,7 +440,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
}
static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{
int l, lmax=gen->lmax;
@ -449,12 +451,13 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Z(saddstep)(p1, p2,
@ -469,7 +472,8 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -482,7 +486,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
}
static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
int *done)
{
@ -493,12 +497,13 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -513,7 +518,8 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -522,8 +528,9 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
}
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm,
const dcmplx * restrict alm)
{
for (int j=0; j<njobs; ++j)
{
@ -545,7 +552,7 @@ static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py
static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
while (l<lmax)
@ -576,7 +583,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
}
static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{
int l, lmax=gen->lmax;
@ -587,12 +594,13 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -607,7 +615,8 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -623,12 +632,12 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
int mi, const int *idx)
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m);
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{

View file

@ -31,7 +31,7 @@
static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax, int njobs)
{
while (l<lmax-2)
@ -115,7 +115,7 @@ static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
@ -158,9 +158,9 @@ static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
}
}
static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, int njobs,
int *done)
static void Y(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, int njobs, int *done)
{
int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale;
@ -171,9 +171,9 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
const ylmgen_dbl2 * restrict rf = gen->rf;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale);
int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
@ -207,7 +207,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale);
full_ieee = Y(TballGt)(scale,sharp_minscale);
}
}
if (l>lmax) { *done=1; return; }
@ -217,7 +217,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
}
static void Y(calc_map2alm) (const Tb cth, const Tb sth,
const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, int njobs, int *done)
{
int lmax=gen->lmax;
@ -228,11 +228,11 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const ylmgen_dbl2 * restrict rf = gen->rf;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scale,minscale);
int full_ieee = Y(TballGt)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
@ -268,7 +268,7 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGt)(scale,minscale);
full_ieee = Y(TballGt)(scale,sharp_minscale);
}
}
@ -362,7 +362,7 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs)
{
while (l<lmax)
@ -395,7 +395,7 @@ static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
@ -426,7 +426,7 @@ static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
}
static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done)
{
@ -438,12 +438,13 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -458,7 +459,8 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -471,7 +473,7 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
lmax, njobs);
}
static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
int njobs, int *done)
{
@ -482,12 +484,13 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -502,7 +505,8 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -534,7 +538,7 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs)
{
while (l<lmax)
@ -565,7 +569,7 @@ static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
}
static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done)
{
@ -577,12 +581,13 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const ylmgen_dbl3 * restrict fx = gen->fx;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
int full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -597,7 +602,8 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
full_ieee = Y(TballGt)(scalep,sharp_minscale)
&& Y(TballGt)(scalem,sharp_minscale);
}
}
@ -614,12 +620,12 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Y(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
int mi, const int *idx, int njobs)
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m);
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{

View file

@ -0,0 +1,9 @@
static const int maxtr = 6;
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,1}},
{{5,2,-1},{2,1,1}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}}
};

View file

@ -34,7 +34,7 @@
#include "sharp_ylmgen_c.h"
#include "c_utils.h"
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
{
const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
@ -42,18 +42,19 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
gen->mmax = m_max;
UTIL_ASSERT(spin>=0,"incorrect spin");
gen->s = spin;
UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale");
gen->cf=RALLOC(double,maxscale-minscale+1);
gen->cf[-minscale]=1.;
for (int m=-minscale-1; m>=0; --m)
gen->cf[m]=gen->cf[m+1]*fsmall;
for (int m=-minscale+1; m<(maxscale-minscale+1); ++m)
gen->cf[m]=gen->cf[m-1]*fbig;
UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
"bad value for min/maxscale");
gen->cf=RALLOC(double,sharp_maxscale-sharp_minscale+1);
gen->cf[-sharp_minscale]=1.;
for (int m=-sharp_minscale-1; m>=0; --m)
gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
gen->cf[m]=gen->cf[m-1]*sharp_fbig;
gen->m = -1;
if (spin==0)
{
gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1);
gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
gen->mfac = RALLOC(double,gen->mmax+1);
gen->mfac[0] = inv_sqrt4pi;
for (int m=1; m<=gen->mmax; ++m)
@ -69,7 +70,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
else
{
gen->m=gen->mlo=gen->mhi=-1234567890;
ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2);
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
for (int m=0; m<gen->lmax+2; ++m)
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
ALLOC(gen->inv,double,gen->lmax+1);
@ -91,7 +92,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
{
fac[m]=fac[m-1]*sqrt(m);
facscale[m]=facscale[m-1];
if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; }
if (fac[m]>1.) { fac[m]*=sharp_fsmall; ++facscale[m]; }
}
for (int m=0; m<=gen->mmax; ++m)
{
@ -105,7 +106,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
}
}
void Ylmgen_destroy (Ylmgen_C *gen)
void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
{
DEALLOC(gen->cf);
if (gen->s==0)
@ -126,7 +127,7 @@ void Ylmgen_destroy (Ylmgen_C *gen)
}
}
void Ylmgen_prepare (Ylmgen_C *gen, int m)
void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
{
if (m==gen->m) return;
UTIL_ASSERT(m>=0,"incorrect m");
@ -181,7 +182,7 @@ void Ylmgen_prepare (Ylmgen_C *gen, int m)
}
}
double *Ylmgen_get_norm (int lmax, int spin)
double *sharp_Ylmgen_get_norm (int lmax, int spin)
{
const double pi = 3.141592653589793238462643383279502884197;
double *res=RALLOC(double,lmax+1);

View file

@ -36,11 +36,11 @@
extern "C" {
#endif
enum { minscale=-8, limscale=-3, maxscale=5 };
static const double fbig=0x1p+90,fsmall=0x1p-90;
enum { sharp_minscale=-8, sharp_limscale=-3, sharp_maxscale=5 };
static const double sharp_fbig=0x1p+90,sharp_fsmall=0x1p-90;
typedef struct { double f[2]; } ylmgen_dbl2;
typedef struct { double f[3]; } ylmgen_dbl3;
typedef struct { double f[2]; } sharp_ylmgen_dbl2;
typedef struct { double f[3]; } sharp_ylmgen_dbl3;
typedef struct
{
@ -53,13 +53,13 @@ typedef struct
/* used if s==0 */
double *mfac;
ylmgen_dbl2 *rf;
sharp_ylmgen_dbl2 *rf;
/* used if s!=0 */
int sinPow, cosPow, preMinus_p, preMinus_m;
double *prefac;
int *fscale;
ylmgen_dbl3 *fx;
sharp_ylmgen_dbl3 *fx;
/* internal usage only */
/* used if s==0 */
@ -68,22 +68,22 @@ typedef struct
/* used if s!=0 */
double *flm1, *flm2, *inv;
int mlo, mhi;
} Ylmgen_C;
} sharp_Ylmgen_C;
/*! Creates a generator which will calculate helper data for Y_lm calculation
up to \a l=l_max and \a m=m_max. */
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin);
void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin);
/*! Deallocates a generator previously initialised by Ylmgen_init(). */
void Ylmgen_destroy (Ylmgen_C *gen);
void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen);
/*! Prepares the object for the calculation at \a m. */
void Ylmgen_prepare (Ylmgen_C *gen, int m);
void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m);
/*! Returns a pointer to an array with \a lmax+1 entries containing
normalisation factors that must be applied to Y_lm values computed for
\a spin. The array must be deallocated (using free()) by the user. */
double *Ylmgen_get_norm (int lmax, int spin);
double *sharp_Ylmgen_get_norm (int lmax, int spin);
#ifdef __cplusplus
}