more simplifications
This commit is contained in:
parent
10c5b5f7a9
commit
aee1a51ac2
6 changed files with 28 additions and 189 deletions
|
@ -515,7 +515,7 @@ static void dealloc_phase (sharp_job *job)
|
|||
{ DEALLOC(job->phase); }
|
||||
|
||||
static void alloc_almtmp (sharp_job *job, int lmax)
|
||||
{ job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); }
|
||||
{ job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); }
|
||||
|
||||
static void dealloc_almtmp (sharp_job *job)
|
||||
{ DEALLOC(job->almtmp); }
|
||||
|
@ -534,6 +534,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
|||
source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
|
||||
job->almtmp[job->nalm*l+i] = expr_of_x; \
|
||||
} \
|
||||
for (int i=0; i<job->nalm; ++i) \
|
||||
job->almtmp[job->nalm*(lmax+1)+i] = 0; \
|
||||
}
|
||||
|
||||
if (job->type!=SHARP_MAP2ALM)
|
||||
|
@ -852,8 +854,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
|
|||
init_output (job);
|
||||
|
||||
int nchunks, chunksize;
|
||||
get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
|
||||
&chunksize);
|
||||
get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize);
|
||||
//FIXME: needs to be changed to "nm"
|
||||
alloc_phase (job,mmax+1,chunksize);
|
||||
|
||||
|
@ -934,8 +935,6 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
|
|||
job->ginfo = geom_info;
|
||||
job->ainfo = alm_info;
|
||||
job->flags = flags;
|
||||
if ((job->flags&SHARP_NVMAX)==0)
|
||||
job->flags|=sharp_nv_oracle (type, spin);
|
||||
if (alm_info->flags&SHARP_REAL_HARMONICS)
|
||||
job->flags|=SHARP_REAL_HARMONICS;
|
||||
job->time = 0.;
|
||||
|
@ -965,75 +964,6 @@ void sharp_set_nchunks_max(int new_nchunks_max)
|
|||
int sharp_get_nv_max (void)
|
||||
{ return 6; }
|
||||
|
||||
static int sharp_oracle (sharp_jobtype type, int spin)
|
||||
{
|
||||
int lmax=511;
|
||||
int mmax=(lmax+1)/2;
|
||||
int nrings=(lmax+1)/4;
|
||||
int ppring=1;
|
||||
|
||||
spin = (spin!=0) ? 2 : 0;
|
||||
|
||||
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
|
||||
sharp_geom_info *tinfo;
|
||||
sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
|
||||
|
||||
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
|
||||
int ncomp = (spin==0) ? 1 : 2;
|
||||
|
||||
double **map;
|
||||
ALLOC2D(map,double,ncomp,npix);
|
||||
SET_ARRAY(map[0],0,npix*ncomp,0.);
|
||||
|
||||
sharp_alm_info *alms;
|
||||
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
|
||||
|
||||
dcmplx **alm;
|
||||
ALLOC2D(alm,dcmplx,ncomp,nalms);
|
||||
SET_ARRAY(alm[0],0,nalms*ncomp,0.);
|
||||
|
||||
double time=1e30;
|
||||
int nvbest=-1;
|
||||
|
||||
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
|
||||
{
|
||||
double time_acc=0.;
|
||||
double jtime;
|
||||
int ntries=0;
|
||||
do
|
||||
{
|
||||
sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,
|
||||
nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
|
||||
|
||||
if (jtime<time) { time=jtime; nvbest=nv; }
|
||||
time_acc+=jtime;
|
||||
++ntries;
|
||||
}
|
||||
while ((time_acc<0.02)&&(ntries<2));
|
||||
}
|
||||
|
||||
DEALLOC2D(map);
|
||||
DEALLOC2D(alm);
|
||||
|
||||
sharp_destroy_alm_info(alms);
|
||||
sharp_destroy_geom_info(tinfo);
|
||||
return nvbest;
|
||||
}
|
||||
|
||||
int sharp_nv_oracle (sharp_jobtype type, int spin)
|
||||
{
|
||||
static const int maxtr = 6;
|
||||
static int nv_opt[2][5] = {{0,0,0,0,0},{0,0,0,0,0}};
|
||||
|
||||
if (type==SHARP_ALM2MAP_DERIV1) spin=1;
|
||||
UTIL_ASSERT(type<5,"bad type");
|
||||
UTIL_ASSERT(spin>=0, "bad spin");
|
||||
|
||||
if (nv_opt[spin!=0][type]==0)
|
||||
nv_opt[spin!=0][type]=sharp_oracle(type,spin);
|
||||
return nv_opt[spin!=0][type];
|
||||
}
|
||||
|
||||
#ifdef USE_MPI
|
||||
#include "sharp_mpi.c"
|
||||
|
||||
|
|
|
@ -217,11 +217,12 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
|
|||
{
|
||||
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
|
||||
- vload(gen->rf[l].f[1])*lam_1.v[i];
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
|
||||
- vload(gen->rf[l+1].f[1])*lam_2.v[i];
|
||||
}
|
||||
if (Y(rescale)(&lam_1,&lam_2,&scale))
|
||||
below_limit = Y(TballLt)(scale,sharp_limscale);
|
||||
l+=2;
|
||||
|
|
|
@ -40,35 +40,10 @@
|
|||
|
||||
typedef complex double dcmplx;
|
||||
|
||||
// must be in the range [0;6]
|
||||
#define MAXJOB_SPECIAL 2
|
||||
|
||||
#define XCONCATX(a,b) a##b
|
||||
#define CONCATX(a,b) XCONCATX(a,b)
|
||||
#define XCONCAT2(a,b) a##_##b
|
||||
#define CONCAT2(a,b) XCONCAT2(a,b)
|
||||
#define XCONCAT3(a,b,c) a##_##b##_##c
|
||||
#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
|
||||
|
||||
#define nvec 1
|
||||
#include "sharp_core_inchelper.c"
|
||||
#undef nvec
|
||||
|
||||
#define nvec 2
|
||||
#include "sharp_core_inchelper.c"
|
||||
#undef nvec
|
||||
|
||||
#define nvec 3
|
||||
#include "sharp_core_inchelper.c"
|
||||
#undef nvec
|
||||
|
||||
#define nvec 4
|
||||
#include "sharp_core_inchelper.c"
|
||||
#undef nvec
|
||||
|
||||
#define nvec 5
|
||||
#include "sharp_core_inchelper.c"
|
||||
#undef nvec
|
||||
|
||||
#define nvec 6
|
||||
#include "sharp_core_inchelper.c"
|
||||
|
@ -78,27 +53,5 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
|
|||
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
|
||||
const int *mlim)
|
||||
{
|
||||
int nv=job->flags&SHARP_NVMAX;
|
||||
switch (nv)
|
||||
{
|
||||
case 0x1:
|
||||
CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
case 0x2:
|
||||
CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
case 0x3:
|
||||
CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
case 0x4:
|
||||
CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
case 0x5:
|
||||
CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
case 0x6:
|
||||
CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
return;
|
||||
}
|
||||
UTIL_FAIL("Incorrect vector parameters");
|
||||
CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
|
||||
}
|
||||
|
|
|
@ -34,75 +34,44 @@ NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
|
|||
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
|
||||
int l, int lmax)
|
||||
{
|
||||
while (l<lmax)
|
||||
while (l<=lmax)
|
||||
{
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
|
||||
- vload(rf[l].f[1])*lam_1.v[i];
|
||||
{
|
||||
Tv ar=vload(creal(alm[l])),
|
||||
ai=vload(cimag(alm[l]));
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
||||
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
p1->r.v[i] += lam_2.v[i]*ar;
|
||||
p1->i.v[i] += lam_2.v[i]*ai;
|
||||
}
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
|
||||
- vload(rf[l+1].f[1])*lam_2.v[i];
|
||||
{
|
||||
Tv ar=vload(creal(alm[l+1])),
|
||||
ai=vload(cimag(alm[l+1]));
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
p2->r.v[i] += lam_1.v[i]*ar;
|
||||
p2->i.v[i] += lam_1.v[i]*ai;
|
||||
}
|
||||
lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
|
||||
p1->r.v[i] += lam_2.v[i]*ar1;
|
||||
p1->i.v[i] += lam_2.v[i]*ai1;
|
||||
lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
|
||||
p2->r.v[i] += lam_1.v[i]*ar2;
|
||||
p2->i.v[i] += lam_1.v[i]*ai2;
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
if (l==lmax)
|
||||
{
|
||||
Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
p1->r.v[i] += lam_2.v[i]*ar;
|
||||
p1->i.v[i] += lam_2.v[i]*ai;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NOINLINE static void Z(map2alm_kernel) (const Tb cth,
|
||||
const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
|
||||
const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
|
||||
{
|
||||
while (l<lmax)
|
||||
while (l<=lmax)
|
||||
{
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
|
||||
vmul(vload(rf[l].f[1]),lam_1.v[i]));
|
||||
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
||||
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
|
||||
vfmaeq(atmp[2*l ],lam_2.v[i],p1->r.v[i]);
|
||||
vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
|
||||
- vload(rf[l+1].f[1])*lam_2.v[i];
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
|
||||
vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]);
|
||||
vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
if (l==lmax)
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
atmp[2*l ] += lam_2.v[i]*p1->r.v[i];
|
||||
atmp[2*l+1] += lam_2.v[i]*p1->i.v[i];
|
||||
}
|
||||
}
|
||||
|
||||
NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
|
||||
|
@ -187,10 +156,9 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
|
|||
}
|
||||
if (++l>lmax) return;
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
|
||||
- vload(rf[l-1].f[1])*lam_1.v[i];
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv tmp=lam_1.v[i]*corfac.v[i];
|
||||
atmp[2*l ]+=tmp*p2->r.v[i];
|
||||
atmp[2*l+1]+=tmp*p2->i.v[i];
|
||||
|
@ -222,9 +190,6 @@ static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
|
|||
vfmaeq(px->qi.v[i],agi,lw);
|
||||
vfmaeq(px->ur.v[i],acr,lw);
|
||||
vfmaeq(px->ui.v[i],aci,lw);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx=vsub(rxm.v[i],rxp.v[i]);
|
||||
vfmseq(py->qr.v[i],aci,lx);
|
||||
vfmaeq(py->qi.v[i],acr,lx);
|
||||
|
@ -249,9 +214,6 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
|
|||
vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
|
||||
vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
|
||||
vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx1=r2m.v[i]-r2p.v[i];
|
||||
Tv lw2=r1p.v[i]+r1m.v[i];
|
||||
vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
|
||||
|
@ -273,9 +235,6 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
|
|||
vfmaeq(agi,px->qi.v[i],lw);
|
||||
vfmaeq(acr,px->ur.v[i],lw);
|
||||
vfmaeq(aci,px->ui.v[i],lw);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx=vsub(rxm->v[i],rxp->v[i]);
|
||||
vfmseq(agr,py->ui.v[i],lx);
|
||||
vfmaeq(agi,py->ur.v[i],lx);
|
||||
|
@ -444,9 +403,6 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
|
|||
Tv lw=vadd(rxp.v[i],rxm.v[i]);
|
||||
vfmaeq(px->qr.v[i],ar,lw);
|
||||
vfmaeq(px->qi.v[i],ai,lw);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx=vsub(rxm.v[i],rxp.v[i]);
|
||||
vfmaeq(py->ur.v[i],ai,lx);
|
||||
vfmseq(py->ui.v[i],ar,lx);
|
||||
|
@ -652,8 +608,8 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
|
|||
{
|
||||
if (job->spin==0)
|
||||
{
|
||||
Tv atmp[2*(gen->lmax+1)];
|
||||
memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv));
|
||||
Tv atmp[2*(gen->lmax+2)];
|
||||
memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
|
||||
for (int ith=0; ith<ulim-llim; ith+=nval)
|
||||
{
|
||||
Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
|
||||
|
|
|
@ -200,7 +200,6 @@ typedef enum { SHARP_DP = 1<<4,
|
|||
|
||||
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
|
||||
SHARP_NO_OPENMP = 1<<21, /* internal use only */
|
||||
SHARP_NVMAX = (1<<4)-1 /* internal use only */
|
||||
} sharp_jobflags;
|
||||
|
||||
/*! Performs a libsharp SHT job. The interface deliberately does not use
|
||||
|
|
|
@ -69,7 +69,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
|
|||
gen->m = -1;
|
||||
if (spin==0)
|
||||
{
|
||||
gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
|
||||
gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+2);
|
||||
gen->mfac = RALLOC(double,gen->mmax+1);
|
||||
gen->mfac[0] = inv_sqrt4pi;
|
||||
for (int m=1; m<=gen->mmax; ++m)
|
||||
|
@ -159,7 +159,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
|
|||
{
|
||||
gen->rf[m].f[0] = gen->root[2*m+3];
|
||||
gen->rf[m].f[1] = 0.;
|
||||
for (int l=m+1; l<=gen->lmax; ++l)
|
||||
for (int l=m+1; l<=gen->lmax+1; ++l)
|
||||
{
|
||||
double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
|
||||
gen->rf[l].f[0] = tmp*gen->root[2*l+1];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue