From aee1a51ac2ebfaa40a4e5d37351688916da559fd Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 11 Dec 2018 09:33:44 +0100 Subject: [PATCH] more simplifications --- libsharp/sharp.c | 78 ++---------------------------------- libsharp/sharp_core_inc.c | 3 +- libsharp/sharp_core_inc0.c | 49 +---------------------- libsharp/sharp_core_inc2.c | 82 +++++++++----------------------------- libsharp/sharp_lowlevel.h | 1 - libsharp/sharp_ylmgen_c.c | 4 +- 6 files changed, 28 insertions(+), 189 deletions(-) diff --git a/libsharp/sharp.c b/libsharp/sharp.c index d882689..943bd79 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -515,7 +515,7 @@ static void dealloc_phase (sharp_job *job) { DEALLOC(job->phase); } static void alloc_almtmp (sharp_job *job, int lmax) - { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); } + { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); } static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } @@ -534,6 +534,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \ job->almtmp[job->nalm*l+i] = expr_of_x; \ } \ + for (int i=0; inalm; ++i) \ + job->almtmp[job->nalm*(lmax+1)+i] = 0; \ } if (job->type!=SHARP_MAP2ALM) @@ -852,8 +854,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job) init_output (job); int nchunks, chunksize; - get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks, - &chunksize); + get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize); //FIXME: needs to be changed to "nm" alloc_phase (job,mmax+1,chunksize); @@ -934,8 +935,6 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, job->ginfo = geom_info; job->ainfo = alm_info; job->flags = flags; - if ((job->flags&SHARP_NVMAX)==0) - job->flags|=sharp_nv_oracle (type, spin); if (alm_info->flags&SHARP_REAL_HARMONICS) job->flags|=SHARP_REAL_HARMONICS; job->time = 0.; @@ -965,75 +964,6 @@ void sharp_set_nchunks_max(int new_nchunks_max) int sharp_get_nv_max (void) { return 6; } -static int sharp_oracle (sharp_jobtype type, int spin) - { - int lmax=511; - int mmax=(lmax+1)/2; - int nrings=(lmax+1)/4; - int ppring=1; - - spin = (spin!=0) ? 2 : 0; - - ptrdiff_t npix=(ptrdiff_t)nrings*ppring; - sharp_geom_info *tinfo; - sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo); - - ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); - int ncomp = (spin==0) ? 1 : 2; - - double **map; - ALLOC2D(map,double,ncomp,npix); - SET_ARRAY(map[0],0,npix*ncomp,0.); - - sharp_alm_info *alms; - sharp_make_triangular_alm_info(lmax,mmax,1,&alms); - - dcmplx **alm; - ALLOC2D(alm,dcmplx,ncomp,nalms); - SET_ARRAY(alm[0],0,nalms*ncomp,0.); - - double time=1e30; - int nvbest=-1; - - for (int nv=1; nv<=sharp_get_nv_max(); ++nv) - { - double time_acc=0.; - double jtime; - int ntries=0; - do - { - sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms, - nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL); - - if (jtime=0, "bad spin"); - - if (nv_opt[spin!=0][type]==0) - nv_opt[spin!=0][type]=sharp_oracle(type,spin); - return nv_opt[spin!=0][type]; - } - #ifdef USE_MPI #include "sharp_mpi.c" diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index 8a36bfe..a15e35a 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -217,11 +217,12 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} for (int i=0; irf[l].f[0])*(cth.v[i]*lam_2.v[i]) - vload(gen->rf[l].f[1])*lam_1.v[i]; - for (int i=0; irf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) - vload(gen->rf[l+1].f[1])*lam_2.v[i]; + } if (Y(rescale)(&lam_1,&lam_2,&scale)) below_limit = Y(TballLt)(scale,sharp_limscale); l+=2; diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c index 06b9285..15190d7 100644 --- a/libsharp/sharp_core_inc0.c +++ b/libsharp/sharp_core_inc0.c @@ -40,35 +40,10 @@ typedef complex double dcmplx; -// must be in the range [0;6] -#define MAXJOB_SPECIAL 2 - #define XCONCATX(a,b) a##b #define CONCATX(a,b) XCONCATX(a,b) #define XCONCAT2(a,b) a##_##b #define CONCAT2(a,b) XCONCAT2(a,b) -#define XCONCAT3(a,b,c) a##_##b##_##c -#define CONCAT3(a,b,c) XCONCAT3(a,b,c) - -#define nvec 1 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 2 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 3 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 4 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 5 -#include "sharp_core_inchelper.c" -#undef nvec #define nvec 6 #include "sharp_core_inchelper.c" @@ -78,27 +53,5 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - int nv=job->flags&SHARP_NVMAX; - switch (nv) - { - case 0x1: - CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x2: - CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x3: - CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x4: - CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x5: - CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x6: - CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - } - UTIL_FAIL("Incorrect vector parameters"); + CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); } diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index de2924f..364eef5 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -34,75 +34,44 @@ NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, int l, int lmax) { - while (lr.v[i] += lam_2.v[i]*ar; - p1->i.v[i] += lam_2.v[i]*ai; - } - } - for (int i=0; ir.v[i] += lam_1.v[i]*ar; - p2->i.v[i] += lam_1.v[i]*ai; - } + lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i]; + p1->r.v[i] += lam_2.v[i]*ar1; + p1->i.v[i] += lam_2.v[i]*ai1; + lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; + p2->r.v[i] += lam_1.v[i]*ar2; + p2->i.v[i] += lam_1.v[i]*ai2; } l+=2; } - if (l==lmax) - { - Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); - for (int i=0; ir.v[i] += lam_2.v[i]*ar; - p1->i.v[i] += lam_2.v[i]*ai; - } - } } NOINLINE static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) { - while (lr.v[i]); vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); - } - for (int i=0; ir.v[i]); vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); } l+=2; } - if (l==lmax) - for (int i=0; ir.v[i]; - atmp[2*l+1] += lam_2.v[i]*p1->i.v[i]; - } } NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth, @@ -187,10 +156,9 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, } if (++l>lmax) return; for (int i=0; ir.v[i]; atmp[2*l+1]+=tmp*p2->i.v[i]; @@ -222,9 +190,6 @@ static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, vfmaeq(px->qi.v[i],agi,lw); vfmaeq(px->ur.v[i],acr,lw); vfmaeq(px->ui.v[i],aci,lw); - } - for (int i=0; iqr.v[i],aci,lx); vfmaeq(py->qi.v[i],acr,lx); @@ -249,9 +214,6 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2); vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2); vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2); - } - for (int i=0; iqr.v[i],agr2,lw2,aci1,lx1); @@ -273,9 +235,6 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px, vfmaeq(agi,px->qi.v[i],lw); vfmaeq(acr,px->ur.v[i],lw); vfmaeq(aci,px->ui.v[i],lw); - } - for (int i=0; iv[i],rxp->v[i]); vfmseq(agr,py->ui.v[i],lx); vfmaeq(agi,py->ur.v[i],lx); @@ -444,9 +403,6 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, Tv lw=vadd(rxp.v[i],rxm.v[i]); vfmaeq(px->qr.v[i],ar,lw); vfmaeq(px->qi.v[i],ai,lw); - } - for (int i=0; iur.v[i],ai,lx); vfmseq(py->ui.v[i],ar,lx); @@ -652,8 +608,8 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, { if (job->spin==0) { - Tv atmp[2*(gen->lmax+1)]; - memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv)); + Tv atmp[2*(gen->lmax+2)]; + memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv)); for (int ith=0; ithm = -1; if (spin==0) { - gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1); + gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+2); gen->mfac = RALLOC(double,gen->mmax+1); gen->mfac[0] = inv_sqrt4pi; for (int m=1; m<=gen->mmax; ++m) @@ -159,7 +159,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m) { gen->rf[m].f[0] = gen->root[2*m+3]; gen->rf[m].f[1] = 0.; - for (int l=m+1; l<=gen->lmax; ++l) + for (int l=m+1; l<=gen->lmax+1; ++l) { double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m]; gen->rf[l].f[0] = tmp*gen->root[2*l+1];