fixes and cleanups

This commit is contained in:
Martin Reinecke 2018-12-17 15:08:51 +01:00
parent 8c98d4624e
commit ac3bf55ac5
3 changed files with 54 additions and 120 deletions

View file

@ -588,7 +588,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
}
else
memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));
#undef COPY_LOOP
}

View file

@ -40,22 +40,20 @@
typedef complex double dcmplx;
#define nvec (128/VLEN)
#define nv0 (64/VLEN)
#define nvx (128/VLEN)
typedef union
{ Tv v; double s[VLEN]; } Tvu;
typedef Tv Tbv[nvec];
typedef double Tbs[nvec*VLEN];
typedef Tv Tbv0[nv0];
typedef double Tbs0[nv0*VLEN];
typedef struct
{
Tbv sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
} s0data_v;
typedef struct
{
Tbs sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
} s0data_s;
typedef union
@ -64,16 +62,19 @@ typedef union
s0data_s s;
} s0data_u;
typedef Tv Tbvx[nvx];
typedef double Tbsx[nvx*VLEN];
typedef struct
{
Tbv sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
} sxdata_v;
typedef struct
{
Tbs sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
} sxdata_s;
typedef union
@ -146,6 +147,9 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
static inline void getCorfac(Tv scale, Tv * restrict corfac,
const double * restrict cf)
{
typedef union
{ Tv v; double s[VLEN]; } Tvu;
Tvu sc, corf;
sc.v=scale;
for (int i=0; i<VLEN; ++i)
@ -201,16 +205,6 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
*l_=l;
}
#if 1
static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
Tv * restrict ryp, Tv * restrict rym, const Tv cth,
const sharp_ylmgen_dbl3 fx)
{
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
*rxp = (cth-fx1)*fx0* *ryp - fx2* *rxp;
*rxm = (cth+fx1)*fx0* *rym - fx2* *rxm;
}
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
sxdata_v * restrict d, int * restrict l_, int nv2)
{
@ -265,10 +259,16 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1;
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
fx12=vload(fx[l+1].f[2]);
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
fx22=vload(fx[l+2].f[2]);
for (int i=0; i<nv2; ++i)
{
rec_step(&d->l1p[i],&d->l1m[i],&d->l2p[i],&d->l2m[i],d->cth[i],fx[l+1]);
rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
@ -368,7 +368,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
iter_to_ieee_spin(gen, d, &l, nv2);
job->opcnt += (l-gen->m) * 10*nth;
job->opcnt += (l-gen->mhi) * 10*nth;
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 28*nth;
@ -406,10 +406,10 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
d->p1mi[i] += aci1*lw1 - agr2*lx2;
Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
d->p2pr[i] -= agr2*lw2 - aci1*lx1;
d->p2pr[i] += agr2*lw2 - aci1*lx1;
d->p2pi[i] += agi2*lw2 + acr1*lx1;
d->p2mr[i] += acr2*lw2 + agi1*lx1;
d->p2mi[i] -= aci2*lw2 - agr1*lx1;
d->p2mi[i] += aci2*lw2 - agr1*lx1;
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
@ -443,7 +443,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
iter_to_ieee_spin(gen, d, &l, nv2);
job->opcnt += (l-gen->m) * 10*nth;
job->opcnt += (l-gen->mhi) * 10*nth;
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 28*nth;
@ -519,7 +519,6 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
}
map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
}
#endif
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
@ -678,19 +677,21 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{
case SHARP_ALM2MAP:
case SHARP_ALM2MAP_DERIV1:
UTIL_FAIL("derivatives currently not supported");
break;
case SHARP_ALM2MAP:
{
if (job->spin==0)
{
const int nval=nv0*VLEN;
int ith=0;
int itgt[nvec*VLEN];
int itgt[nval];
while (ith<ulim-llim)
{
s0data_u d;
@ -736,8 +737,9 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
}
else
{
const int nval=nvx*VLEN;
int ith=0;
int itgt[nvec*VLEN];
int itgt[nval];
while (ith<ulim-llim)
{
sxdata_u d;
@ -808,7 +810,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
@ -818,6 +819,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
{
if (job->spin==0)
{
const int nval=nv0*VLEN;
int ith=0;
while (ith<ulim-llim)
{
@ -852,6 +854,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
}
else
{
const int nval=nvx*VLEN;
int ith=0;
while (ith<ulim-llim)
{
@ -919,5 +922,5 @@ int sharp_veclen(void)
int sharp_max_nvec(void)
{
return nvec;
return nv0;
}

View file

@ -24,7 +24,7 @@
/* \file sharp_testsuite.c
*
* Copyright (C) 2012-2013 Max-Planck-Society
* Copyright (C) 2012-2018 Max-Planck-Society
* \author Martin Reinecke
*/
@ -375,7 +375,6 @@ static void check_sign_scale(void)
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
"error");
#if 1
sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),
@ -406,6 +405,7 @@ static void check_sign_scale(void)
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
"error");
#if 0
sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
SHARP_DP,NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ],-6.859393905369091105e-01,1e-11),
@ -430,7 +430,7 @@ static void check_sign_scale(void)
}
static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
int spin, int nv, double **err_abs, double **err_rel,
int spin, double **err_abs, double **err_rel,
double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
unsigned long long *op_m2a)
{
@ -450,20 +450,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
#ifdef USE_MPI
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m);
#else
sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
SHARP_DP|nv,t_a2m,op_a2m);
SHARP_DP,t_a2m,op_a2m);
#endif
if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
#ifdef USE_MPI
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
#else
sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
#endif
if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
@ -475,11 +475,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
}
static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
int spin, int nv)
int spin)
{
int ncomp = (spin==0) ? 1 : 2;
double *err_abs, *err_rel;
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL,
NULL, NULL);
for (int i=0; i<ncomp; ++i)
UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@ -501,16 +501,11 @@ static void sharp_acctest(void)
sharp_alm_info *ainfo;
int lmax=127, mmax=127, nlat=128, nlon=256;
get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
for (int nv=1; nv<=6; ++nv)
{
check_accuracy(ginfo,ainfo,0,nv);
#if 0
check_accuracy(ginfo,ainfo,1,nv);
check_accuracy(ginfo,ainfo,2,nv);
check_accuracy(ginfo,ainfo,3,nv);
check_accuracy(ginfo,ainfo,30,nv);
#endif
}
check_accuracy(ginfo,ainfo,0);
check_accuracy(ginfo,ainfo,1);
check_accuracy(ginfo,ainfo,2);
check_accuracy(ginfo,ainfo,3);
check_accuracy(ginfo,ainfo,30);
sharp_destroy_alm_info(ainfo);
sharp_destroy_geom_info(ginfo);
if (mytask==0) printf("Passed.\n\n");
@ -544,7 +539,7 @@ static void sharp_test (int argc, const char **argv)
{
++nrpt;
double ta2m2, tm2a2;
do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, &ta2m2, &tm2a2,
&op_a2m, &op_m2a);
if (ta2m2<t_a2m) t_a2m=ta2m2;
if (tm2a2<t_m2a) t_m2a=tm2a2;
@ -604,68 +599,6 @@ static void sharp_test (int argc, const char **argv)
DEALLOC(err_rel);
}
static void sharp_bench (int argc, const char **argv)
{
if (mytask==0) sharp_announce("sharp_bench");
UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
int lmax=atoi(argv[3]);
int mmax=atoi(argv[4]);
int gpar1=atoi(argv[5]);
int gpar2=atoi(argv[6]);
int spin=atoi(argv[7]);
if (mytask==0) printf("Testing map analysis accuracy.\n");
if (mytask==0) printf("spin=%d\n", spin);
sharp_geom_info *ginfo;
sharp_alm_info *ainfo;
get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30;
unsigned long long opa2m_min=0, opm2a_min=0;
int nvmin_a2m=-1, nvmin_m2a=-1;
for (int nv=0; nv<=6; ++nv)
{
int ntries=0;
double tacc=0;
do
{
double t_a2m, t_m2a;
unsigned long long op_a2m, op_m2a;
double *err_abs,*err_rel;
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
&t_a2m, &t_m2a, &op_a2m, &op_m2a);
DEALLOC(err_abs);
DEALLOC(err_rel);
tacc+=t_a2m+t_m2a;
++ntries;
if (nv==0)
{
if (t_a2m<ta2m_auto) ta2m_auto=t_a2m;
if (t_m2a<tm2a_auto) tm2a_auto=t_m2a;
}
else
{
if (t_a2m<ta2m_min) { nvmin_a2m=nv; ta2m_min=t_a2m; opa2m_min=op_a2m; }
if (t_m2a<tm2a_min) { nvmin_m2a=nv; tm2a_min=t_m2a; opm2a_min=op_m2a; }
}
} while((ntries<2)||(tacc<3.));
}
if (mytask==0)
{
printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
nvmin_a2m,ta2m_min,100.*(ta2m_auto-ta2m_min)/ta2m_auto,
1e-9*opa2m_min/ta2m_min);
printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
nvmin_m2a,tm2a_min,100.*(tm2a_auto-tm2a_min)/tm2a_auto,
1e-9*opm2a_min/tm2a_min);
}
sharp_destroy_alm_info(ainfo);
sharp_destroy_geom_info(ginfo);
}
int main(int argc, const char **argv)
{
#ifdef USE_MPI
@ -682,8 +615,6 @@ int main(int argc, const char **argv)
sharp_acctest();
else if (strcmp(argv[1],"test")==0)
sharp_test(argc,argv);
else if (strcmp(argv[1],"bench")==0)
sharp_bench(argc,argv);
else
UTIL_FAIL("unknown command");