From ac3bf55ac501802c3aed9e22d98fca496bde23a0 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 17 Dec 2018 15:08:51 +0100 Subject: [PATCH] fixes and cleanups --- libsharp/sharp.c | 2 +- libsharp/sharp_core.c | 73 ++++++++++++++-------------- libsharp/sharp_testsuite.c | 99 ++++++-------------------------------- 3 files changed, 54 insertions(+), 120 deletions(-) diff --git a/libsharp/sharp.c b/libsharp/sharp.c index f312fc3..aa680df 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -588,7 +588,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) } else memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0, - job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx)); + job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx)); #undef COPY_LOOP } diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 1e476d0..b1414f1 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -40,22 +40,20 @@ typedef complex double dcmplx; -#define nvec (128/VLEN) +#define nv0 (64/VLEN) +#define nvx (128/VLEN) -typedef union - { Tv v; double s[VLEN]; } Tvu; - -typedef Tv Tbv[nvec]; -typedef double Tbs[nvec*VLEN]; +typedef Tv Tbv0[nv0]; +typedef double Tbs0[nv0*VLEN]; typedef struct { - Tbv sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; + Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; } s0data_v; typedef struct { - Tbs sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; + Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; } s0data_s; typedef union @@ -64,16 +62,19 @@ typedef union s0data_s s; } s0data_u; +typedef Tv Tbvx[nvx]; +typedef double Tbsx[nvx*VLEN]; + typedef struct { - Tbv sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, - p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; } sxdata_v; typedef struct { - Tbs sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, - p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; } sxdata_s; typedef union @@ -146,6 +147,9 @@ static void mypow(Tv val, int npow, const double * restrict powlimit, static inline void getCorfac(Tv scale, Tv * restrict corfac, const double * restrict cf) { + typedef union + { Tv v; double s[VLEN]; } Tvu; + Tvu sc, corf; sc.v=scale; for (int i=0; igen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); for (int i=0; il1p[i],&d->l1m[i],&d->l2p[i],&d->l2m[i],d->cth[i],fx[l+1]); - rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]); + d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) below_limit &= vallTrue(vlt(d->scp[i],limscale)) && @@ -368,7 +368,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; iter_to_ieee_spin(gen, d, &l, nv2); - job->opcnt += (l-gen->m) * 10*nth; + job->opcnt += (l-gen->mhi) * 10*nth; if (l>lmax) return; job->opcnt += (lmax+1-l) * 28*nth; @@ -406,10 +406,10 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, d->p1mi[i] += aci1*lw1 - agr2*lx2; Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i]; Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i]; - d->p2pr[i] -= agr2*lw2 - aci1*lx1; + d->p2pr[i] += agr2*lw2 - aci1*lx1; d->p2pi[i] += agi2*lw2 + acr1*lx1; d->p2mr[i] += acr2*lw2 + agi1*lx1; - d->p2mi[i] -= aci2*lw2 - agr1*lx1; + d->p2mi[i] += aci2*lw2 - agr1*lx1; d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) @@ -443,7 +443,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; iter_to_ieee_spin(gen, d, &l, nv2); - job->opcnt += (l-gen->m) * 10*nth; + job->opcnt += (l-gen->mhi) * 10*nth; if (l>lmax) return; job->opcnt += (lmax+1-l) * 28*nth; @@ -519,7 +519,6 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, } map2alm_spin_kernel(d, fx, alm, l, lmax, nv2); } -#endif NOINLINE static void alm2map_kernel(s0data_v * restrict d, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, @@ -678,19 +677,21 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; sharp_Ylmgen_prepare (gen, m); switch (job->type) { - case SHARP_ALM2MAP: case SHARP_ALM2MAP_DERIV1: + UTIL_FAIL("derivatives currently not supported"); + break; + case SHARP_ALM2MAP: { if (job->spin==0) { + const int nval=nv0*VLEN; int ith=0; - int itgt[nvec*VLEN]; + int itgt[nval]; while (ithainfo->mval[mi]; sharp_Ylmgen_prepare (gen, m); @@ -818,6 +819,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, { if (job->spin==0) { + const int nval=nv0*VLEN; int ith=0; while (ith=8,"usage: grid lmax mmax geom1 geom2 spin"); - int lmax=atoi(argv[3]); - int mmax=atoi(argv[4]); - int gpar1=atoi(argv[5]); - int gpar2=atoi(argv[6]); - int spin=atoi(argv[7]); - - if (mytask==0) printf("Testing map analysis accuracy.\n"); - if (mytask==0) printf("spin=%d\n", spin); - - sharp_geom_info *ginfo; - sharp_alm_info *ainfo; - get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo); - - double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30; - unsigned long long opa2m_min=0, opm2a_min=0; - int nvmin_a2m=-1, nvmin_m2a=-1; - for (int nv=0; nv<=6; ++nv) - { - int ntries=0; - double tacc=0; - do - { - double t_a2m, t_m2a; - unsigned long long op_a2m, op_m2a; - double *err_abs,*err_rel; - do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, - &t_a2m, &t_m2a, &op_a2m, &op_m2a); - - DEALLOC(err_abs); - DEALLOC(err_rel); - tacc+=t_a2m+t_m2a; - ++ntries; - if (nv==0) - { - if (t_a2m