do not support multiple simultaneous transforms any more

This commit is contained in:
Martin Reinecke 2018-12-10 15:05:41 +01:00
parent 65f47d10cc
commit c56747d36e
8 changed files with 167 additions and 333 deletions

14
.gitignore vendored
View file

@ -1,9 +1,11 @@
*.o
*.so
#*
*~
*.pyc
*.pyo
**.o
**.lo
**.la
**.so
**/#*
**~
**.pyc
**.pyo
/auto
/autom4te.cache

View file

@ -487,10 +487,10 @@ NOINLINE static void init_output (sharp_job *job)
{
if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM)
for (int i=0; i<job->ntrans*job->nalm; ++i)
for (int i=0; i<job->nalm; ++i)
clear_alm (job->ainfo,job->alm[i],job->flags);
else
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
clear_map (job->ginfo,job->map[i],job->flags);
}
@ -498,24 +498,24 @@ NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{
if (job->type==SHARP_MAP2ALM)
{
job->s_m=2*job->ntrans*job->nmaps;
job->s_m=2*job->nmaps;
if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
job->s_th=job->s_m*nm;
}
else
{
job->s_th=2*job->ntrans*job->nmaps;
job->s_th=2*job->nmaps;
if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
job->s_m=job->s_th*ntheta;
}
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
job->phase=RALLOC(dcmplx,2*job->nmaps*nm*ntheta);
}
static void dealloc_phase (sharp_job *job)
{ DEALLOC(job->phase); }
static void alloc_almtmp (sharp_job *job, int lmax)
{ job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); }
{ job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); }
static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); }
@ -526,13 +526,13 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#define COPY_LOOP(real_t, source_t, expr_of_x) \
{ \
for (int l=m; l<lmin; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \
job->almtmp[job->ntrans*job->nalm*l+i] = 0; \
for (int i=0; i<job->nalm; ++i) \
job->almtmp[job->nalm*l+i] = 0; \
for (int l=lmin; l<=lmax; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \
for (int i=0; i<job->nalm; ++i) \
{ \
source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
job->almtmp[job->nalm*l+i] = expr_of_x; \
} \
}
@ -586,8 +586,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
}
}
else
memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
#undef COPY_LOOP
}
@ -597,9 +597,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#define COPY_LOOP(real_t, target_t, expr_of_x) \
for (int l=lmin; l<=lmax; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \
for (int i=0; i<job->nalm; ++i) \
{ \
dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \
dcmplx x = job->almtmp[job->nalm*l+i]; \
*(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x; \
}
@ -660,7 +660,7 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
if (job->flags & SHARP_DP)
{
double **dmap = (double **)job->map;
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1];
@ -680,7 +680,7 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
else
{
float **fmap = (float **)job->map;
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
for (int m=0; m<ri->nph; ++m)
fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
}
@ -690,7 +690,7 @@ NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&ringtmp[i*rstride+1],
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
@ -701,7 +701,7 @@ NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
p1[m] = p2[m*ri->stride];
}
else
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
for (int m=0; m<ri->nph; ++m)
ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
}
@ -711,7 +711,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
{
if (ri->nph<0)
{
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]=0.;
}
@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
if (job->flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_two;
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
@ -738,7 +738,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
if (job->flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_one_half;
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
for (int m=0; m<=mmax; ++m)
if (job->flags & SHARP_DP)
dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m];
@ -769,19 +769,19 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{
int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
}
@ -814,18 +814,18 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
{
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
@ -918,10 +918,8 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
int spin, void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags)
const sharp_alm_info *alm_info, int flags)
{
UTIL_ASSERT((ntrans>0)&&(ntrans<=SHARP_MAXTRANS),
"bad number of simultaneous transforms");
if (type==SHARP_ALM2MAP_DERIV1) spin=1;
if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS;
if (type==SHARP_Yt) type=SHARP_MAP2ALM;
@ -937,23 +935,22 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->ainfo = alm_info;
job->flags = flags;
if ((job->flags&SHARP_NVMAX)==0)
job->flags|=sharp_nv_oracle (type, spin, ntrans);
job->flags|=sharp_nv_oracle (type, spin);
if (alm_info->flags&SHARP_REAL_HARMONICS)
job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.;
job->opcnt = 0;
job->ntrans = ntrans;
job->alm=alm;
job->map=map;
}
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
int flags, double *time, unsigned long long *opcnt)
{
sharp_job job;
sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
ntrans, flags);
flags);
sharp_execute_job (&job);
if (time!=NULL) *time = job.time;
@ -968,7 +965,7 @@ void sharp_set_nchunks_max(int new_nchunks_max)
int sharp_get_nv_max (void)
{ return 6; }
static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
static int sharp_oracle (sharp_jobtype type, int spin)
{
int lmax=511;
int mmax=(lmax+1)/2;
@ -982,7 +979,7 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
int ncomp = ntrans*((spin==0) ? 1 : 2);
int ncomp = (spin==0) ? 1 : 2;
double **map;
ALLOC2D(map,double,ncomp,npix);
@ -1005,7 +1002,7 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
int ntries=0;
do
{
sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,ntrans,
sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,
nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
if (jtime<time) { time=jtime; nvbest=nv; }
@ -1023,26 +1020,18 @@ static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
return nvbest;
}
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
int sharp_nv_oracle (sharp_jobtype type, int spin)
{
static const int maxtr = 6;
static int nv_opt[6][2][5] = {
{{0,0,0,0,0},{0,0,0,0,0}},
{{0,0,0,0,0},{0,0,0,0,0}},
{{0,0,0,0,0},{0,0,0,0,0}},
{{0,0,0,0,0},{0,0,0,0,0}},
{{0,0,0,0,0},{0,0,0,0,0}},
{{0,0,0,0,0},{0,0,0,0,0}} };
static int nv_opt[2][5] = {{0,0,0,0,0},{0,0,0,0,0}};
if (type==SHARP_ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT(type<5,"bad type");
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
UTIL_ASSERT(spin>=0, "bad spin");
ntrans=IMIN(ntrans,maxtr);
if (nv_opt[ntrans-1][spin!=0][type]==0)
nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
return nv_opt[ntrans-1][spin!=0][type];
if (nv_opt[spin!=0][type]==0)
nv_opt[spin!=0][type]=sharp_oracle(type,spin);
return nv_opt[spin!=0][type];
}
#ifdef USE_MPI
@ -1050,11 +1039,11 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt)
{
MPI_Comm comm = *(MPI_Comm*)pcomm;
sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info, ntrans,
sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info,
flags, time, opcnt);
return 0;
}
@ -1063,12 +1052,12 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt)
{
/* Suppress unused warning: */
(void)pcomm; (void)type; (void)spin; (void)alm; (void)map; (void)geom_info;
(void)alm_info; (void)ntrans; (void)flags; (void)time; (void)opcnt;
(void)alm_info; (void)flags; (void)time; (void)opcnt;
return SHARP_ERROR_NO_MPI;
}

View file

@ -78,164 +78,27 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *mlim)
{
int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
if (njobs<=MAXJOB_SPECIAL)
int nv=job->flags&SHARP_NVMAX;
switch (nv)
{
switch (njobs*16+nv)
{
#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
case 0x11:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x12:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x13:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x14:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x15:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x16:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
case 0x21:
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x22:
CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x23:
CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x24:
CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x25:
CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x26:
CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
case 0x31:
CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x32:
CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x33:
CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x34:
CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x35:
CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x36:
CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
case 0x41:
CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x42:
CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x43:
CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x44:
CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x45:
CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x46:
CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
case 0x51:
CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x52:
CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x53:
CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x54:
CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x55:
CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x56:
CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
case 0x61:
CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x62:
CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x63:
CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x64:
CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x65:
CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x66:
CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
}
case 0x1:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x2:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x3:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x4:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x5:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x6:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
}
#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
else
{
switch (nv)
{
case 1:
CONCAT2(inner_loop,1)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 2:
CONCAT2(inner_loop,2)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 3:
CONCAT2(inner_loop,3)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 4:
CONCAT2(inner_loop,4)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 5:
CONCAT2(inner_loop,5)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 6:
CONCAT2(inner_loop,6)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
}
}
#endif
UTIL_FAIL("Incorrect vector parameters");
}

View file

@ -55,12 +55,11 @@ typedef struct
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
double time;
int ntrans;
unsigned long long opcnt;
} sharp_job;
int sharp_get_nv_max (void);
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
int sharp_nv_oracle (sharp_jobtype type, int spin);
int sharp_get_mlim (int lmax, int spin, double sth, double cth);
#endif

View file

@ -223,7 +223,6 @@ typedef enum { SHARP_DP = 1<<4,
\param alm_info A \c sharp_alm_info object compatible with the provided
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
exactly once.
\param ntrans the number of simultaneous SHTs
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
\a alm is expected to have the type "complex double **" and \a map is
expected to have the type "double **"; otherwise, the expected
@ -233,7 +232,7 @@ typedef enum { SHARP_DP = 1<<4,
\param opcnt If not NULL, a conservative estimate of the total floating point
operation count for this SHT will be written here. */
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
int flags, double *time, unsigned long long *opcnt);
void sharp_set_chunksize_min(int new_chunksize_min);
@ -258,7 +257,7 @@ typedef enum { SHARP_ERROR_NO_MPI = 1,
*/
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt);

View file

@ -101,7 +101,7 @@ static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
DEALLOC(theta_tmp);
DEALLOC(ispair_tmp);
minfo->nph=2*job->nmaps*job->ntrans;
minfo->nph=2*job->nmaps;
minfo->almcount=RALLOC(int,minfo->ntasks);
minfo->almdisp=RALLOC(int,minfo->ntasks+1);
@ -184,8 +184,8 @@ static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
{
ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
(ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
job->s_m=2*job->ntrans*job->nmaps;
job->phase=RALLOC(dcmplx,2*job->nmaps*phase_size);
job->s_m=2*job->nmaps;
job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
}
@ -315,12 +315,12 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt)
{
sharp_job job;
sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
ntrans, flags);
flags);
sharp_execute_job_mpi (&job, comm);
if (time!=NULL) *time = job.time;
@ -331,15 +331,15 @@ void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
without declaring it in C header as it should not be available to C code */
void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt);
void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt)
{
sharp_execute_mpi(MPI_Comm_f2c(comm), type, spin, alm, map, geom_info,
alm_info, ntrans, flags, time, opcnt);
alm_info, flags, time, opcnt);
}
#endif

View file

@ -62,7 +62,6 @@ extern "C" {
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
exactly once in the union of all \a alm_info objects over the participating
MPI tasks.
\param ntrans the number of simultaneous SHTs
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
\a alm is expected to have the type "complex double **" and \a map is
expected to have the type "double **"; otherwise, the expected
@ -73,7 +72,7 @@ extern "C" {
operation count for this SHT will be written here. */
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
const sharp_alm_info *alm_info, int flags, double *time,
unsigned long long *opcnt);
#ifdef __cplusplus

View file

@ -358,97 +358,83 @@ static void check_sign_scale(void)
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
for (int ntrans=1; ntrans<10; ++ntrans)
{
double **map;
ALLOC2D(map,double,2*ntrans,npix);
double **map;
ALLOC2D(map,double,2,npix);
dcmplx **alm;
ALLOC2D(alm,dcmplx,2*ntrans,nalms);
for (int i=0; i<2*ntrans; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
dcmplx **alm;
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[it][0 ], 3.588246976618616912e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ], 2.750897760535633285e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2], 3.137704477368562905e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-8.405730859837063917e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-2.398026536095463346e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ], 3.588246976618616912e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
"error");
sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-1.398186224727334448e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.456676000884031197e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.516249174408820863e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-3.173406200299964119e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][0 ],-2.398026536095463346e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),
"error");
sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,ntrans,
SHARP_DP,NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-6.859393905369091105e-01,1e-11),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.103947835973212364e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.092463246472086439e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-1.411433220713928165e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ],-1.398186224727334448e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][0 ],-3.173406200299964119e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
"error");
DEALLOC2D(map);
DEALLOC2D(alm);
}
sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
SHARP_DP,NULL,NULL);
UTIL_ASSERT(FAPPROX(map[0][0 ],-6.859393905369091105e-01,1e-11),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.103947835973212364e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.092463246472086439e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][0 ],-1.411433220713928165e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix/2],-1.146122859381925082e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
"error");
DEALLOC2D(map);
DEALLOC2D(alm);
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
}
static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
int spin, int ntrans, int nv, double **err_abs, double **err_rel,
int spin, int nv, double **err_abs, double **err_rel,
double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
unsigned long long *op_m2a)
{
ptrdiff_t nalms = get_nalms(ainfo);
int ncomp = ntrans*((spin==0) ? 1 : 2);
int ncomp = (spin==0) ? 1 : 2;
size_t npix = get_npix(ginfo);
double **map;
@ -463,9 +449,9 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
#ifdef USE_MPI
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
ainfo,ntrans, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
#else
sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
SHARP_DP|nv,t_a2m,op_a2m);
#endif
if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
@ -473,9 +459,9 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
#ifdef USE_MPI
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
ainfo,ntrans,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
#else
sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
#endif
if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
@ -488,11 +474,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
}
static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
int spin, int ntrans, int nv)
int spin, int nv)
{
int ncomp = ntrans*((spin==0) ? 1 : 2);
int ncomp = (spin==0) ? 1 : 2;
double *err_abs, *err_rel;
do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel, NULL, NULL,
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
NULL, NULL);
for (int i=0; i<ncomp; ++i)
UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@ -515,14 +501,13 @@ static void sharp_acctest(void)
int lmax=127, mmax=127, nlat=128, nlon=256;
get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
for (int nv=1; nv<=6; ++nv)
for (int ntrans=1; ntrans<=6; ++ntrans)
{
check_accuracy(ginfo,ainfo,0,ntrans,nv);
check_accuracy(ginfo,ainfo,1,ntrans,nv);
check_accuracy(ginfo,ainfo,2,ntrans,nv);
check_accuracy(ginfo,ainfo,3,ntrans,nv);
check_accuracy(ginfo,ainfo,30,ntrans,nv);
}
{
check_accuracy(ginfo,ainfo,0,nv);
check_accuracy(ginfo,ainfo,1,nv);
check_accuracy(ginfo,ainfo,2,nv);
check_accuracy(ginfo,ainfo,3,nv);
check_accuracy(ginfo,ainfo,30,nv);
}
sharp_destroy_alm_info(ainfo);
sharp_destroy_geom_info(ginfo);
if (mytask==0) printf("Passed.\n\n");
@ -531,22 +516,21 @@ static void sharp_acctest(void)
static void sharp_test (int argc, const char **argv)
{
if (mytask==0) sharp_announce("sharp_test");
UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
int lmax=atoi(argv[3]);
int mmax=atoi(argv[4]);
int gpar1=atoi(argv[5]);
int gpar2=atoi(argv[6]);
int spin=atoi(argv[7]);
int ntrans=atoi(argv[8]);
if (mytask==0) printf("Testing map analysis accuracy.\n");
if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
if (mytask==0) printf("spin=%d\n", spin);
sharp_geom_info *ginfo;
sharp_alm_info *ainfo;
get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
int ncomp = ntrans*((spin==0) ? 1 : 2);
int ncomp = (spin==0) ? 1 : 2;
double t_a2m=1e30, t_m2a=1e30;
unsigned long long op_a2m, op_m2a;
double *err_abs,*err_rel;
@ -557,7 +541,7 @@ static void sharp_test (int argc, const char **argv)
{
++nrpt;
double ta2m2, tm2a2;
do_sht (ginfo, ainfo, spin, ntrans, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
&op_a2m, &op_m2a);
if (ta2m2<t_a2m) t_a2m=ta2m2;
if (tm2a2<t_m2a) t_m2a=tm2a2;
@ -610,7 +594,7 @@ static void sharp_test (int argc, const char **argv)
printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
" %9.2f %6.2f %.2e %.2e\n",
getenv("HOST"),argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
ntrans,t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
100.*(1.-iosize/tmem),maxerel,maxeabs);
DEALLOC(err_abs);
@ -620,16 +604,15 @@ static void sharp_test (int argc, const char **argv)
static void sharp_bench (int argc, const char **argv)
{
if (mytask==0) sharp_announce("sharp_bench");
UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
int lmax=atoi(argv[3]);
int mmax=atoi(argv[4]);
int gpar1=atoi(argv[5]);
int gpar2=atoi(argv[6]);
int spin=atoi(argv[7]);
int ntrans=atoi(argv[8]);
if (mytask==0) printf("Testing map analysis accuracy.\n");
if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
if (mytask==0) printf("spin=%d\n", spin);
sharp_geom_info *ginfo;
sharp_alm_info *ainfo;
@ -647,7 +630,7 @@ static void sharp_bench (int argc, const char **argv)
double t_a2m, t_m2a;
unsigned long long op_a2m, op_m2a;
double *err_abs,*err_rel;
do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel,
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
&t_a2m, &t_m2a, &op_a2m, &op_m2a);
DEALLOC(err_abs);