/* * This file is part of libsharp. * * libsharp is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * libsharp is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with libsharp; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt * (DLR). */ /*! \file sharp.c * Spherical transform library * * Copyright (C) 2006-2013 Max-Planck-Society * \author Martin Reinecke \author Dag Sverre Seljebotn */ #include #include "ls_fft.h" #include "sharp_ylmgen_c.h" #include "sharp_internal.h" #include "c_utils.h" #include "sharp_core.h" #include "sharp_vecutil.h" #include "walltime_c.h" #include "sharp_almhelpers.h" #include "sharp_geomhelpers.h" typedef complex double dcmplx; typedef complex float fcmplx; static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_two = 1.414213562373095145474621858739; static int chunksize_min=500, nchunks_max=10; static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) { *chunksize = (ndata+nchunks_max-1)/nchunks_max; if (*chunksize>=chunksize_min) // use max number of chunks *chunksize = ((*chunksize+nmult-1)/nmult)*nmult; else // need to adjust chunksize and nchunks { *nchunks = (ndata+chunksize_min-1)/chunksize_min; *chunksize = (ndata+(*nchunks)-1)/(*nchunks); if (*nchunks>1) *chunksize = ((*chunksize+nmult-1)/nmult)*nmult; } *nchunks = (ndata+(*chunksize)-1)/(*chunksize); } int sharp_get_mlim (int lmax, int spin, double sth, double cth) { double ofs=lmax*0.01; if (ofs<100.) ofs=100.; double b = -2*spin*fabs(cth); double t1 = lmax*sth+ofs; double c = (double)spin*spin-t1*t1; double discr = b*b-4*c; if (discr<=0) return lmax; double res=(-b+sqrt(discr))/2.; if (res>lmax) res=lmax; return (int)(res+0.5); } typedef struct { double phi0_; dcmplx *shiftarr; int s_shift; real_plan plan; int norot; } ringhelper; static void ringhelper_init (ringhelper *self) { static ringhelper rh_null = { 0, NULL, 0, NULL, 0 }; *self = rh_null; } static void ringhelper_destroy (ringhelper *self) { if (self->plan) kill_real_plan(self->plan); DEALLOC(self->shiftarr); ringhelper_init(self); } static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) { self->norot = (fabs(phi0)<1e-14); if (!(self->norot)) if ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12))) { RESIZE (self->shiftarr,dcmplx,mmax+1); self->s_shift = mmax+1; self->phi0_ = phi0; for (int m=0; m<=mmax; ++m) self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); } if (!self->plan) self->plan=make_real_plan(nph); if (nph!=(int)self->plan->length) { kill_real_plan(self->plan); self->plan=make_real_plan(nph); } } static int ringinfo_compare (const void *xa, const void *xb) { const sharp_ringinfo *a=xa, *b=xb; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; } static int ringpair_compare (const void *xa, const void *xb) { const sharp_ringpair *a=xa, *b=xb; if (a->r1.nph==b->r1.nph) return (a->r1.phi0 < b->r1.phi0) ? -1 : ((a->r1.phi0 > b->r1.phi0) ? 1 : (a->r1.cth>b->r1.cth ? -1 : 1)); return (a->r1.nphr1.nph) ? -1 : 1; } void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, const ptrdiff_t *mstart, int flags, sharp_alm_info **alm_info) { sharp_alm_info *info = RALLOC(sharp_alm_info,1); info->lmax = lmax; info->nm = nm; info->mval = RALLOC(int,nm); info->mvstart = RALLOC(ptrdiff_t,nm); info->stride = stride; info->flags = flags; for (int mi=0; mimval[mi] = mval[mi]; info->mvstart[mi] = mstart[mi]; } *alm_info = info; } void sharp_make_alm_info (int lmax, int mmax, int stride, const ptrdiff_t *mstart, sharp_alm_info **alm_info) { int *mval=RALLOC(int,mmax+1); for (int i=0; i<=mmax; ++i) mval[i]=i; sharp_make_general_alm_info (lmax, mmax+1, stride, mval, mstart, 0, alm_info); DEALLOC(mval); } ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi) { UTIL_ASSERT(!(self->flags & SHARP_PACKED), "sharp_alm_index not applicable with SHARP_PACKED alms"); return self->mvstart[mi]+self->stride*l; } ptrdiff_t sharp_alm_count(const sharp_alm_info *self) { ptrdiff_t result=0; for (int im=0; im!=self->nm; ++im) { int m=self->mval[im]; ptrdiff_t x=(self->lmax + 1 - m); if ((m!=0)&&((self->flags&SHARP_PACKED)!=0)) result+=2*x; else result+=x; } return result; } void sharp_destroy_alm_info (sharp_alm_info *info) { DEALLOC (info->mval); DEALLOC (info->mvstart); DEALLOC (info); } void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, const int *stride, const double *phi0, const double *theta, const double *wgt, sharp_geom_info **geom_info) { sharp_geom_info *info = RALLOC(sharp_geom_info,1); sharp_ringinfo *infos = RALLOC(sharp_ringinfo,nrings); int pos=0; info->pair=RALLOC(sharp_ringpair,nrings); info->npairs=0; info->nphmax=0; *geom_info = info; for (int m=0; mnphmaxnphmax=nph[m]; } qsort(infos,nrings,sizeof(sharp_ringinfo),ringinfo_compare); while (pospair[info->npairs].r1=infos[pos]; if ((pos0) // make sure northern ring is in r1 info->pair[info->npairs].r2=infos[pos+1]; else { info->pair[info->npairs].r1=infos[pos+1]; info->pair[info->npairs].r2=infos[pos]; } ++pos; } else info->pair[info->npairs].r2.nph=-1; ++pos; ++info->npairs; } DEALLOC(infos); qsort(info->pair,info->npairs,sizeof(sharp_ringpair),ringpair_compare); } ptrdiff_t sharp_map_size(const sharp_geom_info *info) { ptrdiff_t result = 0; for (int m=0; mnpairs; ++m) { result+=info->pair[m].r1.nph; result+=(info->pair[m].r2.nph>=0) ? (info->pair[m].r2.nph) : 0; } return result; } void sharp_destroy_geom_info (sharp_geom_info *geom_info) { DEALLOC (geom_info->pair); DEALLOC (geom_info); } /* This currently requires all m values from 0 to nm-1 to be present. It might be worthwhile to relax this criterion such that holes in the m distribution are permissible. */ static int sharp_get_mmax (int *mval, int nm) { int *mcheck=RALLOC(int,nm); SET_ARRAY(mcheck,0,nm,0); for (int i=0; i=0) && (m_curnph; ringhelper_update (self, nph, mmax, info->phi0); double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1.; if (flags&SHARP_REAL_HARMONICS) wgt *= sqrt_one_half; if (nph>=2*mmax+1) { for (int m=0; m<=mmax; ++m) { dcmplx tmp = phase[m*pstride]*wgt; if(!self->norot) tmp*=self->shiftarr[m]; data[2*m]=creal(tmp); data[2*m+1]=cimag(tmp); } for (int m=2*(mmax+1); mnorot) tmp*=self->shiftarr[m]; if (idx1<(nph+2)/2) { data[2*idx1]+=creal(tmp); data[2*idx1+1]+=cimag(tmp); } if (idx2<(nph+2)/2) { data[2*idx2]+=creal(tmp); data[2*idx2+1]-=cimag(tmp); } if (++idx1>=nph) idx1=0; if (--idx2<0) idx2=nph-1; } } data[1]=data[0]; real_plan_backward_fftpack (self->plan, &(data[1])); } static void ringhelper_ring2phase (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, int pstride, int flags) { int nph = info->nph; #if 1 int maxidx = mmax; /* Enable this for traditional Healpix compatibility */ #else int maxidx = IMIN(nph-1,mmax); #endif ringhelper_update (self, nph, mmax, -info->phi0); double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1; if (flags&SHARP_REAL_HARMONICS) wgt *= sqrt_two; real_plan_forward_fftpack (self->plan, &(data[1])); data[0]=data[1]; data[1]=data[nph+1]=0.; if (maxidx<=nph/2) { if (self->norot) for (int m=0; m<=maxidx; ++m) phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; else for (int m=0; m<=maxidx; ++m) phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; } else { for (int m=0; m<=maxidx; ++m) { int idx=m%nph; dcmplx val; if (idx<(nph-idx)) val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; else val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; if (!self->norot) val *= self->shiftarr[m]; phase[m*pstride]=val; } } for (int m=maxidx+1;m<=mmax; ++m) phase[m*pstride]=0.; } static void fill_map (const sharp_geom_info *ginfo, void *map, double value, int flags) { if (flags & SHARP_NO_FFT) { for (int j=0;jnpairs;++j) { if (flags&SHARP_DP) { for (ptrdiff_t i=0;ipair[j].r1.nph;++i) ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] =value; for (ptrdiff_t i=0;ipair[j].r2.nph;++i) ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] =value; } else { for (ptrdiff_t i=0;ipair[j].r1.nph;++i) ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] =(float)value; for (ptrdiff_t i=0;ipair[j].r2.nph;++i) ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] =(float)value; } } } else { for (int j=0;jnpairs;++j) { if (flags&SHARP_DP) { for (ptrdiff_t i=0;ipair[j].r1.nph;++i) ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] =value; for (ptrdiff_t i=0;ipair[j].r2.nph;++i) ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] =value; } else { for (ptrdiff_t i=0;ipair[j].r1.nph;++i) ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] =(float)value; for (ptrdiff_t i=0;ipair[j].r2.nph;++i) ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] =(float)value; } } } } static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) { #define CLEARLOOP(real_t,body) \ { \ real_t *talm = (real_t *)alm; \ for (int l=m;l<=ainfo->lmax;++l) \ body \ } for (int mi=0;minm;++mi) { int m=ainfo->mval[mi]; ptrdiff_t mvstart = ainfo->mvstart[mi]; ptrdiff_t stride = ainfo->stride; if (!(ainfo->flags&SHARP_PACKED)) mvstart*=2; if ((ainfo->flags&SHARP_PACKED)&&(m==0)) { if (flags&SHARP_DP) CLEARLOOP(double, talm[mvstart+l*stride] = 0.;) else CLEARLOOP(float, talm[mvstart+l*stride] = 0.;) } else { stride*=2; if (flags&SHARP_DP) CLEARLOOP(double,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;) else CLEARLOOP(float,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;) } #undef CLEARLOOP } } static void init_output (sharp_job *job) { if (job->flags&SHARP_ADD) return; if (job->type == SHARP_MAP2ALM) for (int i=0; intrans*job->nalm; ++i) clear_alm (job->ainfo,job->alm[i],job->flags); else for (int i=0; intrans*job->nmaps; ++i) fill_map (job->ginfo,job->map[i],0.,job->flags); } static void alloc_phase (sharp_job *job, int nm, int ntheta) { if (job->type==SHARP_MAP2ALM) { if ((nm&1023)==0) nm+=3; // hack to avoid critical strides job->s_m=2*job->ntrans*job->nmaps; job->s_th=job->s_m*nm; } else { if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides job->s_th=2*job->ntrans*job->nmaps; job->s_m=job->s_th*ntheta; } job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); } static void dealloc_phase (sharp_job *job) { DEALLOC(job->phase); } static void alloc_almtmp (sharp_job *job, int lmax) { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); } static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } static void alm2almtmp (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, source_t, expr_of_x) \ for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ for (int i=0; intrans*job->nalm; ++i) \ { \ source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \ job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \ } if (job->type!=SHARP_MAP2ALM) { ptrdiff_t ofs=job->ainfo->mvstart[mi]; int stride=job->ainfo->stride; int m=job->ainfo->mval[mi]; /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the coefficients by sqrt_one_half; here we must compensate to avoid scaling m=0 */ double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_two : 1.; if (!(job->ainfo->flags&SHARP_PACKED)) ofs *= 2; if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0))) stride *= 2; if (job->spin==0) { if (m==0) { if (job->flags&SHARP_DP) COPY_LOOP(double, double, x*norm_m0) else COPY_LOOP(float, float, x*norm_m0) } else { if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x) else COPY_LOOP(float, fcmplx, x) } } else { if (m==0) { if (job->flags&SHARP_DP) COPY_LOOP(double, double, x*job->norm_l[l]*norm_m0) else COPY_LOOP(float, float, x*job->norm_l[l]*norm_m0) } else { if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x*job->norm_l[l]) else COPY_LOOP(float, fcmplx, x*job->norm_l[l]) } } } else SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi], job->ntrans*job->nalm*(lmax+1),0.); #undef COPY_LOOP } static void almtmp2alm (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, target_t, expr_of_x) \ for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ for (int i=0; intrans*job->nalm; ++i) \ { \ dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \ *(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x; \ } if (job->type != SHARP_MAP2ALM) return; ptrdiff_t ofs=job->ainfo->mvstart[mi]; int stride=job->ainfo->stride; int m=job->ainfo->mval[mi]; /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the coefficients by sqrt_two; here we must compensate to avoid scaling m=0 */ double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_one_half : 1.; if (!(job->ainfo->flags&SHARP_PACKED)) ofs *= 2; if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0))) stride *= 2; if (job->spin==0) { if (m==0) { if (job->flags&SHARP_DP) COPY_LOOP(double, double, creal(x)*norm_m0) else COPY_LOOP(float, float, crealf(x)*norm_m0) } else { if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x) else COPY_LOOP(float, fcmplx, (fcmplx)x) } } else { if (m==0) { if (job->flags&SHARP_DP) COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) else COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) } else { if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x*job->norm_l[l]) else COPY_LOOP(float, fcmplx, (fcmplx)(x*job->norm_l[l])) } } #undef COPY_LOOP } static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, int rstride) { double **dmap = (double **)job->map; float **fmap = (float **)job->map; for (int i=0; intrans*job->nmaps; ++i) for (int m=0; mnph; ++m) if (job->flags & SHARP_DP) dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1]; else fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1]; } static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, int rstride) { for (int i=0; intrans*job->nmaps; ++i) for (int m=0; mnph; ++m) ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ? ((double *)(job->map[i]))[ri->ofs+m*ri->stride] : ((float *)(job->map[i]))[ri->ofs+m*ri->stride]; } static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, dcmplx *phase) { if (ri->nph<0) { for (int i=0; intrans*job->nmaps; ++i) for (int m=0; m<=mmax; ++m) phase[2*i+job->s_m*m]=0.; } else { UTIL_ASSERT(ri->nph==mmax+1,"bad ring size"); double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.; if (job->flags&SHARP_REAL_HARMONICS) wgt *= sqrt_two; for (int i=0; intrans*job->nmaps; ++i) for (int m=0; m<=mmax; ++m) phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; } } static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, dcmplx *phase) { if (ri->nph<0) return; UTIL_ASSERT(ri->nph==mmax+1,"bad ring size"); dcmplx **dmap = (dcmplx **)job->map; fcmplx **fmap = (fcmplx **)job->map; double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.; if (job->flags&SHARP_REAL_HARMONICS) wgt *= sqrt_one_half; for (int i=0; intrans*job->nmaps; ++i) for (int m=0; m<=mmax; ++m) if (job->flags & SHARP_DP) dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m]; else fmap[i][ri->ofs+m*ri->stride] += (fcmplx)(wgt*phase[2*i+job->s_m*m]); } //FIXME: set phase to zero if not SHARP_MAP2ALM? static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; int pstride = job->s_m; if (job->flags & SHARP_NO_FFT) { for (int ith=llim; iths_th*(ith-llim); ring2phase_direct(job,&(job->ginfo->pair[ith].r1),mmax, &(job->phase[dim2])); ring2phase_direct(job,&(job->ginfo->pair[ith].r2),mmax, &(job->phase[dim2+1])); } } else { #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0) { ringhelper helper; ringhelper_init(&helper); int rstride=job->ginfo->nphmax+2; double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride); #pragma omp for schedule(dynamic,1) for (int ith=llim; iths_th*(ith-llim); ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); for (int i=0; intrans*job->nmaps; ++i) ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); if (job->ginfo->pair[ith].r2.nph>0) { ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); for (int i=0; intrans*job->nmaps; ++i) ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); } } DEALLOC(ringtmp); ringhelper_destroy(&helper); } /* end of parallel region */ } } static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; int pstride = job->s_m; if (job->flags & SHARP_NO_FFT) { for (int ith=llim; iths_th*(ith-llim); phase2ring_direct(job,&(job->ginfo->pair[ith].r1),mmax, &(job->phase[dim2])); phase2ring_direct(job,&(job->ginfo->pair[ith].r2),mmax, &(job->phase[dim2+1])); } } else { #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0) { ringhelper helper; ringhelper_init(&helper); int rstride=job->ginfo->nphmax+2; double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride); #pragma omp for schedule(dynamic,1) for (int ith=llim; iths_th*(ith-llim); for (int i=0; intrans*job->nmaps; ++i) ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); if (job->ginfo->pair[ith].r2.nph>0) { for (int i=0; intrans*job->nmaps; ++i) ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); } } DEALLOC(ringtmp); ringhelper_destroy(&helper); } /* end of parallel region */ } } static void sharp_execute_job (sharp_job *job) { double timer=wallTime(); job->opcnt=0; int lmax = job->ainfo->lmax, mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm); job->norm_l = (job->type==SHARP_ALM2MAP_DERIV1) ? sharp_Ylmgen_get_d1norm (lmax) : sharp_Ylmgen_get_norm (lmax, job->spin); /* clear output arrays if requested */ init_output (job); int nchunks, chunksize; get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks, &chunksize); alloc_phase (job,mmax+1,chunksize); /* chunk loop */ for (int chunk=0; chunkginfo->npairs); int *ispair = RALLOC(int,ulim-llim); int *mlim = RALLOC(int,ulim-llim); double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim); for (int i=0; iginfo->pair[i+llim].r2.nph>0; cth[i] = job->ginfo->pair[i+llim].r1.cth; sth[i] = job->ginfo->pair[i+llim].r1.sth; mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]); } /* map->phase where necessary */ map2phase (job, mmax, llim, ulim); #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0) { sharp_job ljob = *job; ljob.opcnt=0; sharp_Ylmgen_C generator; sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin); alloc_almtmp(&ljob,lmax); #pragma omp for schedule(dynamic,1) for (int mi=0; miainfo->nm; ++mi) { /* alm->alm_tmp where necessary */ alm2almtmp (&ljob, lmax, mi); inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); /* alm_tmp->alm where necessary */ almtmp2alm (&ljob, lmax, mi); } sharp_Ylmgen_destroy(&generator); dealloc_almtmp(&ljob); #pragma omp critical job->opcnt+=ljob.opcnt; } /* end of parallel region */ /* phase->map where necessary */ phase2map (job, mmax, llim, ulim); DEALLOC(ispair); DEALLOC(mlim); DEALLOC(cth); DEALLOC(sth); } /* end of chunk loop */ DEALLOC(job->norm_l); dealloc_phase (job); job->time=wallTime()-timer; } static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin, void *alm, void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans, int flags) { UTIL_ASSERT((ntrans>0)&&(ntrans<=SHARP_MAXTRANS), "bad number of simultaneous transforms"); if (type==SHARP_ALM2MAP_DERIV1) spin=1; if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS; if (type==SHARP_Yt) type=SHARP_MAP2ALM; if (type==SHARP_WY) { type=SHARP_ALM2MAP; flags|=SHARP_USE_WEIGHTS; } UTIL_ASSERT((spin>=0)&&(spin<=alm_info->lmax), "bad spin"); job->type = type; job->spin = spin; job->norm_l = NULL; job->nmaps = (type==SHARP_ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1); job->nalm = (type==SHARP_ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 2 : 1); job->ginfo = geom_info; job->ainfo = alm_info; job->flags = flags; if ((job->flags&SHARP_NVMAX)==0) job->flags|=sharp_nv_oracle (type, spin, ntrans); if (alm_info->flags&SHARP_REAL_HARMONICS) job->flags|=SHARP_REAL_HARMONICS; job->time = 0.; job->opcnt = 0; job->ntrans = ntrans; job->alm=alm; job->map=map; } void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans, int flags, double *time, unsigned long long *opcnt) { sharp_job job; sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info, ntrans, flags); sharp_execute_job (&job); if (time!=NULL) *time = job.time; if (opcnt!=NULL) *opcnt = job.opcnt; } void sharp_set_chunksize_min(int new_chunksize_min) { chunksize_min=new_chunksize_min; } void sharp_set_nchunks_max(int new_nchunks_max) { nchunks_max=new_nchunks_max; } int sharp_get_nv_max (void) { return 6; } static int sharp_oracle (sharp_jobtype type, int spin, int ntrans) { int lmax=511; int mmax=(lmax+1)/2; int nrings=(lmax+1)/4; int ppring=1; spin = (spin!=0) ? 2 : 0; ptrdiff_t npix=(ptrdiff_t)nrings*ppring; sharp_geom_info *tinfo; sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo); ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); int ncomp = ntrans*((spin==0) ? 1 : 2); double **map; ALLOC2D(map,double,ncomp,npix); SET_ARRAY(map[0],0,npix*ncomp,0.); sharp_alm_info *alms; sharp_make_triangular_alm_info(lmax,mmax,1,&alms); dcmplx **alm; ALLOC2D(alm,dcmplx,ncomp,nalms); SET_ARRAY(alm[0],0,nalms*ncomp,0.); double time=1e30; int nvbest=-1; for (int nv=1; nv<=sharp_get_nv_max(); ++nv) { double time_acc=0.; double jtime; int ntries=0; do { sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,ntrans, nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL); if (jtime0),"bad number of simultaneous transforms"); UTIL_ASSERT(spin>=0, "bad spin"); ntrans=IMIN(ntrans,maxtr); if (nv_opt[ntrans-1][spin!=0][type]==0) nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans); return nv_opt[ntrans-1][spin!=0][type]; } #ifdef USE_MPI #include "sharp_mpi.c" int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin, void *alm, void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans, int flags, double *time, unsigned long long *opcnt) { MPI_Comm comm = *(MPI_Comm*)pcomm; sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info, ntrans, flags, time, opcnt); return 0; } #else int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin, void *alm, void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans, int flags, double *time, unsigned long long *opcnt) { /* Suppress unused warning: */ (void)pcomm; (void)type; (void)spin; (void)alm; (void)map; (void)geom_info; (void)alm_info; (void)ntrans; (void)flags; (void)time; (void)opcnt; return SHARP_ERROR_NO_MPI; } #endif