/*
 *  This file is part of libsharp.
 *
 *  libsharp is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libsharp is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libsharp; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */

/*! \file sharp.c
 *  Spherical transform library
 *
 *  Copyright (C) 2006-2019 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

#include <math.h>
#include <string.h>
#include <complex.h>
#include "pocketfft/pocketfft.h"
#include "libsharp/sharp_ylmgen_c.h"
#include "libsharp/sharp_internal.h"
#include "c_utils/c_utils.h"
#include "c_utils/walltime_c.h"
#include "libsharp/sharp_almhelpers.h"
#include "libsharp/sharp_geomhelpers.h"

typedef complex double dcmplx;
typedef complex float fcmplx;

static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739;

static int chunksize_min=500, nchunks_max=10;

static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
  {
  *chunksize = (ndata+nchunks_max-1)/nchunks_max;
  if (*chunksize>=chunksize_min) // use max number of chunks
    *chunksize = ((*chunksize+nmult-1)/nmult)*nmult;
  else // need to adjust chunksize and nchunks
    {
    *nchunks = (ndata+chunksize_min-1)/chunksize_min;
    *chunksize = (ndata+(*nchunks)-1)/(*nchunks);
    if (*nchunks>1)
      *chunksize = ((*chunksize+nmult-1)/nmult)*nmult;
    }
  *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
  }

NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
  {
  double ofs=lmax*0.01;
  if (ofs<100.) ofs=100.;
  double b = -2*spin*fabs(cth);
  double t1 = lmax*sth+ofs;
  double c = (double)spin*spin-t1*t1;
  double discr = b*b-4*c;
  if (discr<=0) return lmax;
  double res=(-b+sqrt(discr))/2.;
  if (res>lmax) res=lmax;
  return (int)(res+0.5);
  }
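/* Worked example (purely illustrative): for lmax=2048, spin=0 and a ring at
   theta=5 degrees (sth~=0.0872, cth~=0.9962), ofs is clamped to 100, b=0, and
   the positive root reduces to lmax*sth+ofs ~= 278.5, so the function returns
   279.  Contributions with m above this limit are negligible on such a ring
   and are skipped by the transform loops via the mlim array. */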
typedef struct
  {
  double phi0_;
  dcmplx *shiftarr;
  int s_shift;
  rfft_plan plan;
  int length;
  int norot;
  } ringhelper;

static void ringhelper_init (ringhelper *self)
  {
  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
  *self = rh_null;
  }

static void ringhelper_destroy (ringhelper *self)
  {
  if (self->plan) destroy_rfft_plan(self->plan);
  DEALLOC(self->shiftarr);
  ringhelper_init(self);
  }

NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax,
  double phi0)
  {
  self->norot = (fabs(phi0)<1e-14);
  if (!(self->norot))
    if ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12)))
      {
      RESIZE (self->shiftarr,dcmplx,mmax+1);
      self->s_shift = mmax+1;
      self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc.
      for (int m=0; m<=mmax; ++m)
        self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
//      double *tmp=(double *) self->shiftarr;
//      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
      }
//  if (!self->plan) self->plan=make_rfft_plan(nph);
  if (nph!=(int)self->length)
    {
    if (self->plan) destroy_rfft_plan(self->plan);
    self->plan=make_rfft_plan(nph);
    self->length=nph;
    }
  }

static int ringinfo_compare (const void *xa, const void *xb)
  {
  const sharp_ringinfo *a=xa, *b=xb;
  return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
  }

static int ringpair_compare (const void *xa, const void *xb)
  {
  const sharp_ringpair *a=xa, *b=xb;
//  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
  if (a->r1.nph==b->r1.nph)
    return (a->r1.phi0 < b->r1.phi0) ? -1 :
      ((a->r1.phi0 > b->r1.phi0) ? 1 :
        (a->r1.cth>b->r1.cth ? -1 : 1));
  return (a->r1.nph<b->r1.nph) ? -1 : 1;
  }

void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
  const ptrdiff_t *mstart, int flags, sharp_alm_info **alm_info)
  {
  sharp_alm_info *info = RALLOC(sharp_alm_info,1);
  info->lmax = lmax;
  info->nm = nm;
  info->mval = RALLOC(int,nm);
  info->mvstart = RALLOC(ptrdiff_t,nm);
  info->stride = stride;
  info->flags = flags;
  for (int mi=0; mi<nm; ++mi)
    {
    info->mval[mi] = mval[mi];
    info->mvstart[mi] = mstart[mi];
    }
  *alm_info = info;
  }

void sharp_make_alm_info (int lmax, int mmax, int stride,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
  {
  int *mval=RALLOC(int,mmax+1);
  for (int i=0; i<=mmax; ++i)
    mval[i]=i;
  sharp_make_general_alm_info (lmax, mmax+1, stride, mval, mstart, 0, alm_info);
  DEALLOC(mval);
  }

ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi)
  {
  UTIL_ASSERT(!(self->flags & SHARP_PACKED),
    "sharp_alm_index not applicable with SHARP_PACKED alms");
  return self->mvstart[mi]+self->stride*l;
  }

ptrdiff_t sharp_alm_count(const sharp_alm_info *self)
  {
  ptrdiff_t result=0;
  for (int im=0; im!=self->nm; ++im)
    {
    int m=self->mval[im];
    ptrdiff_t x=(self->lmax + 1 - m);
    if ((m!=0)&&((self->flags&SHARP_PACKED)!=0)) result+=2*x;
    else result+=x;
    }
  return result;
  }

void sharp_destroy_alm_info (sharp_alm_info *info)
  {
  DEALLOC (info->mval);
  DEALLOC (info->mvstart);
  DEALLOC (info);
  }
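/* Example (illustrative sketch, not part of the library): building an alm_info
   for the common "triangular" a_lm storage, in which the coefficient (l,m)
   sits at index m*(2*lmax+1-m)/2 + l.  sharp_almhelpers.h provides ready-made
   constructors for this and other layouts; done by hand (assuming lmax and
   mmax are already set) it would look like this:

     ptrdiff_t *mstart = RALLOC(ptrdiff_t,mmax+1);
     for (int m=0; m<=mmax; ++m)
       mstart[m] = ((ptrdiff_t)m*(2*lmax+1-m))/2;
     sharp_alm_info *ainfo;
     sharp_make_alm_info (lmax, mmax, 1, mstart, &ainfo);
     DEALLOC(mstart);
     // sharp_alm_index(ainfo,l,m) now yields m*(2*lmax+1-m)/2 + l

   mstart[m] is the index that the (hypothetical) coefficient (0,m) would have,
   so that mstart[m] + stride*l addresses (l,m); entries with l<m are simply
   never referenced. */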
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
  const int *stride, const double *phi0, const double *theta,
  const double *wgt, sharp_geom_info **geom_info)
  {
  sharp_geom_info *info = RALLOC(sharp_geom_info,1);
  sharp_ringinfo *infos = RALLOC(sharp_ringinfo,nrings);

  int pos=0;
  info->pair=RALLOC(sharp_ringpair,nrings);
  info->npairs=0;
  info->nphmax=0;
  *geom_info = info;

  for (int m=0; m<nrings; ++m)
    {
    infos[m].theta = theta[m];
    infos[m].cth = cos(theta[m]);
    infos[m].sth = sin(theta[m]);
    infos[m].weight = (wgt != NULL) ? wgt[m] : 1.;
    infos[m].phi0 = phi0[m];
    infos[m].ofs = ofs[m];
    infos[m].stride = stride[m];
    infos[m].nph = nph[m];
    if (info->nphmax<nph[m]) info->nphmax=nph[m];
    }
  qsort(infos,nrings,sizeof(sharp_ringinfo),ringinfo_compare);
  while (pos<nrings)
    {
    info->pair[info->npairs].r1=infos[pos];
    if ((pos<nrings-1) && FAPPROX(infos[pos].cth,-infos[pos+1].cth,1e-12))
      {
      if (infos[pos].cth>0) // make sure northern ring is in r1
        info->pair[info->npairs].r2=infos[pos+1];
      else
        {
        info->pair[info->npairs].r1=infos[pos+1];
        info->pair[info->npairs].r2=infos[pos];
        }
      ++pos;
      }
    else
      info->pair[info->npairs].r2.nph=-1;
    ++pos;
    ++info->npairs;
    }
  DEALLOC(infos);

  qsort(info->pair,info->npairs,sizeof(sharp_ringpair),ringpair_compare);
  }

ptrdiff_t sharp_map_size(const sharp_geom_info *info)
  {
  ptrdiff_t result = 0;
  for (int m=0; m<info->npairs; ++m)
    {
    result+=info->pair[m].r1.nph;
    result+=(info->pair[m].r2.nph>=0) ? (info->pair[m].r2.nph) : 0;
    }
  return result;
  }

void sharp_destroy_geom_info (sharp_geom_info *geom_info)
  {
  DEALLOC (geom_info->pair);
  DEALLOC (geom_info);
  }

/* This currently requires all m values from 0 to nm-1 to be present.
   It might be worthwhile to relax this criterion such that holes in the m
   distribution are permissible. */
static int sharp_get_mmax (int *mval, int nm)
  {
  //FIXME: if gaps are allowed, we have to search the maximum m in the array
  int *mcheck=RALLOC(int,nm);
  SET_ARRAY(mcheck,0,nm,0);
  for (int i=0; i<nm; ++i)
    {
    int m_cur=mval[i];
    UTIL_ASSERT((m_cur>=0) && (m_cur<nm), "not all m values are present");
    UTIL_ASSERT(mcheck[m_cur]==0, "duplicate m value");
    mcheck[m_cur]=1;
    }
  DEALLOC(mcheck);
  return nm-1;
  }

NOINLINE static void ringhelper_phase2ring (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
  int pstride, int flags)
  {
  int nph = info->nph;

  ringhelper_update (self, nph, mmax, info->phi0);
  double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1.;
  if (flags&SHARP_REAL_HARMONICS)
    wgt *= sqrt_one_half;
  if (nph>=2*mmax+1)
    {
    if (self->norot)
      for (int m=0; m<=mmax; ++m)
        {
        data[2*m]=creal(phase[m*pstride])*wgt;
        data[2*m+1]=cimag(phase[m*pstride])*wgt;
        }
    else
      for (int m=0; m<=mmax; ++m)
        {
        dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
        data[2*m]=creal(tmp)*wgt;
        data[2*m+1]=cimag(tmp)*wgt;
        }
    for (int m=2*(mmax+1); m<nph+2; ++m)
      data[m]=0.;
    }
  else
    {
    // mmax exceeds what this ring can represent: aliasing occurs, so
    // accumulate the coefficients into the zero-initialised buffer
    for (int m=0; m<nph+2; ++m)
      data[m]=0.;
    int idx1=0, idx2=nph-1;
    for (int m=0; m<=mmax; ++m)
      {
      dcmplx tmp = phase[m*pstride]*wgt;
      if (!self->norot) tmp*=self->shiftarr[m];
      if (idx1<(nph+2)/2)
        {
        data[2*idx1]+=creal(tmp);
        data[2*idx1+1]+=cimag(tmp);
        }
      if (idx2<(nph+2)/2)
        {
        data[2*idx2]+=creal(tmp);
        data[2*idx2+1]-=cimag(tmp);
        }
      if (++idx1>=nph) idx1=0;
      if (--idx2<0) idx2=nph-1;
      }
    }
  data[1]=data[0];
  rfft_backward (self->plan, &(data[1]), 1.);
  }

NOINLINE static void ringhelper_ring2phase (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
  int pstride, int flags)
  {
  int nph = info->nph;
#if 1
  int maxidx = mmax; /* Enable this for traditional Healpix compatibility */
#else
  int maxidx = IMIN(nph-1,mmax);
#endif

  ringhelper_update (self, nph, mmax, -info->phi0);
  double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1;
  if (flags&SHARP_REAL_HARMONICS)
    wgt *= sqrt_two;
  rfft_forward (self->plan, &(data[1]), 1.);
  data[0]=data[1];
  data[1]=data[nph+1]=0.;

  if (maxidx<=nph/2)
    {
    if (self->norot)
      for (int m=0; m<=maxidx; ++m)
        phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
    else
      for (int m=0; m<=maxidx; ++m)
        phase[m*pstride] =
          (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
    }
  else
    {
    for (int m=0; m<=maxidx; ++m)
      {
      int idx=m%nph;
      dcmplx val;
      if (idx<(nph-idx))
        val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
      else
        val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
      if (!self->norot)
        val *= self->shiftarr[m];
      phase[m*pstride]=val;
      }
    }

  for (int m=maxidx+1;m<=mmax; ++m)
    phase[m*pstride]=0.;
  }
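/* Buffer convention used by the two functions above (and by ring2ringtmp/
   ringtmp2ring further below): a ring of nph pixels lives in a scratch buffer
   of nph+2 doubles, with the pixel values stored in data[1..nph].  The real
   FFT operates in place on &data[1] in half-complex order; data[0] and
   data[nph+1] supply the extra space needed to re-pack the ring coefficients
   c_m as the pairs (data[2*m], data[2*m+1]). */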
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
  int flags)
  {
  if (flags & SHARP_NO_FFT)
    {
    for (int j=0;j<ginfo->npairs;++j)
      {
      if (flags&SHARP_DP)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      else
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      }
    }
  else
    {
    if (flags&SHARP_DP)
      {
      for (int j=0;j<ginfo->npairs;++j)
        {
        double *dmap=(double *)map;
        if (ginfo->pair[j].r1.stride==1)
          memset(&dmap[ginfo->pair[j].r1.ofs],0,
            ginfo->pair[j].r1.nph*sizeof(double));
        else
          for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
            dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
          memset(&dmap[ginfo->pair[j].r2.ofs],0,
            ginfo->pair[j].r2.nph*sizeof(double));
        else
          for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
            dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      }
    else
      {
      for (int j=0;j<ginfo->npairs;++j)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      }
    }
  }

NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
  int flags)
  {
#define CLEARLOOP(real_t,body)               \
  {                                          \
  real_t *talm = (real_t *)alm;              \
  for (int l=m;l<=ainfo->lmax;++l)           \
    body                                     \
  }

  for (int mi=0;mi<ainfo->nm;++mi)
    {
    int m=ainfo->mval[mi];
    ptrdiff_t mvstart = ainfo->mvstart[mi];
    ptrdiff_t stride = ainfo->stride;
    if (!(ainfo->flags&SHARP_PACKED))
      mvstart*=2;
    if ((ainfo->flags&SHARP_PACKED)&&(m==0))
      {
      if (flags&SHARP_DP)
        CLEARLOOP(double, talm[mvstart+l*stride] = 0.;)
      else
        CLEARLOOP(float, talm[mvstart+l*stride] = 0.;)
      }
    else
      {
      stride*=2;
      if (flags&SHARP_DP)
        CLEARLOOP(double,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;)
      else
        CLEARLOOP(float,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;)
      }

#undef CLEARLOOP
    }
  }

NOINLINE static void init_output (sharp_job *job)
  {
  if (job->flags&SHARP_ADD) return;
  if (job->type == SHARP_MAP2ALM)
    for (int i=0; i<job->nalm; ++i)
      clear_alm (job->ainfo,job->alm[i],job->flags);
  else
    for (int i=0; i<job->nmaps; ++i)
      clear_map (job->ginfo,job->map[i],job->flags);
  }

NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
  {
  if (job->type==SHARP_MAP2ALM)
    {
    job->s_m=2*job->nmaps;
    if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
    job->s_th=job->s_m*nm;
    }
  else
    {
    job->s_th=2*job->nmaps;
    if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
    job->s_m=job->s_th*ntheta;
    }
  job->phase=RALLOC(dcmplx,2*job->nmaps*nm*ntheta);
  }

static void dealloc_phase (sharp_job *job)
  { DEALLOC(job->phase); }

static void alloc_almtmp (sharp_job *job, int lmax)
  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); }

static void dealloc_almtmp (sharp_job *job)
  { DEALLOC(job->almtmp); }

NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
  {

#define COPY_LOOP(real_t, source_t, expr_of_x)                            \
  {                                                                       \
  for (int l=m; l<lmin; ++l)                                              \
    for (int i=0; i<job->nalm; ++i)                                       \
      job->almtmp[job->nalm*l+i] = 0;                                     \
  for (int l=lmin; l<=lmax; ++l)                                          \
    for (int i=0; i<job->nalm; ++i)                                       \
      {                                                                   \
      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride);   \
      job->almtmp[job->nalm*l+i] = expr_of_x;                             \
      }                                                                   \
  for (int i=0; i<job->nalm; ++i)                                         \
    job->almtmp[job->nalm*(lmax+1)+i] = 0;                                \
  }

  if (job->type!=SHARP_MAP2ALM)
    {
    ptrdiff_t ofs=job->ainfo->mvstart[mi];
    int stride=job->ainfo->stride;
    int m=job->ainfo->mval[mi];
    int lmin=(m<job->spin) ? job->spin : m;
    /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
       coefficients by sqrt_one_half; here we must compensate to avoid
       scaling m=0 */
    double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_two : 1.;
    if (!(job->ainfo->flags&SHARP_PACKED))
      ofs *= 2;
    if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0)))
      stride *= 2;
    if (job->spin==0)
      {
      if (m==0)
        {
        if (job->flags&SHARP_DP)
          COPY_LOOP(double, double, x*norm_m0)
        else
          COPY_LOOP(float, float, x*norm_m0)
        }
      else
        {
        if (job->flags&SHARP_DP)
          COPY_LOOP(double, dcmplx, x)
        else
          COPY_LOOP(float, fcmplx, x)
        }
      }
    else
      {
      if (m==0)
        {
        if (job->flags&SHARP_DP)
          COPY_LOOP(double, double, x*job->norm_l[l]*norm_m0)
        else
          COPY_LOOP(float, float, x*job->norm_l[l]*norm_m0)
        }
      else
        {
        if (job->flags&SHARP_DP)
          COPY_LOOP(double, dcmplx, x*job->norm_l[l])
        else
          COPY_LOOP(float, fcmplx, x*job->norm_l[l])
        }
      }
    }
  else
    memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
      job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));

#undef COPY_LOOP
  }
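/* Layout of job->almtmp, used by alm2almtmp above and almtmp2alm below: the
   coefficients of the current m for all nalm transforms are interleaved as
   almtmp[job->nalm*l + i] with l = m..lmax and i = 0..nalm-1, always in double
   precision irrespective of SHARP_DP.  Entries below lmin and one extra row at
   l = lmax+1 (see alloc_almtmp) are kept zero. */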
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
  {

#define COPY_LOOP(real_t, target_t, expr_of_x)                            \
  for (int l=lmin; l<=lmax; ++l)                                          \
    for (int i=0; i<job->nalm; ++i)                                       \
      {                                                                   \
      dcmplx x = job->almtmp[job->nalm*l+i];                              \
      *(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x;   \
      }

  if (job->type != SHARP_MAP2ALM) return;
  ptrdiff_t ofs=job->ainfo->mvstart[mi];
  int stride=job->ainfo->stride;
  int m=job->ainfo->mval[mi];
  int lmin=(m<job->spin) ? job->spin : m;
  /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
     coefficients by sqrt_two; here we must compensate to avoid scaling m=0 */
  double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_one_half : 1.;
  if (!(job->ainfo->flags&SHARP_PACKED))
    ofs *= 2;
  if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0)))
    stride *= 2;
  if (job->spin==0)
    {
    if (m==0)
      {
      if (job->flags&SHARP_DP)
        COPY_LOOP(double, double, creal(x)*norm_m0)
      else
        COPY_LOOP(float, float, crealf(x)*norm_m0)
      }
    else
      {
      if (job->flags&SHARP_DP)
        COPY_LOOP(double, dcmplx, x)
      else
        COPY_LOOP(float, fcmplx, (fcmplx)x)
      }
    }
  else
    {
    if (m==0)
      {
      if (job->flags&SHARP_DP)
        COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
      else
        COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
      }
    else
      {
      if (job->flags&SHARP_DP)
        COPY_LOOP(double, dcmplx, x*job->norm_l[l])
      else
        COPY_LOOP(float, fcmplx, (fcmplx)(x*job->norm_l[l]))
      }
    }

#undef COPY_LOOP
  }

NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
  const double *ringtmp, int rstride)
  {
  if (job->flags & SHARP_DP)
    {
    double **dmap = (double **)job->map;
    for (int i=0; i<job->nmaps; ++i)
      {
      double *restrict p1=&dmap[i][ri->ofs];
      const double *restrict p2=&ringtmp[i*rstride+1];
      if (ri->stride==1)
        {
        if (job->flags&SHARP_ADD)
          for (int m=0; m<ri->nph; ++m)
            p1[m] += p2[m];
        else
          memcpy(p1,p2,ri->nph*sizeof(double));
        }
      else
        for (int m=0; m<ri->nph; ++m)
          p1[m*ri->stride] += p2[m];
      }
    }
  else
    {
    float **fmap = (float **)job->map;
    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<ri->nph; ++m)
        fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
    }
  }

NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
  double *ringtmp, int rstride)
  {
  if (job->flags & SHARP_DP)
    for (int i=0; i<job->nmaps; ++i)
      {
      double *restrict p1=&ringtmp[i*rstride+1],
             *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
      if (ri->stride==1)
        memcpy(p1,p2,ri->nph*sizeof(double));
      else
        for (int m=0; m<ri->nph; ++m)
          p1[m] = p2[m*ri->stride];
      }
  else
    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<ri->nph; ++m)
        ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
  }

static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  dcmplx *phase)
  {
  if (ri->nph<0)
    {
    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<=mmax; ++m)
        phase[2*i+job->s_m*m]=0.;
    }
  else
    {
    UTIL_ASSERT(ri->nph==mmax+1,"bad ring size");
    double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
    if (job->flags&SHARP_REAL_HARMONICS)
      wgt *= sqrt_two;
    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<=mmax; ++m)
        phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
          ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
          ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
    }
  }

static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  dcmplx *phase)
  {
  if (ri->nph<0) return;
  UTIL_ASSERT(ri->nph==mmax+1,"bad ring size");
  dcmplx **dmap = (dcmplx **)job->map;
  fcmplx **fmap = (fcmplx **)job->map;
  double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
  if (job->flags&SHARP_REAL_HARMONICS)
    wgt *= sqrt_one_half;
  for (int i=0; i<job->nmaps; ++i)
    for (int m=0; m<=mmax; ++m)
      if (job->flags & SHARP_DP)
        dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m];
      else
        fmap[i][ri->ofs+m*ri->stride] += (fcmplx)(wgt*phase[2*i+job->s_m*m]);
  }

//FIXME: set phase to zero if not SHARP_MAP2ALM?
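/* Note on SHARP_NO_FFT (used via the *_direct helpers above): in this mode the
   "map" arrays are taken to hold, per ring, the mmax+1 complex ring
   coefficients instead of pixel values, so the FFT step is bypassed and the
   quadrature weight enters as nph*weight. */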
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type != SHARP_MAP2ALM) return;
  int pstride = job->s_m;
  if (job->flags & SHARP_NO_FFT)
    {
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2=job->s_th*(ith-llim);
      ring2phase_direct(job,&(job->ginfo->pair[ith].r1),mmax,
        &(job->phase[dim2]));
      ring2phase_direct(job,&(job->ginfo->pair[ith].r2),mmax,
        &(job->phase[dim2+1]));
      }
    }
  else
    {
#pragma omp parallel
{
    ringhelper helper;
    ringhelper_init(&helper);
    int rstride=job->ginfo->nphmax+2;
    double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2=job->s_th*(ith-llim);
      ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
      for (int i=0; i<job->nmaps; ++i)
        ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
      if (job->ginfo->pair[ith].r2.nph>0)
        {
        ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
        for (int i=0; i<job->nmaps; ++i)
          ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
        }
      }
    DEALLOC(ringtmp);
    ringhelper_destroy(&helper);
} /* end of parallel region */
    }
  }

NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type == SHARP_MAP2ALM) return;
  int pstride = job->s_m;
  if (job->flags & SHARP_NO_FFT)
    {
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2=job->s_th*(ith-llim);
      phase2ring_direct(job,&(job->ginfo->pair[ith].r1),mmax,
        &(job->phase[dim2]));
      phase2ring_direct(job,&(job->ginfo->pair[ith].r2),mmax,
        &(job->phase[dim2+1]));
      }
    }
  else
    {
#pragma omp parallel
{
    ringhelper helper;
    ringhelper_init(&helper);
    int rstride=job->ginfo->nphmax+2;
    double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2=job->s_th*(ith-llim);
      for (int i=0; i<job->nmaps; ++i)
        ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
      ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
      if (job->ginfo->pair[ith].r2.nph>0)
        {
        for (int i=0; i<job->nmaps; ++i)
          ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
        ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
        }
      }
    DEALLOC(ringtmp);
    ringhelper_destroy(&helper);
} /* end of parallel region */
    }
  }
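/* Layout of job->phase as used by map2phase/phase2map above: the block for
   ring pair ith of the current chunk starts at job->s_th*(ith-llim); within
   it, map i of the pair occupies offsets 2*i (northern ring) and 2*i+1
   (southern ring), and consecutive m values are separated by job->s_m (cf.
   alloc_phase, which pads these strides slightly to avoid critical strides). */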
NOINLINE static void sharp_execute_job (sharp_job *job)
  {
  double timer=wallTime();
  job->opcnt=0;
  int lmax = job->ainfo->lmax,
      mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);

  job->norm_l = (job->type==SHARP_ALM2MAP_DERIV1) ?
     sharp_Ylmgen_get_d1norm (lmax) :
     sharp_Ylmgen_get_norm (lmax, job->spin);

  /* clear output arrays if requested */
  init_output (job);

  int nchunks, chunksize;
  get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(job->spin),
                 &nchunks,&chunksize);
//FIXME: needs to be changed to "nm"
  alloc_phase (job,mmax+1,chunksize);

  /* chunk loop */
  for (int chunk=0; chunk<nchunks; ++chunk)
    {
    int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs);
    int *ispair = RALLOC(int,ulim-llim);
    int *mlim = RALLOC(int,ulim-llim);
    double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim);
    for (int i=0; i<ulim-llim; ++i)
      {
      ispair[i] = job->ginfo->pair[i+llim].r2.nph>0;
      cth[i] = job->ginfo->pair[i+llim].r1.cth;
      sth[i] = job->ginfo->pair[i+llim].r1.sth;
      mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]);
      }

    /* map->phase where necessary */
    map2phase (job, mmax, llim, ulim);

#pragma omp parallel
{
    sharp_job ljob = *job;
    ljob.opcnt=0;
    sharp_Ylmgen_C generator;
    sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
    alloc_almtmp(&ljob,lmax);

#pragma omp for schedule(dynamic,1)
    for (int mi=0; mi<job->ainfo->nm; ++mi)
      {
      /* alm->alm_tmp where necessary */
      alm2almtmp (&ljob, lmax, mi);

      inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);

      /* alm_tmp->alm where necessary */
      almtmp2alm (&ljob, lmax, mi);
      }

    sharp_Ylmgen_destroy(&generator);
    dealloc_almtmp(&ljob);

#pragma omp critical
    job->opcnt+=ljob.opcnt;
} /* end of parallel region */

    /* phase->map where necessary */
    phase2map (job, mmax, llim, ulim);

    DEALLOC(ispair);
    DEALLOC(mlim);
    DEALLOC(cth);
    DEALLOC(sth);
    } /* end of chunk loop */

  DEALLOC(job->norm_l);
  dealloc_phase (job);
  job->time=wallTime()-timer;
  }
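/* The function above is the core driver: rings are processed in chunks sized
   by get_chunk_info; per chunk, map2phase turns rings into phase data
   (analysis direction only), the OpenMP-parallel loop over m stages the
   coefficients with alm2almtmp/almtmp2alm and runs the Legendre recurrences in
   inner_loop (declared in sharp_internal.h), and phase2map writes rings back
   (synthesis direction only). */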
static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
  int spin, void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int flags)
  {
  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
  if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS;
  if (type==SHARP_Yt) type=SHARP_MAP2ALM;
  if (type==SHARP_WY) { type=SHARP_ALM2MAP; flags|=SHARP_USE_WEIGHTS; }

  UTIL_ASSERT((spin>=0)&&(spin<=alm_info->lmax), "bad spin");
  job->type = type;
  job->spin = spin;
  job->norm_l = NULL;
  job->nmaps = (type==SHARP_ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1);
  job->nalm = (type==SHARP_ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 2 : 1);
  job->ginfo = geom_info;
  job->ainfo = alm_info;
  job->flags = flags;
  if (alm_info->flags&SHARP_REAL_HARMONICS)
    job->flags|=SHARP_REAL_HARMONICS;
  job->time = 0.;
  job->opcnt = 0;
  job->alm=alm;
  job->map=map;
  }

void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
  int flags, double *time, unsigned long long *opcnt)
  {
  sharp_job job;
  sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
    flags);

  sharp_execute_job (&job);
  if (time!=NULL) *time = job.time;
  if (opcnt!=NULL) *opcnt = job.opcnt;
  }

void sharp_set_chunksize_min(int new_chunksize_min)
  { chunksize_min=new_chunksize_min; }

void sharp_set_nchunks_max(int new_nchunks_max)
  { nchunks_max=new_nchunks_max; }

#ifdef USE_MPI
#include "sharp_mpi.c"

int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  MPI_Comm comm = *(MPI_Comm*)pcomm;
  sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info,
    flags, time, opcnt);
  return 0;
  }

#else

int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  /* Suppress unused warning: */
  (void)pcomm; (void)type; (void)spin; (void)alm; (void)map; (void)geom_info;
  (void)alm_info; (void)flags; (void)time; (void)opcnt;
  return SHARP_ERROR_NO_MPI;
  }

#endif
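/* Usage sketch (illustrative only, not part of the library): a double-
   precision, spin-0 alm->map synthesis.  It assumes that "ginfo" and "ainfo"
   have already been built (e.g. with sharp_make_geom_info/sharp_make_alm_info
   above, or with the helpers from sharp_geomhelpers.h/sharp_almhelpers.h) and
   that the arrays have the sizes reported by sharp_map_size/sharp_alm_count:

     sharp_geom_info *ginfo = ...;   // pixelisation of the sphere
     sharp_alm_info  *ainfo = ...;   // a_lm storage layout
     double complex  *alm   = ...;   // sharp_alm_count(ainfo) coefficients
     double          *map   = ...;   // sharp_map_size(ginfo) pixels
     void *almptr[1] = { alm };      // one component each for spin 0
     void *mapptr[1] = { map };
     double time;
     unsigned long long opcnt;
     sharp_execute (SHARP_ALM2MAP, 0, almptr, mapptr, ginfo, ainfo,
                    SHARP_DP, &time, &opcnt);

   For spin>0 transforms, almptr/mapptr hold two components each; SHARP_MAP2ALM
   runs the analysis direction, with quadrature weights taken from the
   geometry. */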