diff --git a/libsharp/sharp.c b/libsharp/sharp.c index 5dee3f7..913b5de 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -55,16 +55,17 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) *nchunks = (ndata+*chunksize-1) / *chunksize; } -typedef struct +static int sharp_get_mlim (int lmax, int spin, double sth, double cth, + double ofs) { - double s; - int i; - } idxhelper; - -static int idx_compare (const void *xa, const void *xb) - { - const idxhelper *a=xa, *b=xb; - return (a->s > b->s) ? -1 : (a->s < b->s) ? 1 : 0; + double b = -2*spin*fabs(cth); + double t1 = lmax*sth+ofs; + double c = spin*spin-t1*t1; + double discr = b*b-4*c; + if (discr<=0) return lmax; + double res=(-b+sqrt(discr))/2.; + if (res>lmax) res=lmax; + return (int)(res+0.5); } typedef struct @@ -249,38 +250,34 @@ static void ringhelper_phase2ring (ringhelper *self, ringhelper_update (self, nph, mmax, info->phi0); self->work[0]=phase[0]; - SET_ARRAY(self->work,1,nph,0.); -#if 0 - if (self->norot) + if (nph>=2*mmax+1) + { for (int m=1; m<=mmax; ++m) { - int idx1 = m%nph; - int idx2 = nph-1-((m-1)%nph); - self->work[idx1]+=phase[m*pstride]; - self->work[idx2]+=conj(phase[m*pstride]); + dcmplx tmp = phase[m*pstride]; + if(!self->norot) tmp*=self->shiftarr[m]; + self->work[m]=tmp; + self->work[nph-m]=conj(tmp); } + for (int m=mmax+1; m<(nph-mmax); ++m) + self->work[m]=0.; + } else + { + SET_ARRAY(self->work,1,nph,0.); + + int idx1=1, idx2=nph-1; for (int m=1; m<=mmax; ++m) { - int idx1 = m%nph; - int idx2 = nph-1-((m-1)%nph); - dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; + dcmplx tmp = phase[m*pstride]; + if(!self->norot) tmp*=self->shiftarr[m]; self->work[idx1]+=tmp; self->work[idx2]+=conj(tmp); + if (++idx1>=nph) idx1=0; + if (--idx2<0) idx2=nph-1; } -#else - int idx1=1, idx2=nph-1; - for (int m=1; m<=mmax; ++m) - { - dcmplx tmp = phase[m*pstride]; - if(!self->norot) tmp*=self->shiftarr[m]; - self->work[idx1]+=tmp; - self->work[idx2]+=conj(tmp); - if (++idx1>=nph) idx1=0; - if (--idx2<0) idx2=nph-1; } -#endif real_plan_backward_c (self->plan, (double *)(self->work)); double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1.; if (flags&SHARP_REAL_HARMONICS) @@ -317,12 +314,24 @@ static void ringhelper_ring2phase (ringhelper *self, real_plan_forward_c (self->plan, (double *)self->work); - if (self->norot) - for (int m=0; m<=maxidx; ++m) - phase[m*pstride] = self->work[m%nph]; + if (maxidxnorot) + for (int m=0; m<=maxidx; ++m) + phase[m*pstride] = self->work[m]; + else + for (int m=0; m<=maxidx; ++m) + phase[m*pstride]=self->work[m]*self->shiftarr[m]; + } else - for (int m=0; m<=maxidx; ++m) - phase[m*pstride]=self->work[m%nph]*self->shiftarr[m]; + { + if (self->norot) + for (int m=0; m<=maxidx; ++m) + phase[m*pstride] = self->work[m%nph]; + else + for (int m=0; m<=maxidx; ++m) + phase[m*pstride]=self->work[m%nph]*self->shiftarr[m]; + } for (int m=maxidx+1;m<=mmax; ++m) phase[m*pstride]=0.; @@ -622,21 +631,15 @@ static void sharp_execute_job (sharp_job *job) { int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs); int *ispair = RALLOC(int,ulim-llim); + int *mlim = RALLOC(int,ulim-llim); double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim); - idxhelper *stmp = RALLOC(idxhelper,ulim-llim); for (int i=0; iginfo->pair[i+llim].r2.nph>0; cth[i] = job->ginfo->pair[i+llim].r1.cth; sth[i] = job->ginfo->pair[i+llim].r1.sth; - stmp[i].s=sth[i]; - stmp[i].i=i; + mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i], 100.); } - qsort (stmp,ulim-llim,sizeof(idxhelper),idx_compare); - int *idx = RALLOC(int,ulim-llim); - for (int i=0; iphase where necessary */ map2phase (job, mmax, llim, ulim); @@ -655,7 +658,7 @@ static void sharp_execute_job (sharp_job *job) /* alm->alm_tmp where necessary */ alm2almtmp (&ljob, lmax, mi); - inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, idx); + inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); /* alm_tmp->alm where necessary */ almtmp2alm (&ljob, lmax, mi); @@ -672,9 +675,9 @@ static void sharp_execute_job (sharp_job *job) phase2map (job, mmax, llim, ulim); DEALLOC(ispair); + DEALLOC(mlim); DEALLOC(cth); DEALLOC(sth); - DEALLOC(idx); } /* end of chunk loop */ DEALLOC(job->norm_l); @@ -693,7 +696,7 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, if (type==SHARP_Yt) type=SHARP_MAP2ALM; if (type==SHARP_WY) { type=SHARP_ALM2MAP; flags|=SHARP_USE_WEIGHTS; } - UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); + UTIL_ASSERT((spin>=0)&&(spin<=alm_info->lmax), "bad spin"); job->type = type; job->spin = spin; job->norm_l = NULL; @@ -801,7 +804,7 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans) if (type==SHARP_ALM2MAP_DERIV1) spin=1; UTIL_ASSERT(type<5,"bad type"); UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms"); - UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); + UTIL_ASSERT(spin>=0, "bad spin"); ntrans=IMIN(ntrans,maxtr); if (nv_opt[ntrans-1][spin!=0][type]==0) diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 9931859..4f07d00 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -75,7 +75,7 @@ typedef complex double dcmplx; void inner_loop (sharp_job *job, const int *ispair,const double *cth, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, - const int *idx) + const int *mlim) { int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX; if (njobs<=MAXJOB_SPECIAL) @@ -84,122 +84,122 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth, { #if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1)) case 0x11: - CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x12: - CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x13: - CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x14: - CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x15: - CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x16: - CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif #if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2)) case 0x21: - CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x22: - CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x23: - CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x24: - CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x25: - CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x26: - CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif #if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3)) case 0x31: - CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x32: - CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x33: - CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x34: - CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x35: - CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x36: - CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif #if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4)) case 0x41: - CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x42: - CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x43: - CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x44: - CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x45: - CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x46: - CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif #if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5)) case 0x51: - CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x52: - CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x53: - CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x54: - CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x55: - CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x56: - CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif #if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6)) case 0x61: - CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x62: - CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x63: - CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x64: - CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x65: - CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x66: - CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; #endif } @@ -211,27 +211,27 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth, { case 1: CONCAT2(inner_loop,1) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; case 2: CONCAT2(inner_loop,2) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; case 3: CONCAT2(inner_loop,3) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; case 4: CONCAT2(inner_loop,4) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; case 5: CONCAT2(inner_loop,5) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; case 6: CONCAT2(inner_loop,6) - (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); return; } } diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h index 17b5881..d5dcded 100644 --- a/libsharp/sharp_core.h +++ b/libsharp/sharp_core.h @@ -41,7 +41,7 @@ extern "C" { void inner_loop (sharp_job *job, const int *ispair,const double *cth, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, - const int *idx); + const int *mlim); #ifdef __cplusplus } diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index 63d2e4a..baa2186 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -163,13 +163,13 @@ static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, static void Z(calc_alm2map) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2 NJ1, int *done) + Y(Tbri) * restrict p2 NJ1) { int l,lmax=gen->lmax; Tb lam_1,lam_2,scale; Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) { *done=1; return; } + if (l>lmax) return; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; Tb corfac; @@ -213,7 +213,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, full_ieee = Y(TballGe)(scale,sharp_minscale); } } - if (l>lmax) { *done=1; return; } + if (l>lmax) return; Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); @@ -221,14 +221,14 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, static void Z(calc_map2alm) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2 NJ1, int *done) + const Y(Tbri) * restrict p2 NJ1) { int lmax=gen->lmax; Tb lam_1,lam_2,scale; int l=gen->m; Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) { *done=1; return; } + if (l>lmax) return; job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; const sharp_ylmgen_dbl2 * restrict rf = gen->rf; @@ -249,7 +249,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth, } alm[l*njobs+j]+=vhsum_cmplx(tre,tim); } - if (++l>lmax) { *done=1; return; } + if (++l>lmax) return; Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); for (int i=0; ilmax) { *done=1; return; } + if (++l>lmax) return; r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); for (int i=0; i1) Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], &alm[2*njobs*(l+1)] NJ2); -#else - Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2); - Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)] NJ2); -#endif -#else - Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], - &alm[2*njobs*(l+1)] NJ2); -#endif fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx2=vload(fx[l+2].f[2]); for (int i=0; ilmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) - { *done=1; return; } + if (l>lmax) return; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; @@ -477,8 +465,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, } } - if (l>lmax) - { *done=1; return; } + if (l>lmax) return; Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); @@ -487,14 +474,13 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, } static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, - sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 - NJ1, int *done) + sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1) { int l, lmax=gen->lmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) { *done=1; return; } + if (l>lmax) return; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; @@ -508,11 +494,11 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, { Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2); - if (++l>lmax) { *done=1; return; } + if (++l>lmax) return; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2); - if (++l>lmax) { *done=1; return; } + if (++l>lmax) return; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) { @@ -583,15 +569,13 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, } static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, - sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1, - int *done) + sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1) { int l, lmax=gen->lmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) - { *done=1; return; } + if (l>lmax) return; job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; @@ -620,8 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, } } - if (l>lmax) - { *done=1; return; } + if (l>lmax) return; Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); @@ -634,7 +617,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, static void Z(inner_loop) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *idx NJ1) + sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) { const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; @@ -647,30 +630,27 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { if (job->spin==0) { - int done=0; for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - } - Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done); + int skip=1; + for (int i=0; i=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; } + if (!skip) + Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2); for (int i=0; iainfo->nm+mi)); @@ -686,34 +666,31 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, } else { - int done=0; for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; - } + for (int i=0; i=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; + } + if (!skip) (job->type==SHARP_ALM2MAP) ? Z(calc_alm2map_spin ) - (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done) : + (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2) : Z(calc_alm2map_deriv1) - (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done); - } + (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2); for (int i=0; iainfo->nm+mi)); @@ -743,19 +720,19 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { if (job->spin==0) { - int done=0; - for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; + if (mlim[itot]>=m) skip=0; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - if (i+ith=m)) { for (int j=0; j=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; + if (mlim[itot]>=m) skip=0; cth.s[i]=cth_[itot]; if (i+ith