diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c index d7c3624..06b9285 100644 --- a/libsharp/sharp_core_inc0.c +++ b/libsharp/sharp_core_inc0.c @@ -82,22 +82,22 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c switch (nv) { case 0x1: - CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x2: - CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x3: - CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x4: - CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x5: - CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; case 0x6: - CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); return; } UTIL_FAIL("Incorrect vector parameters"); diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index 017df07..de2924f 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -32,149 +32,89 @@ NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, - int l, int lmax NJ1) + int l, int lmax) { -if (njobs>1) - { - while (lr.v[i] += lam_2.v[i]*ar; + p1->i.v[i] += lam_2.v[i]*ai; } + } for (int i=0; ir.v[i] += lam_1.v[i]*ar; + p2->i.v[i] += lam_1.v[i]*ai; + } } l+=2; } if (l==lmax) { - for (int j=0; jr.v[i] += lam_2.v[i]*ar; + p1->i.v[i] += lam_2.v[i]*ai; } } } NOINLINE static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp - NJ1) + const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) { while (lr.v[i]); + vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); + } for (int i=0; ir.v[i]); + vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); + } l+=2; } if (l==lmax) - { - for (int j=0; jr.v[i]; + atmp[2*l+1] += lam_2.v[i]*p1->i.v[i]; + } } NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2 NJ1) + Y(Tbri) * restrict p2) { int l,lmax=gen->lmax; Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; - job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; Tb corfac; Y(getCorfac)(scale,&corfac,gen->cf); @@ -183,30 +123,28 @@ NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth, int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { - for (int j=0; jr.v[i],tmp,ar); + vfmaeq(p1->i.v[i],tmp,ai); } + } if (++l>lmax) break; Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); for (int i=0; ir.v[i],tmp,ar); + vfmaeq(p2->i.v[i],tmp,ai); } + } if (++l>lmax) break; r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); for (int i=0; ilmax) return; Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); + Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); } NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1) + const Y(Tbri) * restrict p2, Tv *restrict atmp) { int lmax=gen->lmax; Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; @@ -233,7 +171,7 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; - job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; const sharp_ylmgen_dbl2 * restrict rf = gen->rf; Tb corfac; @@ -241,24 +179,22 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { - for (int j=0; jr.v[i]; + atmp[2*l+1]+=tmp*p1->i.v[i]; + } if (++l>lmax) return; for (int i=0; ir.v[i]; + atmp[2*l+1]+=tmp*p2->i.v[i]; + } if (++l>lmax) return; for (int i=0; iqr.v[i],agr,lw); + vfmaeq(px->qi.v[i],agi,lw); + vfmaeq(px->ur.v[i],acr,lw); + vfmaeq(px->ui.v[i],aci,lw); + } + for (int i=0; iqr.v[i],aci,lx); + vfmaeq(py->qi.v[i],acr,lx); + vfmaeq(py->ur.v[i],agi,lx); + vfmseq(py->ui.v[i],agr,lx); } } static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, - const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1) + const dcmplx * restrict alm1, const dcmplx * restrict alm2) { - for (int j=0; jqr.v[i],agr1,lw1,aci2,lx2); + vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2); + vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2); + vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2); + } + for (int i=0; iqr.v[i],agr2,lw2,aci1,lx1); + vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1); + vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1); + vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1); } } static inline void Z(saddstep2) (const Y(Tbqu) * restrict px, const Y(Tbqu) * restrict py, const Tb * restrict rxp, - const Tb * restrict rxm, dcmplx * restrict alm NJ1) + const Tb * restrict rxm, dcmplx * restrict alm) { - for (int j=0; jv[i],rxm->v[i]); - vfmaeq(agr,px[j].qr.v[i],lw); - vfmaeq(agi,px[j].qi.v[i],lw); - vfmaeq(acr,px[j].ur.v[i],lw); - vfmaeq(aci,px[j].ui.v[i],lw); - } - for (int i=0; iv[i],rxp->v[i]); - vfmseq(agr,py[j].ui.v[i],lx); - vfmaeq(agi,py[j].ur.v[i],lx); - vfmaeq(acr,py[j].qi.v[i],lx); - vfmseq(aci,py[j].qr.v[i],lx); - } - vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]); + Tv lw=vadd(rxp->v[i],rxm->v[i]); + vfmaeq(agr,px->qr.v[i],lw); + vfmaeq(agi,px->qi.v[i],lw); + vfmaeq(acr,px->ur.v[i],lw); + vfmaeq(aci,px->ui.v[i],lw); } + for (int i=0; iv[i],rxp->v[i]); + vfmseq(agr,py->ui.v[i],lx); + vfmaeq(agi,py->ur.v[i],lx); + vfmaeq(acr,py->qi.v[i],lx); + vfmseq(aci,py->qr.v[i],lx); + } + vhsum_cmplx_special(agr,agi,acr,aci,alm); } NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax NJ1) + int lmax) { while (llmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; @@ -430,7 +356,7 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; - job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + job->opcnt += (lmax+1-l) * 28*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; @@ -442,11 +368,11 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, while (!full_ieee) { Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[2*njobs*l] NJ2); + &alm[2*l]); if (++l>lmax) break; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[2*njobs*l] NJ2); + &alm[2*l]); if (++l>lmax) break; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -463,12 +389,12 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax NJ2); + lmax); } NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, const sharp_Ylmgen_C * restrict gen, sharp_job *job, - const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1) + const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2) { int l, lmax=gen->lmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; @@ -476,7 +402,7 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; - job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + job->opcnt += (lmax+1-l) * 28*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; @@ -488,11 +414,11 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, while (!full_ieee) { Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); - Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2); + Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]); if (++l>lmax) return; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); - Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2); + Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]); if (++l>lmax) return; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -506,34 +432,31 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2); + Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); } static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1) + const Tb rxp, const Tb rxm, const dcmplx * restrict alm) { - for (int j=0; jqr.v[i],ar,lw); + vfmaeq(px->qi.v[i],ai,lw); + } + for (int i=0; iur.v[i],ai,lx); + vfmseq(py->ui.v[i],ar,lx); } } NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax NJ1) + int lmax) { while (llmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; @@ -573,7 +496,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; - job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; + job->opcnt += (lmax+1-l) * 20*VLEN*nvec; const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; @@ -585,11 +508,11 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, while (!full_ieee) { Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[njobs*l] NJ2); + &alm[l]); if (++l>lmax) break; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[njobs*l] NJ2); + &alm[l]); if (++l>lmax) break; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -606,7 +529,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax NJ2); + lmax); } @@ -614,7 +537,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) + sharp_Ylmgen_C *gen, int mi, const int *mlim) { const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; @@ -629,7 +552,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, { for (int ith=0; iths_th + mi*job->s_m + 2*j; - complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I, - r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I; - job->phase[phas_idx] = r1+r2; - if (ispair[itot]) - job->phase[phas_idx+1] = r1-r2; - } + int phas_idx = itot*job->s_th + mi*job->s_m; + complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, + r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[itot]) + job->phase[phas_idx+1] = r1-r2; } } } @@ -665,7 +585,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, { for (int ith=0; ithtype==SHARP_ALM2MAP) ? Z(calc_alm2map_spin ) - (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) : + (cth.b,sth.b,gen,job,&p1.b,&p2.b) : Z(calc_alm2map_deriv1) - (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2); + (cth.b,sth.b,gen,job,&p1.b,&p2.b); for (int i=0; is_th + mi*job->s_m; + complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I, + q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I, + u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I, + u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I; + job->phase[phas_idx] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[itot]) { - int phas_idx = itot*job->s_th + mi*job->s_m + 4*j; - complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I, - q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I, - u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I, - u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I; - job->phase[phas_idx] = q1+q2; - job->phase[phas_idx+2] = u1+u2; - if (ispair[itot]) - { - dcmplx *phQ = &(job->phase[phas_idx+1]), - *phU = &(job->phase[phas_idx+3]); - *phQ = q1-q2; - *phU = u1-u2; - if ((gen->mhi-gen->m+gen->s)&1) - { *phQ=-(*phQ); *phU=-(*phU); } - } + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + *phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } } } } @@ -723,7 +640,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) + sharp_Ylmgen_C *gen, int mi, const int *mlim) { const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; @@ -735,11 +652,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, { if (job->spin==0) { - Tv atmp[2*njobs*(gen->lmax+1)]; - memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv)); + Tv atmp[2*(gen->lmax+1)]; + memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv)); for (int ith=0; ith=m)) { - for (int j=0; js_th + mi*job->s_m + 2*j; - dcmplx ph1=job->phase[phas_idx]; - dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; - p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2); - p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2); - } + int phas_idx = itot*job->s_th + mi*job->s_m; + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; + p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2); + p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2); } } if (!skip) - Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2); + Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp); } { - int istart=m*njobs, istop=(gen->lmax+1)*njobs; + int istart=m, istop=gen->lmax+1; for(; istartalmtmp[istart])); for(; istarts_th + mi*job->s_m + 4*j; - dcmplx p1Q=job->phase[phas_idx], - p1U=job->phase[phas_idx+2], - p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., - p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; - if ((gen->mhi-gen->m+gen->s)&1) - { p2Q=-p2Q; p2U=-p2U; } - p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q); - p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U); - p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q); - p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U); - } + int phas_idx = itot*job->s_th + mi*job->s_m; + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., + p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q); + p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U); + p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q); + p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U); } } if (!skip) - Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2); + Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b); } } break; @@ -820,11 +731,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, static void Z(inner_loop) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) + sharp_Ylmgen_C *gen, int mi, const int *mlim) { (job->type==SHARP_MAP2ALM) ? - Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) : - Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2); + Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : + Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); } #undef VZERO diff --git a/libsharp/sharp_core_inchelper.c b/libsharp/sharp_core_inchelper.c index 89d79cd..c58cecc 100644 --- a/libsharp/sharp_core_inchelper.c +++ b/libsharp/sharp_core_inchelper.c @@ -2,69 +2,9 @@ #define Y(arg) CONCAT2(arg,nvec) #include "sharp_core_inc.c" -#if (SHARP_MAXTRANS>MAXJOB_SPECIAL) -#define NJ1 , int njobs -#define NJ2 , njobs #define Z(arg) CONCAT2(arg,nvec) #include "sharp_core_inc2.c" #undef Z -#undef NJ1 -#undef NJ2 -#endif - -#define NJ1 -#define NJ2 - -#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1)) -#define njobs 1 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2)) -#define njobs 2 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3)) -#define njobs 3 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4)) -#define njobs 4 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5)) -#define njobs 5 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6)) -#define njobs 6 -#define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" -#undef Z -#undef njobs -#endif - -#undef NJ1 -#undef NJ2 #undef Y #undef Tb