diff --git a/Makefile.am b/Makefile.am index 5cd60d4..7a035ba 100644 --- a/Makefile.am +++ b/Makefile.am @@ -34,9 +34,7 @@ include_HEADERS = \ EXTRA_DIST = \ libsharp/sharp_core_inc0.c \ - libsharp/sharp_core_inc.c \ - libsharp/sharp_core_inc2.c \ - libsharp/sharp_core_inchelper.c + libsharp/sharp_core_inc.c libsharp_la_SOURCES = $(src_sharp) diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index a15e35a..60fbc6f 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -315,3 +315,671 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_, *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep; *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem; } + + +NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, + Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, + const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, + int l, int lmax) + { + while (l<=lmax) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), + f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); + for (int i=0; ir.v[i] += lam_2.v[i]*ar1; + p1->i.v[i] += lam_2.v[i]*ai1; + lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; + p2->r.v[i] += lam_1.v[i]*ar2; + p2->i.v[i] += lam_1.v[i]*ai2; + } + l+=2; + } + } + +NOINLINE static void Y(map2alm_kernel) (const Tb cth, + const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, + const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) + { + while (l<=lmax) + { + Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), + f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); + for (int i=0; ir.v[i]); + vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); + lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; + vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]); + vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); + } + l+=2; + } + } + +NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, + Y(Tbri) * restrict p2) + { + int l,lmax=gen->lmax; + Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; + + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + const sharp_ylmgen_dbl2 * restrict rf = gen->rf; + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGe)(scale,sharp_minscale); + while (!full_ieee) + { + { + Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); + for (int i=0; ir.v[i],tmp,ar); + vfmaeq(p1->i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ir.v[i],tmp,ar); + vfmaeq(p2->i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = Y(TballGe)(scale,sharp_minscale); + } + } + if (l>lmax) return; + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); + } + +NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, + const Y(Tbri) * restrict p2, Tv *restrict atmp) + { + int lmax=gen->lmax; + Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; + int l=gen->m; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; + + const sharp_ylmgen_dbl2 * restrict rf = gen->rf; + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + int full_ieee = Y(TballGe)(scale,sharp_minscale); + while (!full_ieee) + { + for (int i=0; ir.v[i]; + atmp[2*l+1]+=tmp*p1->i.v[i]; + } + if (++l>lmax) return; + for (int i=0; ir.v[i]; + atmp[2*l+1]+=tmp*p2->i.v[i]; + } + if (++l>lmax) return; + for (int i=0; icf); + full_ieee = Y(TballGe)(scale,sharp_minscale); + } + } + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); + } + +static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm) + { + Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])), + acr=vload(creal(alm[1])), aci=vload(cimag(alm[1])); + for (int i=0; iqr.v[i],agr,lw); + vfmaeq(px->qi.v[i],agi,lw); + vfmaeq(px->ur.v[i],acr,lw); + vfmaeq(px->ui.v[i],aci,lw); + Tv lx=vsub(rxm.v[i],rxp.v[i]); + vfmseq(py->qr.v[i],aci,lx); + vfmaeq(py->qi.v[i],acr,lx); + vfmaeq(py->ur.v[i],agi,lx); + vfmseq(py->ui.v[i],agr,lx); + } + } + +static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, + const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, + const dcmplx * restrict alm1, const dcmplx * restrict alm2) + { + Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])), + acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1])); + Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])), + acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1])); + for (int i=0; iqr.v[i],agr1,lw1,aci2,lx2); + vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2); + vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2); + vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2); + Tv lx1=r2m.v[i]-r2p.v[i]; + Tv lw2=r1p.v[i]+r1m.v[i]; + vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1); + vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1); + vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1); + vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1); + } + } + +static inline void Y(saddstep2) (const Y(Tbqu) * restrict px, + const Y(Tbqu) * restrict py, const Tb * restrict rxp, + const Tb * restrict rxm, dcmplx * restrict alm) + { + Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero; + for (int i=0; iv[i],rxm->v[i]); + vfmaeq(agr,px->qr.v[i],lw); + vfmaeq(agi,px->qi.v[i],lw); + vfmaeq(acr,px->ur.v[i],lw); + vfmaeq(aci,px->ui.v[i],lw); + Tv lx=vsub(rxm->v[i],rxp->v[i]); + vfmseq(agr,py->ui.v[i],lx); + vfmaeq(agi,py->ur.v[i],lx); + vfmaeq(acr,py->qi.v[i],lx); + vfmseq(aci,py->qr.v[i],lx); + } + vhsum_cmplx_special(agr,agi,acr,aci,alm); + } + +NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, + Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, + const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, + int lmax) + { + while (llmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) + (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 28*VLEN*nvec; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + while (!full_ieee) + { + Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + &alm[2*l]); + if (++l>lmax) break; + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + &alm[2*l]); + if (++l>lmax) break; + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + } + } + + if (l>lmax) return; + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + lmax); + } + +NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth, + const sharp_Ylmgen_C * restrict gen, sharp_job *job, + const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2) + { + int l, lmax=gen->lmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) + (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 28*VLEN*nvec; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + while (!full_ieee) + { + Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); + Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]); + if (++l>lmax) return; + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); + Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]); + if (++l>lmax) return; + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + } + } + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); + } + +static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm) + { + Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0])); + for (int i=0; iqr.v[i],ar,lw); + vfmaeq(px->qi.v[i],ai,lw); + Tv lx=vsub(rxm.v[i],rxp.v[i]); + vfmaeq(py->ur.v[i],ai,lx); + vfmseq(py->ui.v[i],ar,lx); + } + } + +NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, + Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, + const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, + int lmax) + { + while (llmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) + (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 20*VLEN*nvec; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + while (!full_ieee) + { + Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + &alm[l]); + if (++l>lmax) break; + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + &alm[l]); + if (++l>lmax) break; + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGe)(scalep,sharp_minscale) + && Y(TballGe)(scalem,sharp_minscale); + } + } + + if (l>lmax) return; + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + lmax); + } + + +#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) + +NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_ALM2MAP: + case SHARP_ALM2MAP_DERIV1: + { + if (job->spin==0) + { + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + } + if (!skip) + Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b); + + for (int i=0; is_th + mi*job->s_m; + complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, + r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[itot]) + job->phase[phas_idx+1] = r1-r2; + } + } + } + } + else + { + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + } + if (!skip) + (job->type==SHARP_ALM2MAP) ? + Y(calc_alm2map_spin ) + (cth.b,sth.b,gen,job,&p1.b,&p2.b) : + Y(calc_alm2map_deriv1) + (cth.b,sth.b,gen,job,&p1.b,&p2.b); + + for (int i=0; is_th + mi*job->s_m; + complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I, + q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I, + u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I, + u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I; + job->phase[phas_idx] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[itot]) + { + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + *phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } + } + } + } + } + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_MAP2ALM: + { + if (job->spin==0) + { + Tv atmp[2*(gen->lmax+2)]; + memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv)); + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + if ((i+ith=m)) + { + int phas_idx = itot*job->s_th + mi*job->s_m; + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; + p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2); + p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2); + } + } + if (!skip) + Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp); + } + { + int istart=m, istop=gen->lmax+1; + for(; istartalmtmp[istart])); + for(; istartalmtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]); + } + } + else + { + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + if (i+iths_th + mi*job->s_m; + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., + p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q); + p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U); + p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q); + p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U); + } + } + if (!skip) + Y(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b); + } + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +static void Y(inner_loop) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + (job->type==SHARP_MAP2ALM) ? + Y(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : + Y(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); + } + +#undef VZERO diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c index 15190d7..1139ef8 100644 --- a/libsharp/sharp_core_inc0.c +++ b/libsharp/sharp_core_inc0.c @@ -46,12 +46,17 @@ typedef complex double dcmplx; #define CONCAT2(a,b) XCONCAT2(a,b) #define nvec 6 -#include "sharp_core_inchelper.c" +#define Tb CONCAT2(Tb,nvec) +#define Y(arg) CONCAT2(arg,nvec) +#include "sharp_core_inc.c" + +#undef Y +#undef Tb #undef nvec void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + inner_loop_6(job, ispair,cth,sth,llim,ulim,gen,mi,mlim); } diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c deleted file mode 100644 index 364eef5..0000000 --- a/libsharp/sharp_core_inc2.c +++ /dev/null @@ -1,697 +0,0 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ - -/*! \file sharp_core_inc2.c - * Type-dependent code for the computational core - * - * Copyright (C) 2012-2017 Max-Planck-Society - * \author Martin Reinecke - */ - -NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, - int l, int lmax) - { - while (l<=lmax) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), - f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); - for (int i=0; ir.v[i] += lam_2.v[i]*ar1; - p1->i.v[i] += lam_2.v[i]*ai1; - lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; - p2->r.v[i] += lam_1.v[i]*ar2; - p2->i.v[i] += lam_1.v[i]*ai2; - } - l+=2; - } - } - -NOINLINE static void Z(map2alm_kernel) (const Tb cth, - const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) - { - while (l<=lmax) - { - Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), - f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); - for (int i=0; ir.v[i]); - vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); - lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; - vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]); - vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); - } - l+=2; - } - } - -NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2) - { - int l,lmax=gen->lmax; - Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 8*VLEN*nvec; - - Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scale,sharp_minscale); - while (!full_ieee) - { - { - Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); - for (int i=0; ir.v[i],tmp,ar); - vfmaeq(p1->i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); - for (int i=0; ir.v[i],tmp,ar); - vfmaeq(p2->i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); - for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); - } - } - if (l>lmax) return; - - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); - } - -NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2, Tv *restrict atmp) - { - int lmax=gen->lmax; - Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; - int l=gen->m; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 8*VLEN*nvec; - - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); - int full_ieee = Y(TballGe)(scale,sharp_minscale); - while (!full_ieee) - { - for (int i=0; ir.v[i]; - atmp[2*l+1]+=tmp*p1->i.v[i]; - } - if (++l>lmax) return; - for (int i=0; ir.v[i]; - atmp[2*l+1]+=tmp*p2->i.v[i]; - } - if (++l>lmax) return; - for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); - } - } - - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); - } - -static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm) - { - Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])), - acr=vload(creal(alm[1])), aci=vload(cimag(alm[1])); - for (int i=0; iqr.v[i],agr,lw); - vfmaeq(px->qi.v[i],agi,lw); - vfmaeq(px->ur.v[i],acr,lw); - vfmaeq(px->ui.v[i],aci,lw); - Tv lx=vsub(rxm.v[i],rxp.v[i]); - vfmseq(py->qr.v[i],aci,lx); - vfmaeq(py->qi.v[i],acr,lx); - vfmaeq(py->ur.v[i],agi,lx); - vfmseq(py->ui.v[i],agr,lx); - } - } - -static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, - const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, - const dcmplx * restrict alm1, const dcmplx * restrict alm2) - { - Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])), - acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1])); - Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])), - acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1])); - for (int i=0; iqr.v[i],agr1,lw1,aci2,lx2); - vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2); - vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2); - vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2); - Tv lx1=r2m.v[i]-r2p.v[i]; - Tv lw2=r1p.v[i]+r1m.v[i]; - vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1); - vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1); - vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1); - vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1); - } - } - -static inline void Z(saddstep2) (const Y(Tbqu) * restrict px, - const Y(Tbqu) * restrict py, const Tb * restrict rxp, - const Tb * restrict rxm, dcmplx * restrict alm) - { - Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero; - for (int i=0; iv[i],rxm->v[i]); - vfmaeq(agr,px->qr.v[i],lw); - vfmaeq(agi,px->qi.v[i],lw); - vfmaeq(acr,px->ur.v[i],lw); - vfmaeq(aci,px->ui.v[i],lw); - Tv lx=vsub(rxm->v[i],rxp->v[i]); - vfmseq(agr,py->ui.v[i],lx); - vfmaeq(agi,py->ur.v[i],lx); - vfmaeq(acr,py->qi.v[i],lx); - vfmseq(aci,py->qr.v[i],lx); - } - vhsum_cmplx_special(agr,agi,acr,aci,alm); - } - -NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, - Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, - const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax) - { - while (llmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) - (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 28*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[2*l]); - if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[2*l]); - if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - if (l>lmax) return; - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax); - } - -NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, - const sharp_Ylmgen_C * restrict gen, sharp_job *job, - const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2) - { - int l, lmax=gen->lmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) - (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 28*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); - Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]); - if (++l>lmax) return; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); - Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]); - if (++l>lmax) return; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); - } - -static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm) - { - Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0])); - for (int i=0; iqr.v[i],ar,lw); - vfmaeq(px->qi.v[i],ai,lw); - Tv lx=vsub(rxm.v[i],rxp.v[i]); - vfmaeq(py->ur.v[i],ai,lx); - vfmseq(py->ui.v[i],ar,lx); - } - } - -NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, - Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, - const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax) - { - while (llmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) - (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 20*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[l]); - if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[l]); - if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - if (l>lmax) return; - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax); - } - - -#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) - -NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - const int nval=nvec*VLEN; - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_ALM2MAP: - case SHARP_ALM2MAP_DERIV1: - { - if (job->spin==0) - { - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - } - if (!skip) - Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b); - - for (int i=0; is_th + mi*job->s_m; - complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, - r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; - job->phase[phas_idx] = r1+r2; - if (ispair[itot]) - job->phase[phas_idx+1] = r1-r2; - } - } - } - } - else - { - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - } - if (!skip) - (job->type==SHARP_ALM2MAP) ? - Z(calc_alm2map_spin ) - (cth.b,sth.b,gen,job,&p1.b,&p2.b) : - Z(calc_alm2map_deriv1) - (cth.b,sth.b,gen,job,&p1.b,&p2.b); - - for (int i=0; is_th + mi*job->s_m; - complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I, - q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I, - u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I, - u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I; - job->phase[phas_idx] = q1+q2; - job->phase[phas_idx+2] = u1+u2; - if (ispair[itot]) - { - dcmplx *phQ = &(job->phase[phas_idx+1]), - *phU = &(job->phase[phas_idx+3]); - *phQ = q1-q2; - *phU = u1-u2; - if ((gen->mhi-gen->m+gen->s)&1) - { *phQ=-(*phQ); *phU=-(*phU); } - } - } - } - } - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } - } - -NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - const int nval=nvec*VLEN; - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_MAP2ALM: - { - if (job->spin==0) - { - Tv atmp[2*(gen->lmax+2)]; - memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv)); - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - if ((i+ith=m)) - { - int phas_idx = itot*job->s_th + mi*job->s_m; - dcmplx ph1=job->phase[phas_idx]; - dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; - p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2); - p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2); - } - } - if (!skip) - Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp); - } - { - int istart=m, istop=gen->lmax+1; - for(; istartalmtmp[istart])); - for(; istartalmtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]); - } - } - else - { - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - if (i+iths_th + mi*job->s_m; - dcmplx p1Q=job->phase[phas_idx], - p1U=job->phase[phas_idx+2], - p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., - p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; - if ((gen->mhi-gen->m+gen->s)&1) - { p2Q=-p2Q; p2U=-p2U; } - p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q); - p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U); - p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q); - p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U); - } - } - if (!skip) - Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b); - } - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } - } - -static void Z(inner_loop) (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - (job->type==SHARP_MAP2ALM) ? - Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : - Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); - } - -#undef VZERO diff --git a/libsharp/sharp_core_inchelper.c b/libsharp/sharp_core_inchelper.c deleted file mode 100644 index c58cecc..0000000 --- a/libsharp/sharp_core_inchelper.c +++ /dev/null @@ -1,10 +0,0 @@ -#define Tb CONCAT2(Tb,nvec) -#define Y(arg) CONCAT2(arg,nvec) -#include "sharp_core_inc.c" - -#define Z(arg) CONCAT2(arg,nvec) -#include "sharp_core_inc2.c" -#undef Z - -#undef Y -#undef Tb