diff --git a/libsharp/oracle.inc b/libsharp/oracle.inc
index a977729..74638c6 100644
--- a/libsharp/oracle.inc
+++ b/libsharp/oracle.inc
@@ -1,9 +1,9 @@
 static const int maxtr = 6;
 static const int nv_opt[6][2][3] = {
-{{4,2,-1},{2,1,-1}},
-{{4,2,-1},{2,1,-1}},
-{{5,2,-1},{5,1,-1}},
-{{5,2,-1},{5,2,-1}},
-{{5,2,-1},{5,2,-1}},
-{{5,2,-1},{5,2,-1}}
+{{4,2,-1},{2,1,1}},
+{{4,2,-1},{2,1,1}},
+{{5,2,-1},{5,2,1}},
+{{5,2,-1},{5,2,1}},
+{{5,2,-1},{5,2,1}},
+{{5,2,-1},{5,2,1}}
 };
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index e3d7a89..4cc659c 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -406,7 +406,7 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
     {
     ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
     double fct = (job->type==ALM2MAP) ? job->norm_l[l] :
-      -fabs(job->norm_l[l])*sqrt(l*(l+1.));
+      fabs(job->norm_l[l])*sqrt(l*(l+1.));
     for (int i=0; i<job->ntrans*job->nalm; ++i)
       if (job->fde==DOUBLE)
         job->almtmp[job->ntrans*job->nalm*l+i]
@@ -543,8 +543,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin
   const sharp_alm_info *alm_info, int ntrans)
   {
   UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
+  if (type==ALM2MAP_DERIV1) spin=1;
   UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
-  UTIL_ASSERT((type==MAP2ALM)||(type==ALM2MAP), "unsupported SHT type");
   job->type = type;
   job->spin = spin;
   job->norm_l = NULL;
@@ -586,8 +586,9 @@ int sharp_get_nv_max (void)
 
 int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
   {
-  UTIL_ASSERT(type!=ALM2MAP_DERIV1,"transform type not yet supported");
-
+  if (type==ALM2MAP_DERIV1) spin=1;
+  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
+  UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
 #include "oracle.inc"
 
   return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
diff --git a/libsharp/sharp_acctest.c b/libsharp/sharp_acctest.c
index 37ee25b..b6f6952 100644
--- a/libsharp/sharp_acctest.c
+++ b/libsharp/sharp_acctest.c
@@ -100,42 +100,86 @@ static void check_sign_scale(void)
   sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
   ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
 
-  double **map;
-  ALLOC2D(map,double,2,npix);
+  for (int ntrans=1; ntrans<10; ++ntrans)
+    {
+    double **map;
+    ALLOC2D(map,double,2*ntrans,npix);
 
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,2,nalms);
-  for (int i=0; i<2; ++i)
-    for (int j=0; j<nalms; ++j)
[...]
+      vfmaeq(px->j[j].qr.v[i],ar,lw);
+      vfmaeq(px->j[j].qi.v[i],ai,lw);
+      }
+    for (int i=0; i<nvec; ++i)
[...]
+      vfmaeq(py->j[j].ur.v[i],ai,lx);
+      vfmseq(py->j[j].ui.v[i],ar,lx);
+      }
+    }
+  }
+
+static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
+  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax)
+  {
+  while (l<lmax)
[...]
+  int l,lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
+  if (l>lmax)
+    { *done=1; return; }
+  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee)
+    {
+    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[njobs*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[njobs*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  if (l>lmax)
+    { *done=1; return; }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax);
+  }
+
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
 static void Z(inner_loop) (sharp_job *job, const int *ispair,
@@ -535,6 +633,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
   switch (job->type)
     {
     case ALM2MAP:
+    case ALM2MAP_DERIV1:
       {
       if (job->spin==0)
         {
@@ -592,7 +691,9 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
             itot=idx[itot];
             cth.s[i]=cth_[itot];
             }
-          Z(calc_alm2map_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
+          (job->type==ALM2MAP) ?
+            Z(calc_alm2map_spin  ) (cth.b,gen,job,&p1.b,&p2.b,&done) :
+            Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done);
           }
         for (int i=0; i
[...]
spin==0)
diff --git a/libsharp/sharp_core_inc3.c b/libsharp/sharp_core_inc3.c
index 9fecec4..950c84c 100644
--- a/libsharp/sharp_core_inc3.c
+++ b/libsharp/sharp_core_inc3.c
@@ -511,6 +511,106 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
   Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
   }
 
+static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
+  {
+  for (int j=0; j<njobs; ++j)
[...]
+  int l,lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
+  if (l>lmax)
+    { *done=1; return; }
+  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee)
+    {
+    Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[njobs*l],njobs);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[njobs*l], njobs);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  if (l>lmax)
+    { *done=1; return; }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax, njobs);
+  }
+
+
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
 static void Y(inner_loop) (sharp_job *job, const int *ispair,
@@ -524,6 +624,7 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair,
   switch (job->type)
     {
     case ALM2MAP:
+    case ALM2MAP_DERIV1:
      {
       if (job->spin==0)
         {
@@ -581,7 +682,11 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair,
             itot=idx[itot];
             cth.s[i]=cth_[itot];
             }
-          Y(calc_alm2map_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
+          (job->type==ALM2MAP) ?
+            Y(calc_alm2map_spin  )
+              (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) :
+            Y(calc_alm2map_deriv1)
+              (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
           }
         for (int i=0; i
[...]
spin==0)
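
For reference, below is a minimal standalone sketch of how the patched sharp_nv_oracle() resolves the nv_opt table for the new ALM2MAP_DERIV1 job type. The table values and the lookup expression are copied from the hunks above; the sharp_jobtype ordering (MAP2ALM=0, ALM2MAP=1, ALM2MAP_DERIV1=2) and the local IMIN definition are assumptions made only for this illustration, not part of the patch.

/* Standalone illustration of the nv_opt lookup in the patched sharp_nv_oracle().
   Table contents and the lookup expression are copied from the diff above; the
   sharp_jobtype ordering and the IMIN macro are assumed here for illustration. */
#include <stdio.h>

#define IMIN(a,b) (((a)<(b)) ? (a) : (b))

typedef enum { MAP2ALM=0, ALM2MAP=1, ALM2MAP_DERIV1=2 } sharp_jobtype; /* assumed ordering */

static const int maxtr = 6;
static const int nv_opt[6][2][3] = {   /* indexed as [ntrans-1][spin!=0][type] */
{{4,2,-1},{2,1,1}},
{{4,2,-1},{2,1,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}},
{{5,2,-1},{5,2,1}}
};

/* Mirrors the patched sharp_nv_oracle(): a derivative job is forced to spin=1,
   so it always reads the spin!=0 column, whose third entry is now a real value
   (1) instead of the former -1 placeholder. */
static int nv_oracle (sharp_jobtype type, int spin, int ntrans)
  {
  if (type==ALM2MAP_DERIV1) spin=1;
  return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
  }

int main (void)
  {
  printf("MAP2ALM        spin 0, ntrans 3: nv=%d\n", nv_oracle(MAP2ALM,0,3));
  printf("ALM2MAP        spin 2, ntrans 1: nv=%d\n", nv_oracle(ALM2MAP,2,1));
  printf("ALM2MAP_DERIV1 spin 0, ntrans 1: nv=%d\n", nv_oracle(ALM2MAP_DERIV1,0,1));
  printf("ALM2MAP_DERIV1 spin 0, ntrans 8: nv=%d\n", nv_oracle(ALM2MAP_DERIV1,0,8));
  return 0;
  }

Note that the spin==0 column keeps its -1 entries for the derivative slot, but they are never read, because both sharp_build_job_common() and sharp_nv_oracle() force spin to 1 for ALM2MAP_DERIV1 jobs.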