fix SHTs for colatitudes with sin(theta)<0

This commit is contained in:
Martin Reinecke 2013-02-24 08:26:21 +01:00
parent 515ef8a4aa
commit 44cf6c2339
2 changed files with 28 additions and 17 deletions

View file

@ -220,7 +220,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
} }
} }
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_, static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_, Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen) Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{ {
@ -232,6 +232,11 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
cth2.v[i]=vmax(cth2.v[i],vload(1e-15)); cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5))); sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
sth2.v[i]=vmax(sth2.v[i],vload(1e-15)); sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
Tv mask=vlt(sth.v[i],vzero);
Tv cfct=vblend(vand(mask,vlt(cth.v[i],vzero)),vload(-1.),vone);
cth2.v[i]=vmul(cth2.v[i],cfct);
Tv sfct=vblend(vand(mask,vgt(cth.v[i],vzero)),vload(-1.),vone);
sth2.v[i]=vmul(sth2.v[i],sfct);
} }
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;

View file

@ -429,12 +429,14 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2); Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
} }
static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1) const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
@ -473,12 +475,14 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
lmax NJ2); lmax NJ2);
} }
static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1) const sharp_Ylmgen_C * restrict gen, sharp_job *job,
const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
@ -568,12 +572,14 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2); Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
} }
static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1) const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1)
{ {
int l, lmax=gen->lmax; int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); Y(iter_to_ieee_spin)
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec; job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
@ -669,7 +675,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
for (int ith=0; ith<ulim-llim; ith+=nval) for (int ith=0; ith<ulim-llim; ith+=nval)
{ {
Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2); Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth; Y(Tbu) cth, sth;
int skip=1; int skip=1;
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
@ -677,14 +683,14 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
int itot=i+ith; int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1; if (itot>=ulim-llim) itot=ulim-llim-1;
if (mlim[itot]>=m) skip=0; if (mlim[itot]>=m) skip=0;
cth.s[i]=cth_[itot]; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
} }
if (!skip) if (!skip)
(job->type==SHARP_ALM2MAP) ? (job->type==SHARP_ALM2MAP) ?
Z(calc_alm2map_spin ) Z(calc_alm2map_spin )
(cth.b,gen,job,&p1[0].b,&p2[0].b NJ2) : (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
Z(calc_alm2map_deriv1) Z(calc_alm2map_deriv1)
(cth.b,gen,job,&p1[0].b,&p2[0].b NJ2); (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
{ {
@ -753,7 +759,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
for (int ith=0; ith<ulim-llim; ith+=nval) for (int ith=0; ith<ulim-llim; ith+=nval)
{ {
Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2); Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth; Y(Tbu) cth, sth;
int skip=1; int skip=1;
for (int i=0; i<nval; ++i) for (int i=0; i<nval; ++i)
@ -761,7 +767,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
int itot=i+ith; int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1; if (itot>=ulim-llim) itot=ulim-llim-1;
if (mlim[itot]>=m) skip=0; if (mlim[itot]>=m) skip=0;
cth.s[i]=cth_[itot]; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim) if (i+ith<ulim-llim)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
@ -781,7 +787,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
} }
} }
if (!skip) if (!skip)
Z(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2); Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
} }
} }
break; break;