add ALM2MAP_DERIV1
This commit is contained in:
parent
39cf1ee90b
commit
84c0a7de18
6 changed files with 318 additions and 64 deletions
|
@ -1,9 +1,9 @@
|
|||
static const int maxtr = 6;
|
||||
static const int nv_opt[6][2][3] = {
|
||||
{{4,2,-1},{2,1,-1}},
|
||||
{{4,2,-1},{2,1,-1}},
|
||||
{{5,2,-1},{5,1,-1}},
|
||||
{{5,2,-1},{5,2,-1}},
|
||||
{{5,2,-1},{5,2,-1}},
|
||||
{{5,2,-1},{5,2,-1}}
|
||||
{{4,2,-1},{2,1,1}},
|
||||
{{4,2,-1},{2,1,1}},
|
||||
{{5,2,-1},{5,2,1}},
|
||||
{{5,2,-1},{5,2,1}},
|
||||
{{5,2,-1},{5,2,1}},
|
||||
{{5,2,-1},{5,2,1}}
|
||||
};
|
||||
|
|
|
@ -406,7 +406,7 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
|||
{
|
||||
ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
|
||||
double fct = (job->type==ALM2MAP) ? job->norm_l[l] :
|
||||
-fabs(job->norm_l[l])*sqrt(l*(l+1.));
|
||||
fabs(job->norm_l[l])*sqrt(l*(l+1.));
|
||||
for (int i=0; i<job->ntrans*job->nalm; ++i)
|
||||
if (job->fde==DOUBLE)
|
||||
job->almtmp[job->ntrans*job->nalm*l+i]
|
||||
|
@ -543,8 +543,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin
|
|||
const sharp_alm_info *alm_info, int ntrans)
|
||||
{
|
||||
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
|
||||
if (type==ALM2MAP_DERIV1) spin=1;
|
||||
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
|
||||
UTIL_ASSERT((type==MAP2ALM)||(type==ALM2MAP), "unsupported SHT type");
|
||||
job->type = type;
|
||||
job->spin = spin;
|
||||
job->norm_l = NULL;
|
||||
|
@ -586,8 +586,9 @@ int sharp_get_nv_max (void)
|
|||
|
||||
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
|
||||
{
|
||||
UTIL_ASSERT(type!=ALM2MAP_DERIV1,"transform type not yet supported");
|
||||
|
||||
if (type==ALM2MAP_DERIV1) spin=1;
|
||||
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
|
||||
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
|
||||
#include "oracle.inc"
|
||||
|
||||
return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
|
||||
|
|
|
@ -100,42 +100,86 @@ static void check_sign_scale(void)
|
|||
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
|
||||
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
|
||||
|
||||
double **map;
|
||||
ALLOC2D(map,double,2,npix);
|
||||
for (int ntrans=1; ntrans<10; ++ntrans)
|
||||
{
|
||||
double **map;
|
||||
ALLOC2D(map,double,2*ntrans,npix);
|
||||
|
||||
dcmplx **alm;
|
||||
ALLOC2D(alm,dcmplx,2,nalms);
|
||||
for (int i=0; i<2; ++i)
|
||||
for (int j=0; j<nalms; ++j)
|
||||
alm[i][j]=1.+_Complex_I;
|
||||
dcmplx **alm;
|
||||
ALLOC2D(alm,dcmplx,2*ntrans,nalms);
|
||||
for (int i=0; i<2*ntrans; ++i)
|
||||
for (int j=0; j<nalms; ++j)
|
||||
alm[i][j]=1.+_Complex_I;
|
||||
|
||||
sharp_job job;
|
||||
sharpd_build_job(&job,ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,1);
|
||||
sharp_execute_job(&job);
|
||||
UTIL_ASSERT(FAPPROX(map[0][0 ], 3.588246976618616912e+00,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),"error");
|
||||
sharp_job job;
|
||||
sharpd_build_job(&job,ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,ntrans);
|
||||
sharp_execute_job(&job);
|
||||
for (int it=0; it<ntrans; ++it)
|
||||
{
|
||||
UTIL_ASSERT(FAPPROX(map[it][0 ], 3.588246976618616912e+00,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
|
||||
"error");
|
||||
}
|
||||
sharpd_build_job(&job,ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,ntrans);
|
||||
sharp_execute_job(&job);
|
||||
for (int it=0; it<ntrans; ++it)
|
||||
{
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][0 ], 2.750897760535633285e+00,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2], 3.137704477368562905e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-8.405730859837063917e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-2.398026536095463346e+00,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
|
||||
"error");
|
||||
}
|
||||
|
||||
sharpd_build_job(&job,ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,1);
|
||||
sharp_execute_job(&job);
|
||||
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][0 ],-2.398026536095463346e+00,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),"error");
|
||||
sharpd_build_job(&job,ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,ntrans);
|
||||
sharp_execute_job(&job);
|
||||
for (int it=0; it<ntrans; ++it)
|
||||
{
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-1.398186224727334448e+00,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.456676000884031197e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.516249174408820863e+02,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-3.173406200299964119e+00,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
|
||||
"error");
|
||||
}
|
||||
|
||||
sharpd_build_job(&job,ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,1);
|
||||
sharp_execute_job(&job);
|
||||
UTIL_ASSERT(FAPPROX(map[0][0 ],-1.398186224727334448e+00,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][0 ],-3.173406200299964119e+00,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),"error");
|
||||
sharpd_build_job(&job,ALM2MAP_DERIV1,1,0,&alm[0],&map[0],tinfo,alms,ntrans);
|
||||
sharp_execute_job(&job);
|
||||
for (int it=0; it<ntrans; ++it)
|
||||
{
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-6.859393905369091105e-01,1e-11),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.103947835973212364e+02,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.092463246472086439e+03,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-1.411433220713928165e+02,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
|
||||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
|
||||
"error");
|
||||
}
|
||||
|
||||
DEALLOC2D(map);
|
||||
DEALLOC2D(alm);
|
||||
DEALLOC2D(map);
|
||||
DEALLOC2D(alm);
|
||||
}
|
||||
|
||||
sharp_destroy_alm_info(alms);
|
||||
sharp_destroy_geom_info(tinfo);
|
||||
|
|
|
@ -103,32 +103,39 @@ int main(void)
|
|||
fprintf(fp,"static const int maxtr = 6;\n");
|
||||
fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
|
||||
|
||||
const char *shtname[]={"map2alm","alm2map","a2mder1"};
|
||||
|
||||
for (int ntr=1; ntr<=6; ++ntr)
|
||||
{
|
||||
fprintf(fp,"{");
|
||||
for (int spin=0; spin<=2; spin+=2)
|
||||
{
|
||||
fprintf(fp,"{");
|
||||
for (sharp_jobtype type=MAP2ALM; type<=ALM2MAP; ++type)
|
||||
for (sharp_jobtype type=MAP2ALM; type<=ALM2MAP_DERIV1; ++type)
|
||||
{
|
||||
int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
|
||||
unsigned long long opmin=1000000000000000, op;
|
||||
double tmin=1e30;
|
||||
double *time=RALLOC(double,sharp_get_nv_max()+1);
|
||||
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
|
||||
if ((type==ALM2MAP_DERIV1) && (spin==0))
|
||||
fprintf(fp,"-1");
|
||||
else
|
||||
{
|
||||
bench_sht (spin,nv,type,ntr,&time[nv],&op);
|
||||
if (op<opmin) opmin=op;
|
||||
if (time[nv]<tmin)
|
||||
{ tmin=time[nv]; nvbest=nv; }
|
||||
int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
|
||||
unsigned long long opmin=1000000000000000, op;
|
||||
double tmin=1e30;
|
||||
double *time=RALLOC(double,sharp_get_nv_max()+1);
|
||||
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
|
||||
{
|
||||
bench_sht (spin,nv,type,ntr,&time[nv],&op);
|
||||
if (op<opmin) opmin=op;
|
||||
if (time[nv]<tmin)
|
||||
{ tmin=time[nv]; nvbest=nv; }
|
||||
}
|
||||
printf("nt: %d %s spin: %d nv: %d time: %6.3f perf: %6.3f"
|
||||
" dev[%d]: %6.2f%%\n",ntr,shtname[type],
|
||||
spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
|
||||
(time[nvoracle]-tmin)/tmin*100.);
|
||||
DEALLOC(time);
|
||||
fprintf(fp,"%d",nvbest);
|
||||
}
|
||||
printf("nt: %d %s spin: %d nv: %d time: %6.3f perf: %6.3f"
|
||||
" dev[%d]: %6.2f%%\n",ntr,(type==ALM2MAP)?"alm2map":"map2alm",
|
||||
spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
|
||||
(time[nvoracle]-tmin)/tmin*100.);
|
||||
DEALLOC(time);
|
||||
fprintf(fp,"%d",nvbest);
|
||||
fprintf(fp,(type==MAP2ALM)?",":",-1");
|
||||
if (type!=ALM2MAP_DERIV1) fprintf(fp,",");
|
||||
}
|
||||
fprintf(fp,(spin==0)?"},":"}");
|
||||
printf("\n");
|
||||
|
|
|
@ -522,6 +522,104 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
|
|||
Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
|
||||
}
|
||||
|
||||
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
|
||||
const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
|
||||
{
|
||||
for (int j=0; j<njobs; ++j)
|
||||
{
|
||||
Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lw=vadd(rxp.v[i],rxm.v[i]);
|
||||
vfmaeq(px->j[j].qr.v[i],ar,lw);
|
||||
vfmaeq(px->j[j].qi.v[i],ai,lw);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx=vsub(rxm.v[i],rxp.v[i]);
|
||||
vfmaeq(py->j[j].ur.v[i],ai,lx);
|
||||
vfmseq(py->j[j].ui.v[i],ar,lx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
|
||||
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
|
||||
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
|
||||
int lmax)
|
||||
{
|
||||
while (l<lmax)
|
||||
{
|
||||
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
|
||||
fx2=vload(fx[l+1].f[2]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
|
||||
vmul(fx2,rec1p.v[i]));
|
||||
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
|
||||
vmul(fx2,rec1m.v[i]));
|
||||
}
|
||||
Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l]);
|
||||
Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)]);
|
||||
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
|
||||
fx2=vload(fx[l+2].f[2]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
|
||||
vmul(fx2,rec2p.v[i]));
|
||||
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
|
||||
vmul(fx2,rec2m.v[i]));
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
if (l==lmax)
|
||||
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
|
||||
}
|
||||
|
||||
static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
|
||||
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
|
||||
{
|
||||
int l, lmax=gen->lmax;
|
||||
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
|
||||
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
|
||||
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
|
||||
if (l>lmax)
|
||||
{ *done=1; return; }
|
||||
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
|
||||
|
||||
const ylmgen_dbl3 * restrict fx = gen->fx;
|
||||
Tb corfacp,corfacm;
|
||||
Y(getCorfac)(scalep,&corfacp,gen->cf);
|
||||
Y(getCorfac)(scalem,&corfacm,gen->cf);
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
|
||||
while (!full_ieee)
|
||||
{
|
||||
Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
|
||||
&alm[njobs*l]);
|
||||
if (++l>lmax) break;
|
||||
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
|
||||
Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
|
||||
&alm[njobs*l]);
|
||||
if (++l>lmax) break;
|
||||
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
|
||||
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
|
||||
{
|
||||
Y(getCorfac)(scalep,&corfacp,gen->cf);
|
||||
Y(getCorfac)(scalem,&corfacm,gen->cf);
|
||||
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
|
||||
}
|
||||
}
|
||||
|
||||
if (l>lmax)
|
||||
{ *done=1; return; }
|
||||
|
||||
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
|
||||
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
|
||||
Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
|
||||
lmax);
|
||||
}
|
||||
|
||||
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
|
||||
|
||||
static void Z(inner_loop) (sharp_job *job, const int *ispair,
|
||||
|
@ -535,6 +633,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
|
|||
switch (job->type)
|
||||
{
|
||||
case ALM2MAP:
|
||||
case ALM2MAP_DERIV1:
|
||||
{
|
||||
if (job->spin==0)
|
||||
{
|
||||
|
@ -592,7 +691,9 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
|
|||
itot=idx[itot];
|
||||
cth.s[i]=cth_[itot];
|
||||
}
|
||||
Z(calc_alm2map_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
|
||||
(job->type==ALM2MAP) ?
|
||||
Z(calc_alm2map_spin ) (cth.b,gen,job,&p1.b,&p2.b,&done) :
|
||||
Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done);
|
||||
}
|
||||
|
||||
for (int i=0; i<nval; ++i)
|
||||
|
@ -626,8 +727,6 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
|
|||
}
|
||||
break;
|
||||
}
|
||||
case ALM2MAP_DERIV1:
|
||||
break;
|
||||
case MAP2ALM:
|
||||
{
|
||||
if (job->spin==0)
|
||||
|
|
|
@ -511,6 +511,106 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
|
|||
Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
|
||||
}
|
||||
|
||||
static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
|
||||
const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
|
||||
{
|
||||
for (int j=0; j<njobs; ++j)
|
||||
{
|
||||
Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lw=vadd(rxp.v[i],rxm.v[i]);
|
||||
vfmaeq(px[j].qr.v[i],ar,lw);
|
||||
vfmaeq(px[j].qi.v[i],ai,lw);
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
Tv lx=vsub(rxm.v[i],rxp.v[i]);
|
||||
vfmaeq(py[j].ur.v[i],ai,lx);
|
||||
vfmseq(py[j].ui.v[i],ar,lx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
|
||||
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
|
||||
const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
|
||||
int lmax, int njobs)
|
||||
{
|
||||
while (l<lmax)
|
||||
{
|
||||
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
|
||||
fx2=vload(fx[l+1].f[2]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
|
||||
vmul(fx2,rec1p.v[i]));
|
||||
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
|
||||
vmul(fx2,rec1m.v[i]));
|
||||
}
|
||||
Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l],njobs);
|
||||
Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)],njobs);
|
||||
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
|
||||
fx2=vload(fx[l+2].f[2]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
|
||||
vmul(fx2,rec2p.v[i]));
|
||||
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
|
||||
vmul(fx2,rec2m.v[i]));
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
if (l==lmax)
|
||||
Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
|
||||
}
|
||||
|
||||
static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
|
||||
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
|
||||
int *done)
|
||||
{
|
||||
int l, lmax=gen->lmax;
|
||||
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
|
||||
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
|
||||
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
|
||||
if (l>lmax)
|
||||
{ *done=1; return; }
|
||||
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
|
||||
|
||||
const ylmgen_dbl3 * restrict fx = gen->fx;
|
||||
Tb corfacp,corfacm;
|
||||
Y(getCorfac)(scalep,&corfacp,gen->cf);
|
||||
Y(getCorfac)(scalem,&corfacm,gen->cf);
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
|
||||
while (!full_ieee)
|
||||
{
|
||||
Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
|
||||
&alm[njobs*l],njobs);
|
||||
if (++l>lmax) break;
|
||||
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
|
||||
Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
|
||||
&alm[njobs*l], njobs);
|
||||
if (++l>lmax) break;
|
||||
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
|
||||
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
|
||||
{
|
||||
Y(getCorfac)(scalep,&corfacp,gen->cf);
|
||||
Y(getCorfac)(scalem,&corfacm,gen->cf);
|
||||
full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
|
||||
}
|
||||
}
|
||||
|
||||
if (l>lmax)
|
||||
{ *done=1; return; }
|
||||
|
||||
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
|
||||
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
|
||||
Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
|
||||
lmax, njobs);
|
||||
}
|
||||
|
||||
|
||||
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
|
||||
|
||||
static void Y(inner_loop) (sharp_job *job, const int *ispair,
|
||||
|
@ -524,6 +624,7 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair,
|
|||
switch (job->type)
|
||||
{
|
||||
case ALM2MAP:
|
||||
case ALM2MAP_DERIV1:
|
||||
{
|
||||
if (job->spin==0)
|
||||
{
|
||||
|
@ -581,7 +682,11 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair,
|
|||
itot=idx[itot];
|
||||
cth.s[i]=cth_[itot];
|
||||
}
|
||||
Y(calc_alm2map_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
|
||||
(job->type==ALM2MAP) ?
|
||||
Y(calc_alm2map_spin )
|
||||
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) :
|
||||
Y(calc_alm2map_deriv1)
|
||||
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
|
||||
}
|
||||
|
||||
for (int i=0; i<nval; ++i)
|
||||
|
@ -615,8 +720,6 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair,
|
|||
}
|
||||
break;
|
||||
}
|
||||
case ALM2MAP_DERIV1:
|
||||
break;
|
||||
case MAP2ALM:
|
||||
{
|
||||
if (job->spin==0)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue