clearer macro names
This commit is contained in:
parent
ea8d4b4ecd
commit
65f47d10cc
2 changed files with 25 additions and 11 deletions
|
@ -41,13 +41,16 @@ if (njobs>1)
|
|||
Tb lam_3, lam_4;
|
||||
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
|
||||
// lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
|
||||
lam_3.v[i] = vabmc(vmul(cth.v[i],lam_2.v[i]),r0,vmul(lam_1.v[i],r1));
|
||||
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
|
||||
// lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
|
||||
lam_4.v[i] = vabmc(vmul(cth.v[i],lam_3.v[i]),r0,vmul(lam_2.v[i],r1));
|
||||
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
|
||||
// lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
|
||||
lam_1.v[i] = vabmc(vmul(cth.v[i],lam_4.v[i]),r0,vmul(lam_3.v[i],r1));
|
||||
for (int j=0; j<njobs; ++j)
|
||||
{
|
||||
Tv ar2=vload(creal(alm[njobs*l+j])),
|
||||
|
@ -71,7 +74,8 @@ if (njobs>1)
|
|||
}
|
||||
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
|
||||
// lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
|
||||
lam_2.v[i] = vabmc(vmul(cth.v[i],lam_1.v[i]),r0,vmul(lam_4.v[i],r1));
|
||||
l+=4;
|
||||
}
|
||||
}
|
||||
|
@ -127,13 +131,15 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
|
|||
while (l<lmax)
|
||||
{
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
|
||||
- vload(rf[l].f[1])*lam_1.v[i];
|
||||
lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
|
||||
vmul(vload(rf[l].f[1]),lam_1.v[i]));
|
||||
for (int j=0; j<njobs; ++j)
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
|
||||
atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
|
||||
vfmaeq(atmp[2*(l*njobs+j)],lam_2.v[i],p1[j].r.v[i]);
|
||||
vfmaeq(atmp[2*(l*njobs+j)+1],lam_2.v[i],p1[j].i.v[i]);
|
||||
// atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
|
||||
// atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
|
||||
}
|
||||
for (int i=0; i<nvec; ++i)
|
||||
lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
|
||||
|
@ -141,8 +147,10 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
|
|||
for (int j=0; j<njobs; ++j)
|
||||
for (int i=0; i<nvec; ++i)
|
||||
{
|
||||
atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
|
||||
atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
|
||||
vfmaeq(atmp[2*((l+1)*njobs+j)],lam_1.v[i],p2[j].r.v[i]);
|
||||
vfmaeq(atmp[2*((l+1)*njobs+j)+1],lam_1.v[i],p2[j].i.v[i]);
|
||||
// atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
|
||||
// atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
|
|
|
@ -58,6 +58,7 @@ typedef int Tm;
|
|||
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
|
||||
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
|
||||
#define vfmseq(a,b,c) ((a)-=(b)*(c))
|
||||
// vabmc(a,b,c): expression that evaluates to a*b-c. Unlike vfmaeq/vfmseq it
// assigns nothing; the caller stores the result itself.
#define vabmc(a,b,c) ((a)*(b)-(c))
|
||||
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
|
||||
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
|
||||
#define vneg(a) (-(a))
|
||||
|
@ -125,6 +126,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
|||
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
|
||||
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
|
||||
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
|
||||
// vabmc(a,b,c): a*b-c on packed doubles, done as a separate multiply
// (_mm_mul_pd) followed by a subtract (_mm_sub_pd). Yields a value; assigns nothing.
#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
|
||||
#define vfmaaeq(a,b,c,d,e) \
|
||||
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
|
||||
#define vfmaseq(a,b,c,d,e) \
|
||||
|
@ -182,6 +184,7 @@ typedef __m256d Tm;
|
|||
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
|
||||
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
|
||||
// vabmc(a,b,c): a*b-c on packed doubles through the single FMA4 multiply-subtract
// intrinsic _mm256_msub_pd. Yields a value; assigns nothing.
#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||
#else
|
||||
|
@ -189,12 +192,14 @@ typedef __m256d Tm;
|
|||
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
|
||||
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
|
||||
// vabmc(a,b,c): a*b-c on packed doubles through the single FMA3 multiply-subtract
// intrinsic _mm256_fmsub_pd. Yields a value; assigns nothing.
#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
||||
#else
|
||||
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
|
||||
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
|
||||
// vabmc(a,b,c): a*b-c on packed doubles without fused instructions: a separate
// _mm256_mul_pd followed by _mm256_sub_pd. Yields a value; assigns nothing.
#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
|
||||
#define vfmaaeq(a,b,c,d,e) \
|
||||
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
|
||||
#define vfmaseq(a,b,c,d,e) \
|
||||
|
@ -241,7 +246,8 @@ typedef __mmask8 Tm;
|
|||
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
|
||||
// vmuleq_mask(mask,a,b): multiplies a by b in the lanes selected by mask and keeps
// a's previous value in the other lanes (a is both pass-through source and target).
// NOTE(review): the expansion itself ends in ';', so "vmuleq_mask(...);" produces an
// extra empty statement and the macro cannot be used as the unbraced body of an
// if that has an else. Confirm all call sites before dropping the semicolon.
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
|
||||
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
|
||||
#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
|
||||
// vabmc(a,b,c): a*b-c on packed doubles through the fused multiply-subtract
// intrinsic _mm512_fmsub_pd. Yields a value and assigns nothing, matching the
// vabmc definitions of the other instruction-set variants so that code calling
// vabmc also builds for AVX-512. (The previously commented-out draft reused the
// vfmseq body, which computes a-=b*c instead.)
#define vabmc(a,b,c) _mm512_fmsub_pd(a,b,c)
|
||||
//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
||||
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue