clearer macro names

This commit is contained in:
Martin Reinecke 2018-12-10 14:37:34 +01:00
parent ea8d4b4ecd
commit 65f47d10cc
2 changed files with 25 additions and 11 deletions

View file

@ -41,13 +41,16 @@ if (njobs>1)
Tb lam_3, lam_4;
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
// lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
lam_3.v[i] = vabmc(vmul(cth.v[i],lam_2.v[i]),r0,vmul(lam_1.v[i],r1));
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
// lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
lam_4.v[i] = vabmc(vmul(cth.v[i],lam_3.v[i]),r0,vmul(lam_2.v[i],r1));
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
// lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
lam_1.v[i] = vabmc(vmul(cth.v[i],lam_4.v[i]),r0,vmul(lam_3.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar2=vload(creal(alm[njobs*l+j])),
@ -71,7 +74,8 @@ if (njobs>1)
}
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
// lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
lam_2.v[i] = vabmc(vmul(cth.v[i],lam_1.v[i]),r0,vmul(lam_4.v[i],r1));
l+=4;
}
}
@ -127,13 +131,15 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
while (l<lmax)
{
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
- vload(rf[l].f[1])*lam_1.v[i];
lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
vmul(vload(rf[l].f[1]),lam_1.v[i]));
for (int j=0; j<njobs; ++j)
for (int i=0; i<nvec; ++i)
{
atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
vfmaeq(atmp[2*(l*njobs+j)],lam_2.v[i],p1[j].r.v[i]);
vfmaeq(atmp[2*(l*njobs+j)+1],lam_2.v[i],p1[j].i.v[i]);
// atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
// atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
}
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
@ -141,8 +147,10 @@ NOINLINE static void Z(map2alm_kernel) (const Tb cth,
for (int j=0; j<njobs; ++j)
for (int i=0; i<nvec; ++i)
{
atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
vfmaeq(atmp[2*((l+1)*njobs+j)],lam_1.v[i],p2[j].r.v[i]);
vfmaeq(atmp[2*((l+1)*njobs+j)+1],lam_1.v[i],p2[j].i.v[i]);
// atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
// atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
}
l+=2;
}

View file

@ -58,6 +58,7 @@ typedef int Tm;
/* Plain-operator variants of the arithmetic helper macros. Every argument and
   the whole expansion are parenthesised. */
/* In-place accumulate: a += b*c (a must be an lvalue). */
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
/* In-place: a -= b*c. */
#define vfmseq(a,b,c) ((a)-=(b)*(c))
/* Value-returning: yields a*b-c and modifies none of its arguments. */
#define vabmc(a,b,c) ((a)*(b)-(c))
/* In-place: a += b*c + d*e. */
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
/* In-place: a += b*c - d*e. */
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
/* Arithmetic negation. */
#define vneg(a) (-(a))
@ -125,6 +126,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
/* SSE2 variants built from separate multiply and add/subtract intrinsics.
   The "...eq" macros expand to an unparenthesised assignment to `a`, so `a`
   must be a simple lvalue and is evaluated twice. */
/* a = a + b*c (packed double). */
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
/* a = a + b*c (packed single). */
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
/* a = a - b*c. */
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
/* Value-returning: a*b - c; nothing is assigned. */
#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
/* a = a + (b*c + d*e). */
#define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
@ -182,6 +184,7 @@ typedef __m256d Tm;
/* Variants using the _mm256_macc / _mm256_nmacc / _mm256_msub intrinsics.
   The "...eq" macros assign their result back to `a`. */
/* a = b*c + a. */
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
/* a = a - b*c. */
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
/* Value-returning multiply-subtract; intended to equal a*b - c like the
   non-fused definitions of vabmc (operand order per the intrinsic reference). */
#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
/* a = a + b*c + d*e, as two chained multiply-accumulates. */
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
/* a = a + b*c - d*e. */
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else
@ -189,12 +192,14 @@ typedef __m256d Tm;
/* Variants using the _mm256_fmadd / _mm256_fnmadd / _mm256_fmsub intrinsics.
   The "...eq" macros assign their result back to `a`. */
/* a = b*c + a. */
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
/* a = a - b*c. */
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
/* Value-returning fused multiply-subtract: a*b - c; nothing is assigned. */
#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
/* a = a + b*c + d*e, as two chained fused multiply-adds. */
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
/* a = a + b*c - d*e. */
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
#else
/* 256-bit variants built from separate multiply and add/subtract intrinsics
   (no fused operations). The "...eq" macros assign their result back to `a`,
   which is evaluated twice. */
/* a = a + b*c (packed double). */
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
/* a = a + b*c (packed single). */
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
/* a = a - b*c. */
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
/* Value-returning: a*b - c; nothing is assigned. */
#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
/* a = a + (b*c + d*e). */
#define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
@ -241,7 +246,8 @@ typedef __mmask8 Tm;
/* 512-bit variants of the arithmetic helper macros. The "...eq" macros assign
   their result back to `a`. */
/* a = a*b. */
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
/* Masked a = a*b; lanes not selected by `mask` keep the old value of `a`.
   NOTE: the expansion already ends in ';' (kept for existing callers). */
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
/* a = b*c + a. */
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
/* a = a - b*c. */
#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
/* Value-returning fused multiply-subtract: a*b - c; nothing is assigned.
   Mirrors the vabmc definitions of the other instruction-set branches so that
   kernels using vabmc also build with 512-bit vectors. */
#define vabmc(a,b,c) _mm512_fmsub_pd(a,b,c)
/* a = a + b*c + d*e, as two chained fused multiply-adds. */
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
/* a = a + b*c - d*e. */
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
/* Negation implemented as multiplication by -1. */
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))