diff --git a/.gitignore b/.gitignore index 12a6531..4675273 100644 --- a/.gitignore +++ b/.gitignore @@ -6,14 +6,24 @@ **~ **.pyc **.pyo +.libs +**/.deps +**/.dirstamp /auto /autom4te.cache -/config.log -/config.status -/config/config.auto +/m4 +config.log +config.guess +config.sub +ltmain.sh +compile +missing +/comp /configure -/sharp_oracle.inc - -/python/libsharp/libsharp.c -/python/libsharp/libsharp_mpi.c +/Makefile +/Makefile.in +/aclocal.m4 +/ar-lib +/depcomp +/install-sh diff --git a/Makefile.am b/Makefile.am index 2649be1..a1541f8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,6 +10,11 @@ src_sharp = \ libsharp/sharp.c \ libsharp/sharp_almhelpers.c \ libsharp/sharp_core.c \ + libsharp/sharp_core_avx.c \ + libsharp/sharp_core_avx2.c \ + libsharp/sharp_core_fma.c \ + libsharp/sharp_core_fma4.c \ + libsharp/sharp_core_avx512f.c \ libsharp/sharp_geomhelpers.c \ libsharp/sharp_legendre_roots.c \ libsharp/sharp_ylmgen_c.c \ @@ -24,6 +29,9 @@ include_HEADERS = \ libsharp/sharp_almhelpers.h \ libsharp/sharp_cxx.h +EXTRA_DIST = \ + libsharp/sharp_core_inc.c + libsharp_la_SOURCES = $(src_sharp) check_PROGRAMS = sharp_testsuite diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 146be76..036a8ed 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -1,1168 +1,116 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ +#define ARCH default +#include "sharp_core_inc.c" +#undef ARCH -/*! \file sharp_core.c - * Computational core - * - * Copyright (C) 2012-2019 Max-Planck-Society - * \author Martin Reinecke - */ - -#include -#include -#include -#include "sharp_vecsupport.h" -#include "sharp.h" -#include "sharp_internal.h" -#include "c_utils.h" - -typedef complex double dcmplx; - -#define nv0 (128/VLEN) -#define nvx (64/VLEN) - -typedef Tv Tbv0[nv0]; -typedef double Tbs0[nv0*VLEN]; - -typedef struct - { - Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; - } s0data_v; - -typedef struct - { - Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; - } s0data_s; - -typedef union - { - s0data_v v; - s0data_s s; - } s0data_u; - -typedef Tv Tbvx[nvx]; -typedef double Tbsx[nvx*VLEN]; - -typedef struct - { - Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, - p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; - } sxdata_v; - -typedef struct - { - Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, - p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; - } sxdata_s; - -typedef union - { - sxdata_v v; - sxdata_s s; - } sxdata_u; - -static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, - double maxval) - { - const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); - const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); - Tm mask = vgt(vabs(*val),vfmax); - while (vanyTrue(mask)) - { - vmuleq_mask(mask,*val,vfsmall); - vaddeq_mask(mask,*scale,vone); - mask = vgt(vabs(*val),vfmax); - } - mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); - while (vanyTrue(mask)) - { - vmuleq_mask(mask,*val,vfbig); - vsubeq_mask(mask,*scale,vone); - mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); - } - } - -static void mypow(Tv val, int npow, const double * restrict powlimit, - Tv * restrict resd, Tv * restrict ress) - { - Tv vminv=vload(powlimit[npow]); - Tm mask = vlt(vabs(val),vminv); - if (!vanyTrue(mask)) // no underflows possible, use quick algoritm - { - Tv res=vone; - do - { - if (npow&1) - res*=val; - val*=val; - } - while(npow>>=1); - *resd=res; - *ress=vzero; - } - else - { - Tv scale=vzero, scaleint=vzero, res=vone; - Tvnormalize(&val,&scaleint,sharp_fbighalf); - do - { - if (npow&1) - { - res*=val; - scale+=scaleint; - Tvnormalize(&res,&scale,sharp_fbighalf); - } - val*=val; - scaleint+=scaleint; - Tvnormalize(&val,&scaleint,sharp_fbighalf); - } - while(npow>>=1); - *resd=res; - *ress=scale; - } - } - -static inline void getCorfac(Tv scale, Tv * restrict corfac, - const double * restrict cf) - { - typedef union - { Tv v; double s[VLEN]; } Tvu; - - Tvu sc, corf; - sc.v=scale; - for (int i=0; im, il=0; - Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Tv limscale=vload(sharp_limscale); - int below_limit = 1; - for (int i=0; ilam1[i]=vzero; - mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); - d->lam2[i] *= mfac; - Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); - below_limit &= vallTrue(vlt(d->scale[i],limscale)); - } - - while (below_limit) - { - if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} - below_limit=1; - Tv a1=vload(gen->coef[il ][0]), b1=vload(gen->coef[il ][1]); - Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]); - for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; - d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) - below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); - } - l+=4; il+=2; - } - *l_=l; *il_=il; - } - -NOINLINE static void alm2map_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, - int l, int il, int lmax, int nv2) - { - if (nv2==nv0) - { - for (; l<=lmax-2; il+=2, l+=4) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); - Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); - Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); - for (int i=0; ip1r[i] += d->lam2[i]*ar1; - d->p1i[i] += d->lam2[i]*ai1; - d->p2r[i] += d->lam2[i]*ar2; - d->p2i[i] += d->lam2[i]*ai2; - d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; - d->p1r[i] += d->lam1[i]*ar3; - d->p1i[i] += d->lam1[i]*ai3; - d->p2r[i] += d->lam1[i]*ar4; - d->p2i[i] += d->lam1[i]*ai4; - d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - } - } - } - else - { - for (; l<=lmax-2; il+=2, l+=4) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); - Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); - Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); - for (int i=0; ip1r[i] += d->lam2[i]*ar1; - d->p1i[i] += d->lam2[i]*ai1; - d->p2r[i] += d->lam2[i]*ar2; - d->p2i[i] += d->lam2[i]*ai2; - d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; - d->p1r[i] += d->lam1[i]*ar3; - d->p1i[i] += d->lam1[i]*ai3; - d->p2r[i] += d->lam1[i]*ar4; - d->p2i[i] += d->lam1[i]*ai4; - d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - } - } - } - for (; l<=lmax; ++il, l+=2) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(coef[il][0]), b=vload(coef[il][1]); - for (int i=0; ip1r[i] += d->lam2[i]*ar1; - d->p1i[i] += d->lam2[i]*ai1; - d->p2r[i] += d->lam2[i]*ar2; - d->p2i[i] += d->lam2[i]*ai2; - Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; - d->lam1[i] = d->lam2[i]; - d->lam2[i] = tmp; - } - } - } - -NOINLINE static void calc_alm2map (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) - { - int l,il,lmax=gen->lmax; - int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee(gen, d, &l, &il, nv2); - job->opcnt += il * 4*nth; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 6*nth; - - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - const dcmplx * restrict alm=job->almtmp; - int full_ieee=1; - for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); - } - - while((!full_ieee) && (l<=lmax)) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(coef[il][0]), b=vload(coef[il][1]); - full_ieee=1; - for (int i=0; ip1r[i] += d->lam2[i]*d->corfac[i]*ar1; - d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1; - d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2; - d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2; - Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; - d->lam1[i] = d->lam2[i]; - d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) - getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); - } - l+=2; ++il; - } - if (l>lmax) return; - - for (int i=0; ilam1[i] *= d->corfac[i]; - d->lam2[i] *= d->corfac[i]; - } - alm2map_kernel(d, coef, alm, l, il, lmax, nv2); - } - -NOINLINE static void map2alm_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l, - int il, int lmax, int nv2) - { - for (; l<=lmax-2; il+=2, l+=4) - { - Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); - Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); - Tv atmp1[4] = {vzero, vzero, vzero, vzero}; - Tv atmp2[4] = {vzero, vzero, vzero, vzero}; - for (int i=0; ilam2[i]*d->p1r[i]; - atmp1[1] += d->lam2[i]*d->p1i[i]; - atmp1[2] += d->lam2[i]*d->p2r[i]; - atmp1[3] += d->lam2[i]*d->p2i[i]; - d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; - atmp2[0] += d->lam1[i]*d->p1r[i]; - atmp2[1] += d->lam1[i]*d->p1i[i]; - atmp2[2] += d->lam1[i]*d->p2r[i]; - atmp2[3] += d->lam1[i]*d->p2i[i]; - d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - } - vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l ]); - vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]); - } - for (; l<=lmax; ++il, l+=2) - { - Tv a=vload(coef[il][0]), b=vload(coef[il][1]); - Tv atmp[4] = {vzero, vzero, vzero, vzero}; - for (int i=0; ilam2[i]*d->p1r[i]; - atmp[1] += d->lam2[i]*d->p1i[i]; - atmp[2] += d->lam2[i]*d->p2r[i]; - atmp[3] += d->lam2[i]*d->p2i[i]; - Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; - d->lam1[i] = d->lam2[i]; - d->lam2[i] = tmp; - } - vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); - } - } - -NOINLINE static void calc_map2alm (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) - { - int l,il,lmax=gen->lmax; - int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee(gen, d, &l, &il, nv2); - job->opcnt += il * 4*nth; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 6*nth; - - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - dcmplx * restrict alm=job->almtmp; - int full_ieee=1; - for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); - } - - while((!full_ieee) && (l<=lmax)) - { - Tv a=vload(coef[il][0]), b=vload(coef[il][1]); - Tv atmp[4] = {vzero, vzero, vzero, vzero}; - full_ieee=1; - for (int i=0; ilam2[i]*d->corfac[i]*d->p1r[i]; - atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i]; - atmp[2] += d->lam2[i]*d->corfac[i]*d->p2r[i]; - atmp[3] += d->lam2[i]*d->corfac[i]*d->p2i[i]; - Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; - d->lam1[i] = d->lam2[i]; - d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) - getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); - } - vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); - l+=2; ++il; - } - if (l>lmax) return; - - for (int i=0; ilam1[i] *= d->corfac[i]; - d->lam2[i] *= d->corfac[i]; - } - map2alm_kernel(d, coef, alm, l, il, lmax, nv2); - } - -NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, - sxdata_v * restrict d, int * restrict l_, int nv2) - { - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - Tv prefac=vload(gen->prefac[gen->m]), - prescale=vload(gen->fscale[gen->m]); - Tv limscale=vload(sharp_limscale); - int below_limit=1; - for (int i=0; icth[i])*vload(0.5))); - Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); - Tm mask=vlt(d->sth[i],vzero); - vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); - vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); - - Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; - mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); - mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); - mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); - mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); - - d->l1p[i] = vzero; - d->l1m[i] = vzero; - d->l2p[i] = prefac*ccp; - d->scp[i] = prescale+ccps; - d->l2m[i] = prefac*csp; - d->scm[i] = prescale+csps; - Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf); - Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf); - d->l2p[i] *= ssp; - d->scp[i] += ssps; - d->l2m[i] *= scp; - d->scm[i] += scps; - if (gen->preMinus_p) - d->l2p[i] = vneg(d->l2p[i]); - if (gen->preMinus_m) - d->l2m[i] = vneg(d->l2m[i]); - if (gen->s&1) - d->l2p[i] = vneg(d->l2p[i]); - - Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); - Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); - - below_limit &= vallTrue(vlt(d->scm[i],limscale)) && - vallTrue(vlt(d->scp[i],limscale)); - } - - int l=gen->mhi; - - while (below_limit) - { - if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} - below_limit=1; - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || - rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) - below_limit &= vallTrue(vlt(d->scp[i],limscale)) && - vallTrue(vlt(d->scm[i],limscale)); - } - l+=2; - } - - *l_=l; - } - -NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, - const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, - int l, int lmax, int nv2) - { - int lsave = l; - while (l<=lmax) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), - acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); - Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), - acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->p1pr[i] += agr1*d->l2p[i]; - d->p1pi[i] += agi1*d->l2p[i]; - d->p1mr[i] += acr1*d->l2p[i]; - d->p1mi[i] += aci1*d->l2p[i]; - - d->p1pr[i] += aci2*d->l1p[i]; - d->p1pi[i] -= acr2*d->l1p[i]; - d->p1mr[i] -= agi2*d->l1p[i]; - d->p1mi[i] += agr2*d->l1p[i]; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - } - l+=2; - } - l=lsave; - while (l<=lmax) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), - acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); - Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), - acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); - for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - d->p2pr[i] -= aci1*d->l2m[i]; - d->p2pi[i] += acr1*d->l2m[i]; - d->p2mr[i] += agi1*d->l2m[i]; - d->p2mi[i] -= agr1*d->l2m[i]; - - d->p2pr[i] += agr2*d->l1m[i]; - d->p2pi[i] += agi2*d->l1m[i]; - d->p2mr[i] += acr2*d->l1m[i]; - d->p2mi[i] += aci2*d->l1m[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - } - l+=2; - } - } - -NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) - { - int l,lmax=gen->lmax; - int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee_spin(gen, d, &l, nv2); - job->opcnt += (l-gen->mhi) * 7*nth; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 23*nth; - - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; - int full_ieee=1; - for (int i=0; iscp[i], &d->cfp[i], gen->cf); - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - - while((!full_ieee) && (l<=lmax)) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), - acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); - Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), - acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); - full_ieee=1; - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - - Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i]; - Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i]; - - d->p1pr[i] += agr1*l2p + aci2*l1p; - d->p1pi[i] += agi1*l2p - acr2*l1p; - d->p1mr[i] += acr1*l2p - agi2*l1p; - d->p1mi[i] += aci1*l2p + agr2*l1p; - - d->p2pr[i] += agr2*l1m - aci1*l2m; - d->p2pi[i] += agi2*l1m + acr1*l2m; - d->p2mr[i] += acr2*l1m + agi1*l2m; - d->p2mi[i] += aci2*l1m - agr1*l2m; - - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) - getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - l+=2; - } -// if (l>lmax) return; - - for (int i=0; il1p[i] *= d->cfp[i]; - d->l2p[i] *= d->cfp[i]; - d->l1m[i] *= d->cfm[i]; - d->l2m[i] *= d->cfm[i]; - } - alm2map_spin_kernel(d, fx, alm, l, lmax, nv2); - - for (int i=0; ip1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; - tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; - tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; - tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; - } - } - -NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, - const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm, - int l, int lmax, int nv2) - { - int lsave=l; - while (l<=lmax) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; - Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - agr1 += d->p2mi[i]*d->l2p[i]; - agi1 -= d->p2mr[i]*d->l2p[i]; - acr1 -= d->p2pi[i]*d->l2p[i]; - aci1 += d->p2pr[i]*d->l2p[i]; - agr2 += d->p2pr[i]*d->l1p[i]; - agi2 += d->p2pi[i]*d->l1p[i]; - acr2 += d->p2mr[i]*d->l1p[i]; - aci2 += d->p2mi[i]*d->l1p[i]; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - } - vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); - vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); - l+=2; - } - l=lsave; - while (l<=lmax) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; - Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; - for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - agr1 += d->p1pr[i]*d->l2m[i]; - agi1 += d->p1pi[i]*d->l2m[i]; - acr1 += d->p1mr[i]*d->l2m[i]; - aci1 += d->p1mi[i]*d->l2m[i]; - agr2 -= d->p1mi[i]*d->l1m[i]; - agi2 += d->p1mr[i]*d->l1m[i]; - acr2 += d->p1pi[i]*d->l1m[i]; - aci2 -= d->p1pr[i]*d->l1m[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - } - vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); - vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); - l+=2; - } - } - -NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) - { - int l,lmax=gen->lmax; - int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee_spin(gen, d, &l, nv2); - job->opcnt += (l-gen->mhi) * 7*nth; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 23*nth; - - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - dcmplx * restrict alm=job->almtmp; - int full_ieee=1; - for (int i=0; iscp[i], &d->cfp[i], gen->cf); - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - for (int i=0; ip1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; - tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; - tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; - tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; - } - - while((!full_ieee) && (l<=lmax)) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; - Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; - full_ieee=1; - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i]; - Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i]; - agr1 += d->p1pr[i]*l2m + d->p2mi[i]*l2p; - agi1 += d->p1pi[i]*l2m - d->p2mr[i]*l2p; - acr1 += d->p1mr[i]*l2m - d->p2pi[i]*l2p; - aci1 += d->p1mi[i]*l2m + d->p2pr[i]*l2p; - agr2 += d->p2pr[i]*l1p - d->p1mi[i]*l1m; - agi2 += d->p2pi[i]*l1p + d->p1mr[i]*l1m; - acr2 += d->p2mr[i]*l1p + d->p1pi[i]*l1m; - aci2 += d->p2mi[i]*l1p - d->p1pr[i]*l1m; - - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) - getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); - vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); - l+=2; - } - if (l>lmax) return; - - for (int i=0; il1p[i] *= d->cfp[i]; - d->l2p[i] *= d->cfp[i]; - d->l1m[i] *= d->cfm[i]; - d->l2m[i] *= d->cfm[i]; - } - map2alm_spin_kernel(d, fx, alm, l, lmax, nv2); - } - - -NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, - const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, - int l, int lmax, int nv2) - { - while (l<=lmax) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), - ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - Tv lw=d->l2p[i]+d->l2m[i]; - d->p1pr[i] += ar1*lw; - d->p1pi[i] += ai1*lw; - Tv lx=d->l2m[i]-d->l2p[i]; - d->p2mr[i] += ai1*lx; - d->p2mi[i] -= ar1*lx; - lw=d->l1p[i]+d->l1m[i]; - d->p2pr[i] += ar2*lw; - d->p2pi[i] += ai2*lw; - lx=d->l1m[i]-d->l1p[i]; - d->p1mr[i] += ai2*lx; - d->p1mi[i] -= ar2*lx; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - } - l+=2; - } - } - -NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) - { - int l,lmax=gen->lmax; - int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee_spin(gen, d, &l, nv2); - job->opcnt += (l-gen->mhi) * 7*nth; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 17*nth; - - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; - int full_ieee=1; - for (int i=0; iscp[i], &d->cfp[i], gen->cf); - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - - while((!full_ieee) && (l<=lmax)) - { - Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); - Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), - ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - full_ieee=1; - for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i]; - d->p1pr[i] += ar1*lw; - d->p1pi[i] += ai1*lw; - Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i]; - d->p2mr[i] += ai1*lx; - d->p2mi[i] -= ar1*lx; - lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i]; - d->p2pr[i] += ar2*lw; - d->p2pi[i] += ai2*lw; - lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i]; - d->p1mr[i] += ai2*lx; - d->p1mi[i] -= ar2*lx; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) - { - getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - } - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) - { - getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } - } - l+=2; - } - if (l>lmax) return; - - for (int i=0; il1p[i] *= d->cfp[i]; - d->l2p[i] *= d->cfp[i]; - d->l1m[i] *= d->cfm[i]; - d->l2m[i] *= d->cfm[i]; - } - alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2); - } - - -#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) - -NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, +typedef void (*t_inner_loop) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) + sharp_Ylmgen_C *gen, int mi, const int *mlim); +typedef int (*t_veclen) (void); +typedef int (*t_max_nvec) (int spin); +typedef const char *(*t_architecture) (void); + +static t_inner_loop inner_loop_ = NULL; +static t_veclen veclen_ = NULL; +static t_max_nvec max_nvec_ = NULL; +static t_architecture architecture_ = NULL; + +#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define DECL(arch) \ +static int XCONCATX2(have,arch)(void) \ + { \ + static int res=-1; \ + if (res<0) \ + { \ + __builtin_cpu_init(); \ + res = __builtin_cpu_supports(#arch); \ + } \ + return res; \ + } \ +\ +void XCONCATX2(inner_loop,arch) (sharp_job *job, const int *ispair, \ + const double *cth_, const double *sth_, int llim, int ulim, \ + sharp_Ylmgen_C *gen, int mi, const int *mlim); \ +int XCONCATX2(sharp_veclen,arch) (void); \ +int XCONCATX2(sharp_max_nvec,arch) (int spin); \ +const char *XCONCATX2(sharp_architecture,arch) (void); + +#if (!defined(__AVX512F__)) +DECL(avx512f) +#endif +#if (!defined(__FMA4__)) +DECL(fma4) +#endif +#if (!defined(__FMA__)) +DECL(fma) +#endif +#if (!defined(__AVX2__)) +DECL(avx2) +#endif +#if (!defined(__AVX__)) +DECL(avx) +#endif + +#endif + +static void assign_funcs(void) { - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_ALM2MAP: - case SHARP_ALM2MAP_DERIV1: - { - if (job->spin==0) - { - //adjust the a_lm for the new algorithm - dcmplx * restrict alm=job->almtmp; - for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) - { - dcmplx al = alm[l]; - dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; - dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2]; - alm[l ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2); - alm[l+1] = gen->alpha[il]*al1; - } - - const int nval=nv0*VLEN; - int ith=0; - int itgt[nval]; - while (ith=m) - { - itgt[nth] = ith; - d.s.csq[nth]=cth_[ith]*cth_[ith]; - d.s.sth[nth]=sth_[ith]; - ++nth; - } - else - { - int phas_idx = ith*job->s_th + mi*job->s_m; - job->phase[phas_idx] = job->phase[phas_idx+1] = 0; - } - ++ith; - } - if (nth>0) - { - int i2=((nth+VLEN-1)/VLEN)*VLEN; - for (int i=nth; is_th + mi*job->s_m; - complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, - r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; - job->phase[phas_idx] = r1+r2; - if (ispair[tgt]) - job->phase[phas_idx+1] = r1-r2; - } - } - } - } - else - { - //adjust the a_lm for the new algorithm - if (job->nalm==2) - for (int l=gen->mhi; l<=gen->lmax+1; ++l) - { - job->almtmp[2*l ]*=gen->alpha[l]; - job->almtmp[2*l+1]*=gen->alpha[l]; - } - else - for (int l=gen->mhi; l<=gen->lmax+1; ++l) - job->almtmp[l]*=gen->alpha[l]; - - const int nval=nvx*VLEN; - int ith=0; - int itgt[nval]; - while (ith=m) - { - itgt[nth] = ith; - d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; - ++nth; - } - else - { - int phas_idx = ith*job->s_th + mi*job->s_m; - job->phase[phas_idx ] = job->phase[phas_idx+1] = 0; - job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0; - } - ++ith; - } - if (nth>0) - { - int i2=((nth+VLEN-1)/VLEN)*VLEN; - for (int i=nth; itype==SHARP_ALM2MAP) ? - calc_alm2map_spin (job, gen, &d.v, nth) : - calc_alm2map_deriv1(job, gen, &d.v, nth); - for (int i=0; is_th + mi*job->s_m; - complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, - q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, - u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, - u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; - job->phase[phas_idx ] = q1+q2; - job->phase[phas_idx+2] = u1+u2; - if (ispair[tgt]) - { - dcmplx *phQ = &(job->phase[phas_idx+1]), - *phU = &(job->phase[phas_idx+3]); - *phQ = q1-q2; - *phU = u1-u2; - if ((gen->mhi-gen->m+gen->s)&1) - { *phQ=-(*phQ); *phU=-(*phU); } - } - } - } - } - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } +#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +#define DECL2(arch) \ + if (XCONCATX2(have,arch)()) \ + { \ + inner_loop_ = XCONCATX2(inner_loop,arch); \ + veclen_ = XCONCATX2(sharp_veclen,arch); \ + max_nvec_ = XCONCATX2(sharp_max_nvec,arch); \ + architecture_ = XCONCATX2(sharp_architecture,arch); \ + return; \ } +#if (!defined(__AVX512F__)) +DECL2(avx512f) +#endif +#if (!defined(__FMA4__)) +DECL2(fma4) +#endif +#if (!defined(__FMA__)) +DECL2(fma) +#endif +#if (!defined(__AVX2__)) +DECL2(avx2) +#endif +#if (!defined(__AVX__)) +DECL2(avx) +#endif +#endif + inner_loop_ = inner_loop_default; + veclen_ = sharp_veclen_default; + max_nvec_ = sharp_max_nvec_default; + architecture_ = sharp_architecture_default; } -NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) + +void inner_loop (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, + const int *mlim) { - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_MAP2ALM: - { - if (job->spin==0) - { - const int nval=nv0*VLEN; - int ith=0; - while (ith=m) - { - d.s.csq[nth]=cth_[ith]*cth_[ith]; d.s.sth[nth]=sth_[ith]; - int phas_idx = ith*job->s_th + mi*job->s_m; - dcmplx ph1=job->phase[phas_idx]; - dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; - d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); - d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); - //adjust for new algorithm - d.s.p2r[nth]*=cth_[ith]; - d.s.p2i[nth]*=cth_[ith]; - ++nth; - } - ++ith; - } - if (nth>0) - { - int i2=((nth+VLEN-1)/VLEN)*VLEN; - for (int i=nth; ialmtmp; - dcmplx alm2 = 0.; - double alold=0; - for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) - { - dcmplx al = alm[l]; - dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; - alm[l ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2; - alm[l+1] = gen->alpha[il]*al1; - alm2=al; - alold=gen->alpha[il]; - } - } - else - { - const int nval=nvx*VLEN; - int ith=0; - while (ith=m) - { - d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; - int phas_idx = ith*job->s_th + mi*job->s_m; - dcmplx p1Q=job->phase[phas_idx], - p1U=job->phase[phas_idx+2], - p2Q=ispair[ith] ? job->phase[phas_idx+1]:0., - p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; - if ((gen->mhi-gen->m+gen->s)&1) - { p2Q=-p2Q; p2U=-p2U; } - d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); - d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); - d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); - d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); - ++nth; - } - ++ith; - } - if (nth>0) - { - int i2=((nth+VLEN-1)/VLEN)*VLEN; - for (int i=nth; imhi; l<=gen->lmax; ++l) - { - job->almtmp[2*l ]*=gen->alpha[l]; - job->almtmp[2*l+1]*=gen->alpha[l]; - } - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } + if (!inner_loop_) assign_funcs(); + inner_loop_(job, ispair, cth, sth, llim, ulim, gen, mi, mlim); } - -void inner_loop (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - (job->type==SHARP_MAP2ALM) ? - inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : - inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); - } - -#undef VZERO - int sharp_veclen(void) { - return VLEN; + if (!veclen_) assign_funcs(); + return veclen_(); } - int sharp_max_nvec(int spin) { - return (spin==0) ? nv0 : nvx; + if (!max_nvec_) assign_funcs(); + return max_nvec_(spin); + } +const char *sharp_architecture(void) + { + if (!architecture_) assign_funcs(); + return architecture_(); } diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c new file mode 100644 index 0000000..724e629 --- /dev/null +++ b/libsharp/sharp_core_avx.c @@ -0,0 +1,11 @@ +#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + +#define ARCH avx +#pragma GCC target("avx") +#include "sharp_core_inc.c" + +#endif diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c new file mode 100644 index 0000000..a7ab0a7 --- /dev/null +++ b/libsharp/sharp_core_avx2.c @@ -0,0 +1,11 @@ +#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + +#define ARCH avx2 +#pragma GCC target("avx2") +#include "sharp_core_inc.c" + +#endif diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c new file mode 100644 index 0000000..7f17429 --- /dev/null +++ b/libsharp/sharp_core_avx512f.c @@ -0,0 +1,11 @@ +#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + +#define ARCH avx512f +#pragma GCC target("avx512f") +#include "sharp_core_inc.c" + +#endif diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c new file mode 100644 index 0000000..793151f --- /dev/null +++ b/libsharp/sharp_core_fma.c @@ -0,0 +1,11 @@ +#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + +#define ARCH fma +#pragma GCC target("fma") +#include "sharp_core_inc.c" + +#endif diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c new file mode 100644 index 0000000..d71de74 --- /dev/null +++ b/libsharp/sharp_core_fma4.c @@ -0,0 +1,11 @@ +#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + +#define ARCH fma4 +#pragma GCC target("fma4") +#include "sharp_core_inc.c" + +#endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c new file mode 100644 index 0000000..ddd08df --- /dev/null +++ b/libsharp/sharp_core_inc.c @@ -0,0 +1,1175 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_core.c + * Computational core + * + * Copyright (C) 2012-2019 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include +#include +#include "sharp_vecsupport.h" +#include "sharp.h" +#include "sharp_internal.h" +#include "c_utils.h" + +typedef complex double dcmplx; + +#define nv0 (128/VLEN) +#define nvx (64/VLEN) + +typedef Tv Tbv0[nv0]; +typedef double Tbs0[nv0*VLEN]; + +typedef struct + { + Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; + } s0data_v; + +typedef struct + { + Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; + } s0data_s; + +typedef union + { + s0data_v v; + s0data_s s; + } s0data_u; + +typedef Tv Tbvx[nvx]; +typedef double Tbsx[nvx*VLEN]; + +typedef struct + { + Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + } sxdata_v; + +typedef struct + { + Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + } sxdata_s; + +typedef union + { + sxdata_v v; + sxdata_s s; + } sxdata_u; + +static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, + double maxval) + { + const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); + const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); + Tm mask = vgt(vabs(*val),vfmax); + while (vanyTrue(mask)) + { + vmuleq_mask(mask,*val,vfsmall); + vaddeq_mask(mask,*scale,vone); + mask = vgt(vabs(*val),vfmax); + } + mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); + while (vanyTrue(mask)) + { + vmuleq_mask(mask,*val,vfbig); + vsubeq_mask(mask,*scale,vone); + mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); + } + } + +static void mypow(Tv val, int npow, const double * restrict powlimit, + Tv * restrict resd, Tv * restrict ress) + { + Tv vminv=vload(powlimit[npow]); + Tm mask = vlt(vabs(val),vminv); + if (!vanyTrue(mask)) // no underflows possible, use quick algoritm + { + Tv res=vone; + do + { + if (npow&1) + res*=val; + val*=val; + } + while(npow>>=1); + *resd=res; + *ress=vzero; + } + else + { + Tv scale=vzero, scaleint=vzero, res=vone; + Tvnormalize(&val,&scaleint,sharp_fbighalf); + do + { + if (npow&1) + { + res*=val; + scale+=scaleint; + Tvnormalize(&res,&scale,sharp_fbighalf); + } + val*=val; + scaleint+=scaleint; + Tvnormalize(&val,&scaleint,sharp_fbighalf); + } + while(npow>>=1); + *resd=res; + *ress=scale; + } + } + +static inline void getCorfac(Tv scale, Tv * restrict corfac, + const double * restrict cf) + { + typedef union + { Tv v; double s[VLEN]; } Tvu; + + Tvu sc, corf; + sc.v=scale; + for (int i=0; im, il=0; + Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); + Tv limscale=vload(sharp_limscale); + int below_limit = 1; + for (int i=0; ilam1[i]=vzero; + mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); + d->lam2[i] *= mfac; + Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); + below_limit &= vallTrue(vlt(d->scale[i],limscale)); + } + + while (below_limit) + { + if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} + below_limit=1; + Tv a1=vload(gen->coef[il ][0]), b1=vload(gen->coef[il ][1]); + Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]); + for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); + } + l+=4; il+=2; + } + *l_=l; *il_=il; + } + +NOINLINE static void alm2map_kernel(s0data_v * restrict d, + const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, + int l, int il, int lmax, int nv2) + { + if (nv2==nv0) + { + for (; l<=lmax-2; il+=2, l+=4) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); + for (int i=0; ip1r[i] += d->lam2[i]*ar1; + d->p1i[i] += d->lam2[i]*ai1; + d->p2r[i] += d->lam2[i]*ar2; + d->p2i[i] += d->lam2[i]*ai2; + d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + d->p1r[i] += d->lam1[i]*ar3; + d->p1i[i] += d->lam1[i]*ai3; + d->p2r[i] += d->lam1[i]*ar4; + d->p2i[i] += d->lam1[i]*ai4; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + } + } + } + else + { + for (; l<=lmax-2; il+=2, l+=4) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); + for (int i=0; ip1r[i] += d->lam2[i]*ar1; + d->p1i[i] += d->lam2[i]*ai1; + d->p2r[i] += d->lam2[i]*ar2; + d->p2i[i] += d->lam2[i]*ai2; + d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + d->p1r[i] += d->lam1[i]*ar3; + d->p1i[i] += d->lam1[i]*ai3; + d->p2r[i] += d->lam1[i]*ar4; + d->p2i[i] += d->lam1[i]*ai4; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + } + } + } + for (; l<=lmax; ++il, l+=2) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); + for (int i=0; ip1r[i] += d->lam2[i]*ar1; + d->p1i[i] += d->lam2[i]*ai1; + d->p2r[i] += d->lam2[i]*ar2; + d->p2i[i] += d->lam2[i]*ai2; + Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; + d->lam1[i] = d->lam2[i]; + d->lam2[i] = tmp; + } + } + } + +NOINLINE static void calc_alm2map (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) + { + int l,il,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee(gen, d, &l, &il, nv2); + job->opcnt += il * 4*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 6*nth; + + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; + const dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); + full_ieee=1; + for (int i=0; ip1r[i] += d->lam2[i]*d->corfac[i]*ar1; + d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1; + d->p2r[i] += d->lam2[i]*d->corfac[i]*ar2; + d->p2i[i] += d->lam2[i]*d->corfac[i]*ai2; + Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; + d->lam1[i] = d->lam2[i]; + d->lam2[i] = tmp; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + } + l+=2; ++il; + } + if (l>lmax) return; + + for (int i=0; ilam1[i] *= d->corfac[i]; + d->lam2[i] *= d->corfac[i]; + } + alm2map_kernel(d, coef, alm, l, il, lmax, nv2); + } + +NOINLINE static void map2alm_kernel(s0data_v * restrict d, + const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l, + int il, int lmax, int nv2) + { + for (; l<=lmax-2; il+=2, l+=4) + { + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); + Tv atmp1[4] = {vzero, vzero, vzero, vzero}; + Tv atmp2[4] = {vzero, vzero, vzero, vzero}; + for (int i=0; ilam2[i]*d->p1r[i]; + atmp1[1] += d->lam2[i]*d->p1i[i]; + atmp1[2] += d->lam2[i]*d->p2r[i]; + atmp1[3] += d->lam2[i]*d->p2i[i]; + d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + atmp2[0] += d->lam1[i]*d->p1r[i]; + atmp2[1] += d->lam1[i]*d->p1i[i]; + atmp2[2] += d->lam1[i]*d->p2r[i]; + atmp2[3] += d->lam1[i]*d->p2i[i]; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + } + vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l ]); + vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]); + } + for (; l<=lmax; ++il, l+=2) + { + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); + Tv atmp[4] = {vzero, vzero, vzero, vzero}; + for (int i=0; ilam2[i]*d->p1r[i]; + atmp[1] += d->lam2[i]*d->p1i[i]; + atmp[2] += d->lam2[i]*d->p2r[i]; + atmp[3] += d->lam2[i]*d->p2i[i]; + Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; + d->lam1[i] = d->lam2[i]; + d->lam2[i] = tmp; + } + vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); + } + } + +NOINLINE static void calc_map2alm (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) + { + int l,il,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee(gen, d, &l, &il, nv2); + job->opcnt += il * 4*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 6*nth; + + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; + dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); + Tv atmp[4] = {vzero, vzero, vzero, vzero}; + full_ieee=1; + for (int i=0; ilam2[i]*d->corfac[i]*d->p1r[i]; + atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i]; + atmp[2] += d->lam2[i]*d->corfac[i]*d->p2r[i]; + atmp[3] += d->lam2[i]*d->corfac[i]*d->p2i[i]; + Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; + d->lam1[i] = d->lam2[i]; + d->lam2[i] = tmp; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + } + vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); + l+=2; ++il; + } + if (l>lmax) return; + + for (int i=0; ilam1[i] *= d->corfac[i]; + d->lam2[i] *= d->corfac[i]; + } + map2alm_kernel(d, coef, alm, l, il, lmax, nv2); + } + +NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, + sxdata_v * restrict d, int * restrict l_, int nv2) + { + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + Tv prefac=vload(gen->prefac[gen->m]), + prescale=vload(gen->fscale[gen->m]); + Tv limscale=vload(sharp_limscale); + int below_limit=1; + for (int i=0; icth[i])*vload(0.5))); + Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); + Tm mask=vlt(d->sth[i],vzero); + vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); + vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); + + Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; + mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); + mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); + mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); + mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); + + d->l1p[i] = vzero; + d->l1m[i] = vzero; + d->l2p[i] = prefac*ccp; + d->scp[i] = prescale+ccps; + d->l2m[i] = prefac*csp; + d->scm[i] = prescale+csps; + Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf); + Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf); + d->l2p[i] *= ssp; + d->scp[i] += ssps; + d->l2m[i] *= scp; + d->scm[i] += scps; + if (gen->preMinus_p) + d->l2p[i] = vneg(d->l2p[i]); + if (gen->preMinus_m) + d->l2m[i] = vneg(d->l2m[i]); + if (gen->s&1) + d->l2p[i] = vneg(d->l2p[i]); + + Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); + Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); + + below_limit &= vallTrue(vlt(d->scm[i],limscale)) && + vallTrue(vlt(d->scp[i],limscale)); + } + + int l=gen->mhi; + + while (below_limit) + { + if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} + below_limit=1; + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || + rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scp[i],limscale)) && + vallTrue(vlt(d->scm[i],limscale)); + } + l+=2; + } + + *l_=l; + } + +NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, + int l, int lmax, int nv2) + { + int lsave = l; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->p1pr[i] += agr1*d->l2p[i]; + d->p1pi[i] += agi1*d->l2p[i]; + d->p1mr[i] += acr1*d->l2p[i]; + d->p1mi[i] += aci1*d->l2p[i]; + + d->p1pr[i] += aci2*d->l1p[i]; + d->p1pi[i] -= acr2*d->l1p[i]; + d->p1mr[i] -= agi2*d->l1p[i]; + d->p1mi[i] += agr2*d->l1p[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + } + l+=2; + } + l=lsave; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + d->p2pr[i] -= aci1*d->l2m[i]; + d->p2pi[i] += acr1*d->l2m[i]; + d->p2mr[i] += agi1*d->l2m[i]; + d->p2mi[i] -= agr1*d->l2m[i]; + + d->p2pr[i] += agr2*d->l1m[i]; + d->p2pi[i] += agi2*d->l1m[i]; + d->p2mr[i] += acr2*d->l1m[i]; + d->p2mi[i] += aci2*d->l1m[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + } + l+=2; + } + } + +NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 7*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 23*nth; + + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + const dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + + Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i]; + Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i]; + + d->p1pr[i] += agr1*l2p + aci2*l1p; + d->p1pi[i] += agi1*l2p - acr2*l1p; + d->p1mr[i] += acr1*l2p - agi2*l1p; + d->p1mi[i] += aci1*l2p + agr2*l1p; + + d->p2pr[i] += agr2*l1m - aci1*l2m; + d->p2pi[i] += agi2*l1m + acr1*l2m; + d->p2mr[i] += acr2*l1m + agi1*l2m; + d->p2mi[i] += aci2*l1m - agr1*l2m; + + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + l+=2; + } +// if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + alm2map_spin_kernel(d, fx, alm, l, lmax, nv2); + + for (int i=0; ip1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; + tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; + tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; + tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; + } + } + +NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm, + int l, int lmax, int nv2) + { + int lsave=l; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + agr1 += d->p2mi[i]*d->l2p[i]; + agi1 -= d->p2mr[i]*d->l2p[i]; + acr1 -= d->p2pi[i]*d->l2p[i]; + aci1 += d->p2pr[i]*d->l2p[i]; + agr2 += d->p2pr[i]*d->l1p[i]; + agi2 += d->p2pi[i]*d->l1p[i]; + acr2 += d->p2mr[i]*d->l1p[i]; + aci2 += d->p2mi[i]*d->l1p[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + l=lsave; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + agr1 += d->p1pr[i]*d->l2m[i]; + agi1 += d->p1pi[i]*d->l2m[i]; + acr1 += d->p1mr[i]*d->l2m[i]; + aci1 += d->p1mi[i]*d->l2m[i]; + agr2 -= d->p1mi[i]*d->l1m[i]; + agi2 += d->p1mr[i]*d->l1m[i]; + acr2 += d->p1pi[i]*d->l1m[i]; + aci2 -= d->p1pr[i]*d->l1m[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + } + +NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 7*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 23*nth; + + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + for (int i=0; ip1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; + tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; + tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; + tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i]; + Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i]; + agr1 += d->p1pr[i]*l2m + d->p2mi[i]*l2p; + agi1 += d->p1pi[i]*l2m - d->p2mr[i]*l2p; + acr1 += d->p1mr[i]*l2m - d->p2pi[i]*l2p; + aci1 += d->p1mi[i]*l2m + d->p2pr[i]*l2p; + agr2 += d->p2pr[i]*l1p - d->p1mi[i]*l1m; + agi2 += d->p2pi[i]*l1p + d->p1mr[i]*l1m; + acr2 += d->p2mr[i]*l1p + d->p1pi[i]*l1m; + aci2 += d->p2mi[i]*l1p - d->p1pr[i]*l1m; + + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + map2alm_spin_kernel(d, fx, alm, l, lmax, nv2); + } + + +NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, + int l, int lmax, int nv2) + { + while (l<=lmax) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + Tv lw=d->l2p[i]+d->l2m[i]; + d->p1pr[i] += ar1*lw; + d->p1pi[i] += ai1*lw; + Tv lx=d->l2m[i]-d->l2p[i]; + d->p2mr[i] += ai1*lx; + d->p2mi[i] -= ar1*lx; + lw=d->l1p[i]+d->l1m[i]; + d->p2pr[i] += ar2*lw; + d->p2pi[i] += ai2*lw; + lx=d->l1m[i]-d->l1p[i]; + d->p1mr[i] += ai2*lx; + d->p1mi[i] -= ar2*lx; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + } + l+=2; + } + } + +NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 7*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 17*nth; + + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + const dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i]; + d->p1pr[i] += ar1*lw; + d->p1pi[i] += ai1*lw; + Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i]; + d->p2mr[i] += ai1*lx; + d->p2mi[i] -= ar1*lx; + lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i]; + d->p2pr[i] += ar2*lw; + d->p2pi[i] += ai2*lw; + lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i]; + d->p1mr[i] += ai2*lx; + d->p1mi[i] -= ar2*lx; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + { + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + } + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + { + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + } + l+=2; + } + if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2); + } + + +#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) + +NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_ALM2MAP: + case SHARP_ALM2MAP_DERIV1: + { + if (job->spin==0) + { + //adjust the a_lm for the new algorithm + dcmplx * restrict alm=job->almtmp; + for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) + { + dcmplx al = alm[l]; + dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; + dcmplx al2 = (l+2>gen->lmax) ? 0. : alm[l+2]; + alm[l ] = gen->alpha[il]*(gen->eps[l+1]*al + gen->eps[l+2]*al2); + alm[l+1] = gen->alpha[il]*al1; + } + + const int nval=nv0*VLEN; + int ith=0; + int itgt[nval]; + while (ith=m) + { + itgt[nth] = ith; + d.s.csq[nth]=cth_[ith]*cth_[ith]; + d.s.sth[nth]=sth_[ith]; + ++nth; + } + else + { + int phas_idx = ith*job->s_th + mi*job->s_m; + job->phase[phas_idx] = job->phase[phas_idx+1] = 0; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; is_th + mi*job->s_m; + complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, + r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[tgt]) + job->phase[phas_idx+1] = r1-r2; + } + } + } + } + else + { + //adjust the a_lm for the new algorithm + if (job->nalm==2) + for (int l=gen->mhi; l<=gen->lmax+1; ++l) + { + job->almtmp[2*l ]*=gen->alpha[l]; + job->almtmp[2*l+1]*=gen->alpha[l]; + } + else + for (int l=gen->mhi; l<=gen->lmax+1; ++l) + job->almtmp[l]*=gen->alpha[l]; + + const int nval=nvx*VLEN; + int ith=0; + int itgt[nval]; + while (ith=m) + { + itgt[nth] = ith; + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; + ++nth; + } + else + { + int phas_idx = ith*job->s_th + mi*job->s_m; + job->phase[phas_idx ] = job->phase[phas_idx+1] = 0; + job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; itype==SHARP_ALM2MAP) ? + calc_alm2map_spin (job, gen, &d.v, nth) : + calc_alm2map_deriv1(job, gen, &d.v, nth); + for (int i=0; is_th + mi*job->s_m; + complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, + q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, + u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, + u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; + job->phase[phas_idx ] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[tgt]) + { + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + *phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } + } + } + } + } + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_MAP2ALM: + { + if (job->spin==0) + { + const int nval=nv0*VLEN; + int ith=0; + while (ith=m) + { + d.s.csq[nth]=cth_[ith]*cth_[ith]; d.s.sth[nth]=sth_[ith]; + int phas_idx = ith*job->s_th + mi*job->s_m; + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; + d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); + d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); + //adjust for new algorithm + d.s.p2r[nth]*=cth_[ith]; + d.s.p2i[nth]*=cth_[ith]; + ++nth; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; ialmtmp; + dcmplx alm2 = 0.; + double alold=0; + for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) + { + dcmplx al = alm[l]; + dcmplx al1 = (l+1>gen->lmax) ? 0. : alm[l+1]; + alm[l ] = gen->alpha[il]*gen->eps[l+1]*al + alold*gen->eps[l]*alm2; + alm[l+1] = gen->alpha[il]*al1; + alm2=al; + alold=gen->alpha[il]; + } + } + else + { + const int nval=nvx*VLEN; + int ith=0; + while (ith=m) + { + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; + int phas_idx = ith*job->s_th + mi*job->s_m; + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[ith] ? job->phase[phas_idx+1]:0., + p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); + d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); + d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); + d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); + ++nth; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; imhi; l<=gen->lmax; ++l) + { + job->almtmp[2*l ]*=gen->alpha[l]; + job->almtmp[2*l+1]*=gen->alpha[l]; + } + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +void XARCH(inner_loop) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + (job->type==SHARP_MAP2ALM) ? + inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : + inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); + } + +#undef VZERO + +int XARCH(sharp_veclen)(void) + { + return VLEN; + } + +int XARCH(sharp_max_nvec)(int spin) + { + return (spin==0) ? nv0 : nvx; + } + +#define xstr(a) str(a) +#define str(a) #a +const char *XARCH(sharp_architecture)(void) + { + return xstr(ARCH); + } diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h index 4c67cc8..a6b3120 100644 --- a/libsharp/sharp_internal.h +++ b/libsharp/sharp_internal.h @@ -68,5 +68,6 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth, int sharp_veclen(void); int sharp_max_nvec(int spin); +const char *sharp_architecture(void); #endif diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c index 1a05bdc..f22a91e 100644 --- a/libsharp/sharp_testsuite.c +++ b/libsharp/sharp_testsuite.c @@ -71,9 +71,6 @@ static void MPI_status(void) #endif } -static void vecmath_status(void) - { printf("Supported vector length: %d\n",sharp_veclen()); } - static void sharp_announce (const char *name) { size_t m, nlen=strlen(name); @@ -84,7 +81,8 @@ static void sharp_announce (const char *name) printf("+-"); for (m=0; m