diff --git a/Makefile.am b/Makefile.am index 7a035ba..c738f29 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,7 +11,6 @@ src_sharp = \ libsharp/sharp_almhelpers.c \ libsharp/sharp_announce.c \ libsharp/sharp_core.c \ - libsharp/sharp_core_avx.c \ libsharp/sharp_geomhelpers.c \ libsharp/sharp_legendre_roots.c \ libsharp/sharp_ylmgen_c.c \ @@ -32,10 +31,6 @@ include_HEADERS = \ libsharp/sharp_almhelpers.h \ libsharp/sharp_cxx.h -EXTRA_DIST = \ - libsharp/sharp_core_inc0.c \ - libsharp/sharp_core_inc.c - libsharp_la_SOURCES = $(src_sharp) check_PROGRAMS = sharp_testsuite diff --git a/libsharp/sharp.c b/libsharp/sharp.c index 943bd79..f312fc3 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -36,7 +36,6 @@ #include "sharp_internal.h" #include "c_utils.h" #include "sharp_core.h" -#include "sharp_vecutil.h" #include "walltime_c.h" #include "sharp_almhelpers.h" #include "sharp_geomhelpers.h" @@ -854,7 +853,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job) init_output (job); int nchunks, chunksize; - get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize); + get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(),&nchunks,&chunksize); //FIXME: needs to be changed to "nm" alloc_phase (job,mmax+1,chunksize); diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 1d6618d..7038b38 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -29,46 +29,512 @@ * \author Martin Reinecke */ -#define ARCH _default -#include "sharp_core_inc0.c" -#undef ARCH +#include +#include +#include +#include "sharp_vecsupport.h" +#include "sharp_complex_hacks.h" +#include "sharp.h" +#include "sharp_core.h" +#include "c_utils.h" -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +typedef complex double dcmplx; -static int have_avx(void) +#define nvec (128/VLEN) +typedef struct + { Tv v[nvec]; } Tb; + +typedef union + { Tb b; double s[VLEN*nvec]; } Tbu; + +typedef struct + { Tb r, i; } Tbri; + +typedef struct + { Tb qr, qi, ur, ui; } Tbqu; + +typedef struct + { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri; + +typedef struct + { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu; + +typedef union + { Tbri b; Tsri s; } Tburi; + +typedef union + { Tbqu b; Tsqu s; } Tbuqu; + +static inline Tb Tbconst(double val) { - static int res=-1; - if (res<0) - { - __builtin_cpu_init(); - res = __builtin_cpu_supports("avx"); - } + Tv v=vload(val); + Tb res; + for (int i=0; iv[i],v); } -void inner_loop (sharp_job *job, const int *ispair,const double *cth, - const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, - const int *mlim) +static inline Tb Tbprod(Tb a, Tb b) + { Tb r; for (int i=0; iv[i],b.v[i]); } + +static void Tbnormalize (Tb * restrict val, Tb * restrict scale, + double maxval) { -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - if (have_avx()) - inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim); - else -#endif - inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim); + const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); + const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); + for (int i=0;iv[i]),vfmax); + while (vanyTrue(mask)) + { + vmuleq_mask(mask,val->v[i],vfsmall); + vaddeq_mask(mask,scale->v[i],vone); + mask = vgt(vabs(val->v[i]),vfmax); + } + mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); + while (vanyTrue(mask)) + { + vmuleq_mask(mask,val->v[i],vfbig); + vsubeq_mask(mask,scale->v[i],vone); + mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); + } + } } +NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit, + Tb * restrict resd, Tb * restrict ress) + { + Tv vminv=vload(powlimit[npow]); + Tm mask = vlt(vabs(val.v[0]),vminv); + for (int i=1;i>=1); + *resd=res; + *ress=Tbconst(0.); + } + else + { + Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.); + Tbnormalize(&val,&scaleint,sharp_fbighalf); + do + { + if (npow&1) + { + for (int i=0; i>=1); + *resd=res; + *ress=scale; + } + } + +static inline int rescale(Tb * restrict lam1, Tb * restrict lam2, + Tb * restrict scale) + { + int did_scale=0; + for (int i=0;iv[i]),vload(sharp_ftol)); + if (vanyTrue(mask)) + { + did_scale=1; + vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall)); + vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall)); + vaddeq_mask(mask,scale->v[i],vone); + } + } + return did_scale; + } + +static inline int TballLt(Tb a,double b) + { + Tv vb=vload(b); + Tm res=vlt(a.v[0],vb); + for (int i=1; im; + Tb lam_1=Tbconst(0.), lam_2, scale; + mypow(sth,l,gen->powlimit,&lam_2,&scale); + Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); + Tbnormalize(&lam_2,&scale,sharp_ftol); + + int below_limit = TballLt(scale,sharp_limscale); + while (below_limit) + { + if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} + for (int i=0; irf[l].f[0])*(cth.v[i]*lam_2.v[i]) + - vload(gen->rf[l].f[1])*lam_1.v[i]; + lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) + - vload(gen->rf[l+1].f[1])*lam_2.v[i]; + } + if (rescale(&lam_1,&lam_2,&scale)) + below_limit = TballLt(scale,sharp_limscale); + l+=2; + } + *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale; + } + + +NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1, + Tbri * restrict p2, Tb lam_1, Tb lam_2, + const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, + int l, int lmax) + { + while (l<=lmax) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), + f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); + for (int i=0; ir.v[i] += lam_2.v[i]*ar1; + p1->i.v[i] += lam_2.v[i]*ai1; + lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; + p2->r.v[i] += lam_1.v[i]*ar2; + p2->i.v[i] += lam_1.v[i]*ai2; + } + l+=2; + } + } + +NOINLINE static void map2alm_kernel (const Tb cth, + const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2, + const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) + { + while (l<=lmax) + { + Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), + f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); + for (int i=0; ir.v[i]); + vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); + lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; + vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]); + vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); + } + l+=2; + } + } + +NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1, + Tbri * restrict p2) + { + int l,lmax=gen->lmax; + Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; + iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; + + Tb corfac; + getCorfac(scale,&corfac,gen->cf); + const sharp_ylmgen_dbl2 * restrict rf = gen->rf; + const dcmplx * restrict alm=job->almtmp; + int full_ieee = TballGe(scale,sharp_minscale); + while (!full_ieee) + { + { + Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); + for (int i=0; ir.v[i],tmp,ar); + vfmaeq(p1->i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ir.v[i],tmp,ar); + vfmaeq(p2->i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = TballGe(scale,sharp_minscale); + } + } + if (l>lmax) return; + + Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); + alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); + } + +NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1, + const Tbri * restrict p2, Tv *restrict atmp) + { + int lmax=gen->lmax; + Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; + int l=gen->m; + iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 8*VLEN*nvec; + + const sharp_ylmgen_dbl2 * restrict rf = gen->rf; + Tb corfac; + getCorfac(scale,&corfac,gen->cf); + int full_ieee = TballGe(scale,sharp_minscale); + while (!full_ieee) + { + for (int i=0; ir.v[i]; + atmp[2*l+1]+=tmp*p1->i.v[i]; + } + if (++l>lmax) return; + for (int i=0; ir.v[i]; + atmp[2*l+1]+=tmp*p2->i.v[i]; + } + if (++l>lmax) return; + for (int i=0; icf); + full_ieee = TballGe(scale,sharp_minscale); + } + } + + Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); + map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); + } + + +#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) + +NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_ALM2MAP: + case SHARP_ALM2MAP_DERIV1: + { + if (job->spin==0) + { + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + } + if (!skip) + calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b); + + for (int i=0; is_th + mi*job->s_m; + complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, + r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[itot]) + job->phase[phas_idx+1] = r1-r2; + } + } + } + } + else + { + UTIL_FAIL("only spin==0 allowed at the moment"); + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { + case SHARP_MAP2ALM: + { + if (job->spin==0) + { + Tv atmp[2*(gen->lmax+2)]; + memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv)); + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + if (mlim[itot]>=m) skip=0; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + if ((i+ith=m)) + { + int phas_idx = itot*job->s_th + mi*job->s_m; + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; + p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2); + p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2); + } + } + if (!skip) + calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp); + } + { + int istart=m, istop=gen->lmax+1; + for(; istartalmtmp[istart])); + for(; istartalmtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]); + } + } + else + { + UTIL_FAIL("only spin==0 allowed at the moment"); + } + break; + } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +void inner_loop (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim) + { + (job->type==SHARP_MAP2ALM) ? + inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : + inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); + } + +#undef VZERO +#undef nvec + int sharp_veclen(void) { -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - if (have_avx()) - return 4; - else -#endif - return VLEN; + return VLEN; + } + +int sharp_max_nvec(void) + { + return 128/VLEN; } diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h index f641125..a9e509b 100644 --- a/libsharp/sharp_core.h +++ b/libsharp/sharp_core.h @@ -44,6 +44,7 @@ void inner_loop (sharp_job *job, const int *ispair,const double *cth, const int *mlim); int sharp_veclen(void); +int sharp_max_nvec(void); #ifdef __cplusplus } diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c deleted file mode 100644 index 79f1e79..0000000 --- a/libsharp/sharp_core_avx.c +++ /dev/null @@ -1,10 +0,0 @@ -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) -// if we arrive here, we can benefit from an additional AVX version -// #warning entering gcc and x86_64 specific code branch - -#define ARCH _avx -#pragma GCC target("avx") -#include "sharp_core_inc0.c" -#undef ARCH - -#endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c deleted file mode 100644 index dd3ac4b..0000000 --- a/libsharp/sharp_core_inc.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ - -/*! \file sharp_core_inc.c - * Type-dependent code for the computational core - * - * Copyright (C) 2012-2017 Max-Planck-Society - * \author Martin Reinecke - */ - -typedef struct - { Tv v[nvec]; } Tb; - -typedef union - { Tb b; double s[VLEN*nvec]; } Tbu; - -typedef struct - { Tb r, i; } Tbri; - -typedef struct - { Tb qr, qi, ur, ui; } Tbqu; - -typedef struct - { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri; - -typedef struct - { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu; - -typedef union - { Tbri b; Tsri s; } Tburi; - -typedef union - { Tbqu b; Tsqu s; } Tbuqu; - -static inline Tb Tbconst(double val) - { - Tv v=vload(val); - Tb res; - for (int i=0; iv[i],v); } - -static inline Tb Tbprod(Tb a, Tb b) - { Tb r; for (int i=0; iv[i],b.v[i]); } - -static void Tbnormalize (Tb * restrict val, Tb * restrict scale, - double maxval) - { - const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); - const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); - for (int i=0;iv[i]),vfmax); - while (vanyTrue(mask)) - { - vmuleq_mask(mask,val->v[i],vfsmall); - vaddeq_mask(mask,scale->v[i],vone); - mask = vgt(vabs(val->v[i]),vfmax); - } - mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); - while (vanyTrue(mask)) - { - vmuleq_mask(mask,val->v[i],vfbig); - vsubeq_mask(mask,scale->v[i],vone); - mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero)); - } - } - } - -NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit, - Tb * restrict resd, Tb * restrict ress) - { - Tv vminv=vload(powlimit[npow]); - Tm mask = vlt(vabs(val.v[0]),vminv); - for (int i=1;i>=1); - *resd=res; - *ress=Tbconst(0.); - } - else - { - Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.); - Tbnormalize(&val,&scaleint,sharp_fbighalf); - do - { - if (npow&1) - { - for (int i=0; i>=1); - *resd=res; - *ress=scale; - } - } - -static inline int rescale(Tb * restrict lam1, Tb * restrict lam2, - Tb * restrict scale) - { - int did_scale=0; - for (int i=0;iv[i]),vload(sharp_ftol)); - if (vanyTrue(mask)) - { - did_scale=1; - vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall)); - vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall)); - vaddeq_mask(mask,scale->v[i],vone); - } - } - return did_scale; - } - -static inline int TballLt(Tb a,double b) - { - Tv vb=vload(b); - Tm res=vlt(a.v[0],vb); - for (int i=1; im; - Tb lam_1=Tbconst(0.), lam_2, scale; - mypow(sth,l,gen->powlimit,&lam_2,&scale); - Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Tbnormalize(&lam_2,&scale,sharp_ftol); - - int below_limit = TballLt(scale,sharp_limscale); - while (below_limit) - { - if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} - for (int i=0; irf[l].f[0])*(cth.v[i]*lam_2.v[i]) - - vload(gen->rf[l].f[1])*lam_1.v[i]; - lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) - - vload(gen->rf[l+1].f[1])*lam_2.v[i]; - } - if (rescale(&lam_1,&lam_2,&scale)) - below_limit = TballLt(scale,sharp_limscale); - l+=2; - } - *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale; - } - - -NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1, - Tbri * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, - int l, int lmax) - { - while (l<=lmax) - { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), - f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); - for (int i=0; ir.v[i] += lam_2.v[i]*ar1; - p1->i.v[i] += lam_2.v[i]*ai1; - lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; - p2->r.v[i] += lam_1.v[i]*ar2; - p2->i.v[i] += lam_1.v[i]*ai2; - } - l+=2; - } - } - -NOINLINE static void map2alm_kernel (const Tb cth, - const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) - { - while (l<=lmax) - { - Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), - f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); - for (int i=0; ir.v[i]); - vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]); - lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i]; - vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]); - vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]); - } - l+=2; - } - } - -NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1, - Tbri * restrict p2) - { - int l,lmax=gen->lmax; - Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; - iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 8*VLEN*nvec; - - Tb corfac; - getCorfac(scale,&corfac,gen->cf); - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - const dcmplx * restrict alm=job->almtmp; - int full_ieee = TballGe(scale,sharp_minscale); - while (!full_ieee) - { - { - Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l])); - for (int i=0; ir.v[i],tmp,ar); - vfmaeq(p1->i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); - for (int i=0; ir.v[i],tmp,ar); - vfmaeq(p2->i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); - for (int i=0; icf); - full_ieee = TballGe(scale,sharp_minscale); - } - } - if (l>lmax) return; - - Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); - alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); - } - -NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1, - const Tbri * restrict p2, Tv *restrict atmp) - { - int lmax=gen->lmax; - Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; - int l=gen->m; - iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) return; - job->opcnt += (lmax+1-l) * 8*VLEN*nvec; - - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - Tb corfac; - getCorfac(scale,&corfac,gen->cf); - int full_ieee = TballGe(scale,sharp_minscale); - while (!full_ieee) - { - for (int i=0; ir.v[i]; - atmp[2*l+1]+=tmp*p1->i.v[i]; - } - if (++l>lmax) return; - for (int i=0; ir.v[i]; - atmp[2*l+1]+=tmp*p2->i.v[i]; - } - if (++l>lmax) return; - for (int i=0; icf); - full_ieee = TballGe(scale,sharp_minscale); - } - } - - Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); - map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); - } - - -#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) - -NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - const int nval=nvec*VLEN; - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_ALM2MAP: - case SHARP_ALM2MAP_DERIV1: - { - if (job->spin==0) - { - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - } - if (!skip) - calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b); - - for (int i=0; is_th + mi*job->s_m; - complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, - r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; - job->phase[phas_idx] = r1+r2; - if (ispair[itot]) - job->phase[phas_idx+1] = r1-r2; - } - } - } - } - else - { - UTIL_FAIL("only spin==0 allowed at the moment"); - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } - } - -NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - const int nval=nvec*VLEN; - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_MAP2ALM: - { - if (job->spin==0) - { - Tv atmp[2*(gen->lmax+2)]; - memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv)); - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - if (mlim[itot]>=m) skip=0; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - if ((i+ith=m)) - { - int phas_idx = itot*job->s_th + mi*job->s_m; - dcmplx ph1=job->phase[phas_idx]; - dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; - p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2); - p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2); - } - } - if (!skip) - calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp); - } - { - int istart=m, istop=gen->lmax+1; - for(; istartalmtmp[istart])); - for(; istartalmtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]); - } - } - else - { - UTIL_FAIL("only spin==0 allowed at the moment"); - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } - } - -static void inner_loop_ (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *mlim) - { - (job->type==SHARP_MAP2ALM) ? - inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : - inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); - } - -#undef VZERO diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c deleted file mode 100644 index b209cae..0000000 --- a/libsharp/sharp_core_inc0.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ - -/*! \file sharp_core_inc0.c - * Computational core - * - * Copyright (C) 2012-2018 Max-Planck-Society - * \author Martin Reinecke - */ - -#include -#include -#include -#include "sharp_vecsupport.h" -#include "sharp_complex_hacks.h" -#include "sharp.h" -#include "sharp_core.h" -#include "c_utils.h" - -typedef complex double dcmplx; - -#define XCONCATX(a,b) a##b -#define CONCATX(a,b) XCONCATX(a,b) - -#define nvec 6 -#define Y(arg) arg -#include "sharp_core_inc.c" - -#undef Y -#undef nvec - -void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth, - const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, - const int *mlim) - { - inner_loop_(job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - } diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c index 26b9f92..4b124af 100644 --- a/libsharp/sharp_testsuite.c +++ b/libsharp/sharp_testsuite.c @@ -44,7 +44,6 @@ #include "c_utils.h" #include "sharp_announce.h" #include "memusage.h" -#include "sharp_vecsupport.h" typedef complex double dcmplx; @@ -597,7 +596,7 @@ static void sharp_test (int argc, const char **argv) if (mytask==0) printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f" " %9.2f %6.2f %.2e %.2e\n", - getenv("HOST"),argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2, + getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2, t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20), 100.*(1.-iosize/tmem),maxerel,maxeabs);