runs as C++, no vector support yet

This commit is contained in:
Martin Reinecke 2020-01-03 17:22:31 +01:00
parent 54856313a5
commit 0378ce155a
17 changed files with 90 additions and 96 deletions

View file

@ -3,16 +3,16 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \ libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \ libsharp2/pocketfft.cc \
libsharp2/pocketfft.h \ libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \ libsharp2/sharp_utils.cc \
libsharp2/sharp_utils.h \ libsharp2/sharp_utils.h \
libsharp2/sharp.c \ libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.c \ libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.c \ libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.c \ libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.c \ libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.c \ libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \ libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \ libsharp2/sharp_vecsupport.h \
@ -26,23 +26,23 @@ libsharp2_la_SOURCES = \
# ==> age <= current # ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0 libsharp2_la_LDFLAGS = -version-info 0:0:0
AM_CFLAGS = @AM_CFLAGS@ AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.c libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +56,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h libsharp2/sharp_cxx.h
EXTRA_DIST = \ EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh TESTS = runtest.sh

View file

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99 AC_PROG_CXX
AC_OPENMP AX_CXX_COMPILE_STDCXX([11])
# add math library
LIBS="-lm"
AC_PROG_LIBTOOL AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2" PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl dnl
dnl Create pkgconfig .pc file. dnl Create pkgconfig .pc file.
dnl dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[]) AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS]) AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile]) AC_CONFIG_FILES([Makefile])
AC_OUTPUT AC_OUTPUT

View file

@ -34,8 +34,8 @@
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
typedef complex double dcmplx; typedef complex<double> dcmplx;
typedef complex float fcmplx; typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739; static const double sqrt_two = 1.414213562373095145474621858739;
@ -105,7 +105,7 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0; self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc. // FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr; // double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
} }
@ -120,12 +120,12 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
static int ringinfo_compare (const void *xa, const void *xb) static int ringinfo_compare (const void *xa, const void *xb)
{ {
const sharp_ringinfo *a=xa, *b=xb; const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
} }
static int ringpair_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb)
{ {
const sharp_ringpair *a=xa, *b=xb; const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph) if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -292,22 +292,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot) if (self->norot)
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
data[2*m]=creal(phase[m*pstride])*wgt; data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt; data[2*m+1]=phase[m*pstride].imag()*wgt;
} }
else else
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt; data[2*m]=tmp.real()*wgt;
data[2*m+1]=cimag(tmp)*wgt; data[2*m+1]=tmp.imag()*wgt;
} }
for (int m=2*(mmax+1); m<nph+2; ++m) for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.; data[m]=0.;
} }
else else
{ {
data[0]=creal(phase[0])*wgt; data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.); SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1; int idx1=1, idx2=nph-1;
@ -317,13 +317,13 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m]; if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2) if (idx1<(nph+2)/2)
{ {
data[2*idx1]+=creal(tmp); data[2*idx1]+=tmp.real();
data[2*idx1+1]+=cimag(tmp); data[2*idx1+1]+=tmp.imag();
} }
if (idx2<(nph+2)/2) if (idx2<(nph+2)/2)
{ {
data[2*idx2]+=creal(tmp); data[2*idx2]+=tmp.real();
data[2*idx2+1]-=cimag(tmp); data[2*idx2+1]-=tmp.imag();
} }
if (++idx1>=nph) idx1=0; if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1; if (--idx2<0) idx2=nph-1;
@ -357,11 +357,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{ {
if (self->norot) if (self->norot)
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else else
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
} }
else else
{ {
@ -370,9 +370,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph; int idx=m%nph;
dcmplx val; dcmplx val;
if (idx<(nph-idx)) if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot) if (!self->norot)
val *= self->shiftarr[m]; val *= self->shiftarr[m];
phase[m*pstride]=val; phase[m*pstride]=val;
@ -577,7 +577,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l]) COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else else
COPY_LOOP(float, fcmplx, x*job->norm_l[l]) COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
} }
} }
} }
@ -617,9 +617,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0) COPY_LOOP(double, double, x.real()*norm_m0)
else else
COPY_LOOP(float, float, crealf(x)*norm_m0) COPY_LOOP(float, float, x.real()*norm_m0)
} }
else else
{ {
@ -634,9 +634,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
} }
else else
{ {
@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
} }
} }
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -934,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS; job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.; job->time = 0.;
job->opcnt = 0; job->opcnt = 0;
job->alm=alm; job->alm=(void **)alm;
job->map=map; job->map=(void **)map;
} }
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

View file

@ -27,7 +27,7 @@
#define ARCH default #define ARCH default
#define GENERIC_ARCH #define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c" #include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH #undef GENERIC_ARCH
#undef ARCH #undef ARCH

View file

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b) #define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH) #define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h> #include <complex>
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include "libsharp2/sharp_vecsupport.h" #include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,9 @@
// Unfortunately, most compilers don't act on this pragma yet. // Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON #pragma STDC FP_CONTRACT ON
typedef complex double dcmplx; typedef complex<double> dcmplx;
inline double creal(const dcmplx &v) {return v.real(); }
inline double cimag(const dcmplx &v) {return v.imag(); }
#define nv0 (128/VLEN) #define nv0 (128/VLEN)
#define nvx (64/VLEN) #define nvx (64/VLEN)
@ -224,10 +226,10 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=vload(alm[l ].real()), ai1=vload(alm[l ].imag());
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=vload(alm[l+1].real()), ai2=vload(alm[l+1].imag());
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar3=vload(alm[l+2].real()), ai3=vload(alm[l+2].imag());
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); Tv ar4=vload(alm[l+3].real()), ai4=vload(alm[l+3].imag());
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
for (int i=0; i<nv0; ++i) for (int i=0; i<nv0; ++i)
@ -963,8 +965,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt]; d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt]; d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2; job->phase[phas_idx] = r1+r2;
if (ispair[tgt]) if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2; job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1029,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{ {
int tgt=itgt[i]; int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, q2(d.s.p2pr[i], d.s.p2pi[i]),
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, u1(d.s.p1mr[i], d.s.p1mi[i]),
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2; job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2; job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt]) if (ispair[tgt])

View file

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H #ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H #define SHARP2_INTERNAL_H
#ifdef __cplusplus #include <complex>
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include "libsharp2/sharp.h" #include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct typedef struct
{ {
sharp_jobtype type; sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map; void **map;
void **alm; void **alm;
int s_m, s_th; // strides in m and theta direction int s_m, s_th; // strides in m and theta direction
complex double *phase; complex<double> *phase;
double *norm_l; double *norm_l;
complex double *almtmp; complex<double> *almtmp;
const sharp_geom_info *ginfo; const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo; const sharp_alm_info *ainfo;
double time; double time;

View file

@ -28,7 +28,9 @@
#ifndef SHARP2_VECSUPPORT_H #ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H
#include <math.h> #include <cmath>
#include <complex>
using std::complex;
#ifndef VLEN #ifndef VLEN
@ -73,8 +75,8 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vallTrue(a) (a) #define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } { cc[0] += complex<double>(a,b); cc[1] += complex<double>(c,d); }
#endif #endif
@ -121,9 +123,9 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vallTrue(a) (_mm_movemask_pd(a)==3) #define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc) Tv d, complex<double> * restrict cc)
{ {
union {Tv v; _Complex double c; } u1, u2; union {Tv v; complex<double> c; } u1, u2;
#if defined(__SSE3__) #if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else #else
@ -167,13 +169,13 @@ typedef __m256d Tm;
#define vallTrue(a) (_mm256_movemask_pd(a)==15) #define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ {
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4; tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u; union {Tv v; complex<double> c[2]; } u;
u.v=tmp1; u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1]; cc[0]+=u.c[0]; cc[1]+=u.c[1];
} }
@ -209,7 +211,7 @@ typedef __mmask8 Tm;
#define vallTrue(a) (a==255) #define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ {
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);

View file

@ -26,7 +26,8 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <complex.h> #include <complex>
using std::complex;
#ifdef USE_MPI #ifdef USE_MPI
#include "mpi.h" #include "mpi.h"
#include "libsharp2/sharp_mpi.h" #include "libsharp2/sharp_mpi.h"
@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c" #include "libsharp2/sharp_utils.cc"
#include "test/memusage.h" #include "test/memusage.h"
static void OpenMP_status(void) static void OpenMP_status(void)
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1); exit(1);
} }
typedef complex double dcmplx; typedef complex<double> dcmplx;
int ntasks, mytask; int ntasks, mytask;
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{ {
double rv = drand(-1,1,&state); double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv; alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
} }
} }
} }
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0; sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j]) sqsum[i]+=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
alm[i][j]=-alm[i][j]; alm[i][j]=-alm[i][j];
} }
} }
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot; double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
double sqr=creal(alm[i][j])*creal(alm[i][j]) double sqr=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
sum+=sqr; sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr; if (sqr>maxdiff) maxdiff=sqr;
} }
@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms); ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i) for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j) for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I; alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP, sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL); NULL,NULL);