runs as C++, no vector support yet

This commit is contained in:
Martin Reinecke 2020-01-03 17:22:31 +01:00
parent 54856313a5
commit 0378ce155a
17 changed files with 90 additions and 96 deletions

View file

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line. configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system. (not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not (Usually this is already the default setting when OMP_NUM_THREADS is not
specified.) specified.)

View file

@ -3,16 +3,16 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \ libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \ libsharp2/pocketfft.cc \
libsharp2/pocketfft.h \ libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \ libsharp2/sharp_utils.cc \
libsharp2/sharp_utils.h \ libsharp2/sharp_utils.h \
libsharp2/sharp.c \ libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.c \ libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.c \ libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.c \ libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.c \ libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.c \ libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \ libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \ libsharp2/sharp_vecsupport.h \
@ -26,23 +26,23 @@ libsharp2_la_SOURCES = \
# ==> age <= current # ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0 libsharp2_la_LDFLAGS = -version-info 0:0:0
AM_CFLAGS = @AM_CFLAGS@ AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.c libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +56,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h libsharp2/sharp_cxx.h
EXTRA_DIST = \ EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh TESTS = runtest.sh

View file

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99 AC_PROG_CXX
AC_OPENMP AX_CXX_COMPILE_STDCXX([11])
# add math library
LIBS="-lm"
AC_PROG_LIBTOOL AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2" PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl dnl
dnl Create pkgconfig .pc file. dnl Create pkgconfig .pc file.
dnl dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[]) AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS]) AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile]) AC_CONFIG_FILES([Makefile])
AC_OUTPUT AC_OUTPUT

View file

@ -34,8 +34,8 @@
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
typedef complex double dcmplx; typedef complex<double> dcmplx;
typedef complex float fcmplx; typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739; static const double sqrt_two = 1.414213562373095145474621858739;
@ -105,7 +105,7 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0; self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc. // FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr; // double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
} }
@ -120,12 +120,12 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
static int ringinfo_compare (const void *xa, const void *xb) static int ringinfo_compare (const void *xa, const void *xb)
{ {
const sharp_ringinfo *a=xa, *b=xb; const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
} }
static int ringpair_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb)
{ {
const sharp_ringpair *a=xa, *b=xb; const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph) if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -292,22 +292,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot) if (self->norot)
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
data[2*m]=creal(phase[m*pstride])*wgt; data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt; data[2*m+1]=phase[m*pstride].imag()*wgt;
} }
else else
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt; data[2*m]=tmp.real()*wgt;
data[2*m+1]=cimag(tmp)*wgt; data[2*m+1]=tmp.imag()*wgt;
} }
for (int m=2*(mmax+1); m<nph+2; ++m) for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.; data[m]=0.;
} }
else else
{ {
data[0]=creal(phase[0])*wgt; data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.); SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1; int idx1=1, idx2=nph-1;
@ -317,13 +317,13 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m]; if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2) if (idx1<(nph+2)/2)
{ {
data[2*idx1]+=creal(tmp); data[2*idx1]+=tmp.real();
data[2*idx1+1]+=cimag(tmp); data[2*idx1+1]+=tmp.imag();
} }
if (idx2<(nph+2)/2) if (idx2<(nph+2)/2)
{ {
data[2*idx2]+=creal(tmp); data[2*idx2]+=tmp.real();
data[2*idx2+1]-=cimag(tmp); data[2*idx2+1]-=tmp.imag();
} }
if (++idx1>=nph) idx1=0; if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1; if (--idx2<0) idx2=nph-1;
@ -357,11 +357,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{ {
if (self->norot) if (self->norot)
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else else
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
} }
else else
{ {
@ -370,9 +370,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph; int idx=m%nph;
dcmplx val; dcmplx val;
if (idx<(nph-idx)) if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot) if (!self->norot)
val *= self->shiftarr[m]; val *= self->shiftarr[m];
phase[m*pstride]=val; phase[m*pstride]=val;
@ -577,7 +577,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l]) COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else else
COPY_LOOP(float, fcmplx, x*job->norm_l[l]) COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
} }
} }
} }
@ -617,9 +617,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0) COPY_LOOP(double, double, x.real()*norm_m0)
else else
COPY_LOOP(float, float, crealf(x)*norm_m0) COPY_LOOP(float, float, x.real()*norm_m0)
} }
else else
{ {
@ -634,9 +634,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
} }
else else
{ {
@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
} }
} }
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -934,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS; job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.; job->time = 0.;
job->opcnt = 0; job->opcnt = 0;
job->alm=alm; job->alm=(void **)alm;
job->map=map; job->map=(void **)map;
} }
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

View file

@ -27,7 +27,7 @@
#define ARCH default #define ARCH default
#define GENERIC_ARCH #define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c" #include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH #undef GENERIC_ARCH
#undef ARCH #undef ARCH

View file

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b) #define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH) #define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h> #include <complex>
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include "libsharp2/sharp_vecsupport.h" #include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,9 @@
// Unfortunately, most compilers don't act on this pragma yet. // Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON #pragma STDC FP_CONTRACT ON
typedef complex double dcmplx; typedef complex<double> dcmplx;
inline double creal(const dcmplx &v) {return v.real(); }
inline double cimag(const dcmplx &v) {return v.imag(); }
#define nv0 (128/VLEN) #define nv0 (128/VLEN)
#define nvx (64/VLEN) #define nvx (64/VLEN)
@ -224,10 +226,10 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=vload(alm[l ].real()), ai1=vload(alm[l ].imag());
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=vload(alm[l+1].real()), ai2=vload(alm[l+1].imag());
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar3=vload(alm[l+2].real()), ai3=vload(alm[l+2].imag());
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); Tv ar4=vload(alm[l+3].real()), ai4=vload(alm[l+3].imag());
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
for (int i=0; i<nv0; ++i) for (int i=0; i<nv0; ++i)
@ -963,8 +965,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt]; d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt]; d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2; job->phase[phas_idx] = r1+r2;
if (ispair[tgt]) if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2; job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1029,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{ {
int tgt=itgt[i]; int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, q2(d.s.p2pr[i], d.s.p2pi[i]),
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, u1(d.s.p1mr[i], d.s.p1mi[i]),
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2; job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2; job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt]) if (ispair[tgt])

View file

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H #ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H #define SHARP2_INTERNAL_H
#ifdef __cplusplus #include <complex>
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include "libsharp2/sharp.h" #include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct typedef struct
{ {
sharp_jobtype type; sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map; void **map;
void **alm; void **alm;
int s_m, s_th; // strides in m and theta direction int s_m, s_th; // strides in m and theta direction
complex double *phase; complex<double> *phase;
double *norm_l; double *norm_l;
complex double *almtmp; complex<double> *almtmp;
const sharp_geom_info *ginfo; const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo; const sharp_alm_info *ainfo;
double time; double time;

View file

@ -28,7 +28,9 @@
#ifndef SHARP2_VECSUPPORT_H #ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H
#include <math.h> #include <cmath>
#include <complex>
using std::complex;
#ifndef VLEN #ifndef VLEN
@ -73,8 +75,8 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vallTrue(a) (a) #define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } { cc[0] += complex<double>(a,b); cc[1] += complex<double>(c,d); }
#endif #endif
@ -121,9 +123,9 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vallTrue(a) (_mm_movemask_pd(a)==3) #define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc) Tv d, complex<double> * restrict cc)
{ {
union {Tv v; _Complex double c; } u1, u2; union {Tv v; complex<double> c; } u1, u2;
#if defined(__SSE3__) #if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else #else
@ -167,13 +169,13 @@ typedef __m256d Tm;
#define vallTrue(a) (_mm256_movemask_pd(a)==15) #define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ {
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4; tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u; union {Tv v; complex<double> c[2]; } u;
u.v=tmp1; u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1]; cc[0]+=u.c[0]; cc[1]+=u.c[1];
} }
@ -209,7 +211,7 @@ typedef __mmask8 Tm;
#define vallTrue(a) (a==255) #define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * restrict cc)
{ {
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);

View file

@ -26,7 +26,8 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <complex.h> #include <complex>
using std::complex;
#ifdef USE_MPI #ifdef USE_MPI
#include "mpi.h" #include "mpi.h"
#include "libsharp2/sharp_mpi.h" #include "libsharp2/sharp_mpi.h"
@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c" #include "libsharp2/sharp_utils.cc"
#include "test/memusage.h" #include "test/memusage.h"
static void OpenMP_status(void) static void OpenMP_status(void)
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1); exit(1);
} }
typedef complex double dcmplx; typedef complex<double> dcmplx;
int ntasks, mytask; int ntasks, mytask;
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{ {
double rv = drand(-1,1,&state); double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv; alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
} }
} }
} }
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0; sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j]) sqsum[i]+=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
alm[i][j]=-alm[i][j]; alm[i][j]=-alm[i][j];
} }
} }
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot; double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
double sqr=creal(alm[i][j])*creal(alm[i][j]) double sqr=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
sum+=sqr; sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr; if (sqr>maxdiff) maxdiff=sqr;
} }
@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms); ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i) for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j) for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I; alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP, sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL); NULL,NULL);