runs as C++, no vector support yet

This commit is contained in:
Martin Reinecke 2020-01-03 17:22:31 +01:00
parent 54856313a5
commit 0378ce155a
17 changed files with 90 additions and 96 deletions

View file

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not
(Usually this is already the default setting when OMP_NUM_THREADS is not
specified.)

View file

@ -3,16 +3,16 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \
libsharp2/pocketfft.cc \
libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \
libsharp2/sharp_utils.cc \
libsharp2/sharp_utils.h \
libsharp2/sharp.c \
libsharp2/sharp_almhelpers.c \
libsharp2/sharp_core.c \
libsharp2/sharp_geomhelpers.c \
libsharp2/sharp_legendre_roots.c \
libsharp2/sharp_ylmgen_c.c \
libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \
@ -26,23 +26,23 @@ libsharp2_la_SOURCES = \
# ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0
AM_CFLAGS = @AM_CFLAGS@
AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
libfma_la_SOURCES = libsharp2/sharp_core_inc.c
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +56,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h
EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh

View file

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99
AC_OPENMP
# add math library
LIBS="-lm"
AC_PROG_CXX
AX_CXX_COMPILE_STDCXX([11])
AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl
dnl Create pkgconfig .pc file.
dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile])
AC_OUTPUT

View file

@ -34,8 +34,8 @@
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h"
typedef complex double dcmplx;
typedef complex float fcmplx;
typedef complex<double> dcmplx;
typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739;
@ -105,7 +105,7 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
}
@ -120,12 +120,12 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
static int ringinfo_compare (const void *xa, const void *xb)
{
const sharp_ringinfo *a=xa, *b=xb;
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
}
static int ringpair_compare (const void *xa, const void *xb)
{
const sharp_ringpair *a=xa, *b=xb;
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -292,22 +292,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot)
for (int m=0; m<=mmax; ++m)
{
data[2*m]=creal(phase[m*pstride])*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt;
data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=phase[m*pstride].imag()*wgt;
}
else
for (int m=0; m<=mmax; ++m)
{
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt;
data[2*m+1]=cimag(tmp)*wgt;
data[2*m]=tmp.real()*wgt;
data[2*m+1]=tmp.imag()*wgt;
}
for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.;
}
else
{
data[0]=creal(phase[0])*wgt;
data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1;
@ -317,13 +317,13 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2)
{
data[2*idx1]+=creal(tmp);
data[2*idx1+1]+=cimag(tmp);
data[2*idx1]+=tmp.real();
data[2*idx1+1]+=tmp.imag();
}
if (idx2<(nph+2)/2)
{
data[2*idx2]+=creal(tmp);
data[2*idx2+1]-=cimag(tmp);
data[2*idx2]+=tmp.real();
data[2*idx2+1]-=tmp.imag();
}
if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1;
@ -357,11 +357,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{
if (self->norot)
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
}
else
{
@ -370,9 +370,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph;
dcmplx val;
if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot)
val *= self->shiftarr[m];
phase[m*pstride]=val;
@ -577,7 +577,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else
COPY_LOOP(float, fcmplx, x*job->norm_l[l])
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
}
}
}
@ -617,9 +617,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0)
COPY_LOOP(double, double, x.real()*norm_m0)
else
COPY_LOOP(float, float, crealf(x)*norm_m0)
COPY_LOOP(float, float, x.real()*norm_m0)
}
else
{
@ -634,9 +634,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
}
else
{
@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
}
}
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -934,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.;
job->opcnt = 0;
job->alm=alm;
job->map=map;
job->alm=(void **)alm;
job->map=(void **)map;
}
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

View file

@ -27,7 +27,7 @@
#define ARCH default
#define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c"
#include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH
#undef ARCH

View file

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h>
#include <complex>
#include <math.h>
#include <string.h>
#include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,9 @@
// Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON
typedef complex double dcmplx;
typedef complex<double> dcmplx;
inline double creal(const dcmplx &v) {return v.real(); }
inline double cimag(const dcmplx &v) {return v.imag(); }
#define nv0 (128/VLEN)
#define nvx (64/VLEN)
@ -224,10 +226,10 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv ar1=vload(alm[l ].real()), ai1=vload(alm[l ].imag());
Tv ar2=vload(alm[l+1].real()), ai2=vload(alm[l+1].imag());
Tv ar3=vload(alm[l+2].real()), ai3=vload(alm[l+2].imag());
Tv ar4=vload(alm[l+3].real()), ai4=vload(alm[l+3].imag());
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
for (int i=0; i<nv0; ++i)
@ -963,8 +965,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2;
if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1029,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{
int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2(d.s.p2pr[i], d.s.p2pi[i]),
u1(d.s.p1mr[i], d.s.p1mi[i]),
u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt])

View file

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include <complex>
#include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct
{
sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map;
void **alm;
int s_m, s_th; // strides in m and theta direction
complex double *phase;
complex<double> *phase;
double *norm_l;
complex double *almtmp;
complex<double> *almtmp;
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
double time;

View file

@ -28,7 +28,9 @@
#ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H
#include <math.h>
#include <cmath>
#include <complex>
using std::complex;
#ifndef VLEN
@ -73,8 +75,8 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
complex<double> * restrict cc)
{ cc[0] += complex<double>(a,b); cc[1] += complex<double>(c,d); }
#endif
@ -121,9 +123,9 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
Tv d, complex<double> * restrict cc)
{
union {Tv v; _Complex double c; } u1, u2;
union {Tv v; complex<double> c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
@ -167,13 +169,13 @@ typedef __m256d Tm;
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
complex<double> * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
union {Tv v; complex<double> c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}
@ -209,7 +211,7 @@ typedef __mmask8 Tm;
#define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
complex<double> * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);

View file

@ -26,7 +26,8 @@
#include <stdio.h>
#include <string.h>
#include <complex.h>
#include <complex>
using std::complex;
#ifdef USE_MPI
#include "mpi.h"
#include "libsharp2/sharp_mpi.h"
@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c"
#include "libsharp2/sharp_utils.cc"
#include "test/memusage.h"
static void OpenMP_status(void)
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1);
}
typedef complex double dcmplx;
typedef complex<double> dcmplx;
int ntasks, mytask;
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{
double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
}
}
}
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j)
{
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
sqsum[i]+=norm(alm[i][j]);
alm[i][j]=-alm[i][j];
}
}
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j)
{
double sqr=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
double sqr=norm(alm[i][j]);
sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr;
}
@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);