runs as C++, no vector support yet
This commit is contained in:
parent
54856313a5
commit
0378ce155a
17 changed files with 90 additions and 96 deletions
2
COMPILE
2
COMPILE
|
@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
|
||||||
configure command line.
|
configure command line.
|
||||||
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
|
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
|
||||||
(not physical cores) of the system.
|
(not physical cores) of the system.
|
||||||
(Usually this is already the default setting when OMP_NUM_THREADS is not
|
(Usually this is already the default setting when OMP_NUM_THREADS is not
|
||||||
specified.)
|
specified.)
|
||||||
|
|
||||||
|
|
||||||
|
|
42
Makefile.am
42
Makefile.am
|
@ -3,16 +3,16 @@ ACLOCAL_AMFLAGS = -I m4
|
||||||
lib_LTLIBRARIES = libsharp2.la
|
lib_LTLIBRARIES = libsharp2.la
|
||||||
|
|
||||||
libsharp2_la_SOURCES = \
|
libsharp2_la_SOURCES = \
|
||||||
libsharp2/pocketfft.c \
|
libsharp2/pocketfft.cc \
|
||||||
libsharp2/pocketfft.h \
|
libsharp2/pocketfft.h \
|
||||||
libsharp2/sharp_utils.c \
|
libsharp2/sharp_utils.cc \
|
||||||
libsharp2/sharp_utils.h \
|
libsharp2/sharp_utils.h \
|
||||||
libsharp2/sharp.c \
|
libsharp2/sharp.cc \
|
||||||
libsharp2/sharp_almhelpers.c \
|
libsharp2/sharp_almhelpers.cc \
|
||||||
libsharp2/sharp_core.c \
|
libsharp2/sharp_core.cc \
|
||||||
libsharp2/sharp_geomhelpers.c \
|
libsharp2/sharp_geomhelpers.cc \
|
||||||
libsharp2/sharp_legendre_roots.c \
|
libsharp2/sharp_legendre_roots.cc \
|
||||||
libsharp2/sharp_ylmgen_c.c \
|
libsharp2/sharp_ylmgen_c.cc \
|
||||||
libsharp2/sharp_internal.h \
|
libsharp2/sharp_internal.h \
|
||||||
libsharp2/sharp_legendre_roots.h \
|
libsharp2/sharp_legendre_roots.h \
|
||||||
libsharp2/sharp_vecsupport.h \
|
libsharp2/sharp_vecsupport.h \
|
||||||
|
@ -26,23 +26,23 @@ libsharp2_la_SOURCES = \
|
||||||
# ==> age <= current
|
# ==> age <= current
|
||||||
libsharp2_la_LDFLAGS = -version-info 0:0:0
|
libsharp2_la_LDFLAGS = -version-info 0:0:0
|
||||||
|
|
||||||
AM_CFLAGS = @AM_CFLAGS@
|
AM_CXXFLAGS = @AM_CXXFLAGS@
|
||||||
|
|
||||||
if HAVE_MULTIARCH
|
if HAVE_MULTIARCH
|
||||||
|
|
||||||
libavx_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libfma_la_SOURCES = libsharp2/sharp_core_inc.c
|
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
|
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
|
|
||||||
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
|
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
|
||||||
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
|
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
|
||||||
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
|
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
|
||||||
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
|
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
|
||||||
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
|
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
|
||||||
|
|
||||||
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
|
@ -56,10 +56,10 @@ nobase_include_HEADERS = \
|
||||||
libsharp2/sharp_cxx.h
|
libsharp2/sharp_cxx.h
|
||||||
|
|
||||||
EXTRA_DIST = \
|
EXTRA_DIST = \
|
||||||
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
|
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
|
||||||
|
|
||||||
check_PROGRAMS = sharp2_testsuite
|
check_PROGRAMS = sharp2_testsuite
|
||||||
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
|
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
|
||||||
sharp2_testsuite_LDADD = libsharp2.la
|
sharp2_testsuite_LDADD = libsharp2.la
|
||||||
|
|
||||||
TESTS = runtest.sh
|
TESTS = runtest.sh
|
||||||
|
|
15
configure.ac
15
configure.ac
|
@ -20,28 +20,21 @@ dnl
|
||||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
||||||
|
|
||||||
|
|
||||||
AC_PROG_CC_C99
|
AC_PROG_CXX
|
||||||
AC_OPENMP
|
AX_CXX_COMPILE_STDCXX([11])
|
||||||
|
|
||||||
# add math library
|
|
||||||
LIBS="-lm"
|
|
||||||
|
|
||||||
AC_PROG_LIBTOOL
|
AC_PROG_LIBTOOL
|
||||||
|
|
||||||
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
|
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
|
||||||
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
|
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
|
||||||
|
|
||||||
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
|
|
||||||
|
|
||||||
PACKAGE_LIBS="-lsharp2"
|
PACKAGE_LIBS="-lsharp2"
|
||||||
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
|
|
||||||
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
|
|
||||||
|
|
||||||
dnl
|
dnl
|
||||||
dnl Create pkgconfig .pc file.
|
dnl Create pkgconfig .pc file.
|
||||||
dnl
|
dnl
|
||||||
AX_CREATE_PKGCONFIG_INFO(,,,,[])
|
AX_CREATE_PKGCONFIG_INFO(,,,,[])
|
||||||
AC_SUBST([AM_CFLAGS])
|
AC_SUBST([AM_CXXFLAGS])
|
||||||
|
|
||||||
AC_CONFIG_FILES([Makefile])
|
AC_CONFIG_FILES([Makefile])
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
|
|
@ -34,8 +34,8 @@
|
||||||
#include "libsharp2/sharp_almhelpers.h"
|
#include "libsharp2/sharp_almhelpers.h"
|
||||||
#include "libsharp2/sharp_geomhelpers.h"
|
#include "libsharp2/sharp_geomhelpers.h"
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
typedef complex float fcmplx;
|
typedef complex<float> fcmplx;
|
||||||
|
|
||||||
static const double sqrt_one_half = 0.707106781186547572737310929369;
|
static const double sqrt_one_half = 0.707106781186547572737310929369;
|
||||||
static const double sqrt_two = 1.414213562373095145474621858739;
|
static const double sqrt_two = 1.414213562373095145474621858739;
|
||||||
|
@ -105,7 +105,7 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
|
||||||
self->phi0_ = phi0;
|
self->phi0_ = phi0;
|
||||||
// FIXME: improve this by using sincos2pibyn(nph) etc.
|
// FIXME: improve this by using sincos2pibyn(nph) etc.
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
|
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
|
||||||
// double *tmp=(double *) self->shiftarr;
|
// double *tmp=(double *) self->shiftarr;
|
||||||
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
|
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
|
||||||
}
|
}
|
||||||
|
@ -120,12 +120,12 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
|
||||||
|
|
||||||
static int ringinfo_compare (const void *xa, const void *xb)
|
static int ringinfo_compare (const void *xa, const void *xb)
|
||||||
{
|
{
|
||||||
const sharp_ringinfo *a=xa, *b=xb;
|
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
|
||||||
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
|
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
|
||||||
}
|
}
|
||||||
static int ringpair_compare (const void *xa, const void *xb)
|
static int ringpair_compare (const void *xa, const void *xb)
|
||||||
{
|
{
|
||||||
const sharp_ringpair *a=xa, *b=xb;
|
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
|
||||||
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
|
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
|
||||||
if (a->r1.nph==b->r1.nph)
|
if (a->r1.nph==b->r1.nph)
|
||||||
return (a->r1.phi0 < b->r1.phi0) ? -1 :
|
return (a->r1.phi0 < b->r1.phi0) ? -1 :
|
||||||
|
@ -292,22 +292,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
||||||
if (self->norot)
|
if (self->norot)
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
{
|
{
|
||||||
data[2*m]=creal(phase[m*pstride])*wgt;
|
data[2*m]=phase[m*pstride].real()*wgt;
|
||||||
data[2*m+1]=cimag(phase[m*pstride])*wgt;
|
data[2*m+1]=phase[m*pstride].imag()*wgt;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
{
|
{
|
||||||
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
|
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
|
||||||
data[2*m]=creal(tmp)*wgt;
|
data[2*m]=tmp.real()*wgt;
|
||||||
data[2*m+1]=cimag(tmp)*wgt;
|
data[2*m+1]=tmp.imag()*wgt;
|
||||||
}
|
}
|
||||||
for (int m=2*(mmax+1); m<nph+2; ++m)
|
for (int m=2*(mmax+1); m<nph+2; ++m)
|
||||||
data[m]=0.;
|
data[m]=0.;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
data[0]=creal(phase[0])*wgt;
|
data[0]=phase[0].real()*wgt;
|
||||||
SET_ARRAY(data,1,nph+2,0.);
|
SET_ARRAY(data,1,nph+2,0.);
|
||||||
|
|
||||||
int idx1=1, idx2=nph-1;
|
int idx1=1, idx2=nph-1;
|
||||||
|
@ -317,13 +317,13 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
||||||
if(!self->norot) tmp*=self->shiftarr[m];
|
if(!self->norot) tmp*=self->shiftarr[m];
|
||||||
if (idx1<(nph+2)/2)
|
if (idx1<(nph+2)/2)
|
||||||
{
|
{
|
||||||
data[2*idx1]+=creal(tmp);
|
data[2*idx1]+=tmp.real();
|
||||||
data[2*idx1+1]+=cimag(tmp);
|
data[2*idx1+1]+=tmp.imag();
|
||||||
}
|
}
|
||||||
if (idx2<(nph+2)/2)
|
if (idx2<(nph+2)/2)
|
||||||
{
|
{
|
||||||
data[2*idx2]+=creal(tmp);
|
data[2*idx2]+=tmp.real();
|
||||||
data[2*idx2+1]-=cimag(tmp);
|
data[2*idx2+1]-=tmp.imag();
|
||||||
}
|
}
|
||||||
if (++idx1>=nph) idx1=0;
|
if (++idx1>=nph) idx1=0;
|
||||||
if (--idx2<0) idx2=nph-1;
|
if (--idx2<0) idx2=nph-1;
|
||||||
|
@ -357,11 +357,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
{
|
{
|
||||||
if (self->norot)
|
if (self->norot)
|
||||||
for (int m=0; m<=maxidx; ++m)
|
for (int m=0; m<=maxidx; ++m)
|
||||||
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
|
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
|
||||||
else
|
else
|
||||||
for (int m=0; m<=maxidx; ++m)
|
for (int m=0; m<=maxidx; ++m)
|
||||||
phase[m*pstride] =
|
phase[m*pstride] =
|
||||||
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
|
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -370,9 +370,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
int idx=m%nph;
|
int idx=m%nph;
|
||||||
dcmplx val;
|
dcmplx val;
|
||||||
if (idx<(nph-idx))
|
if (idx<(nph-idx))
|
||||||
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
|
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
|
||||||
else
|
else
|
||||||
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
|
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
|
||||||
if (!self->norot)
|
if (!self->norot)
|
||||||
val *= self->shiftarr[m];
|
val *= self->shiftarr[m];
|
||||||
phase[m*pstride]=val;
|
phase[m*pstride]=val;
|
||||||
|
@ -577,7 +577,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
|
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, fcmplx, x*job->norm_l[l])
|
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -617,9 +617,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
if (m==0)
|
if (m==0)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, double, creal(x)*norm_m0)
|
COPY_LOOP(double, double, x.real()*norm_m0)
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, float, crealf(x)*norm_m0)
|
COPY_LOOP(float, float, x.real()*norm_m0)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -634,9 +634,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
if (m==0)
|
if (m==0)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
|
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
|
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
|
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
|
||||||
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
|
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
|
||||||
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
|
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
||||||
|
@ -934,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
|
||||||
job->flags|=SHARP_REAL_HARMONICS;
|
job->flags|=SHARP_REAL_HARMONICS;
|
||||||
job->time = 0.;
|
job->time = 0.;
|
||||||
job->opcnt = 0;
|
job->opcnt = 0;
|
||||||
job->alm=alm;
|
job->alm=(void **)alm;
|
||||||
job->map=map;
|
job->map=(void **)map;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
|
@ -27,7 +27,7 @@
|
||||||
|
|
||||||
#define ARCH default
|
#define ARCH default
|
||||||
#define GENERIC_ARCH
|
#define GENERIC_ARCH
|
||||||
#include "libsharp2/sharp_core_inc.c"
|
#include "libsharp2/sharp_core_inc.cc"
|
||||||
#undef GENERIC_ARCH
|
#undef GENERIC_ARCH
|
||||||
#undef ARCH
|
#undef ARCH
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||||
|
|
||||||
#include <complex.h>
|
#include <complex>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "libsharp2/sharp_vecsupport.h"
|
#include "libsharp2/sharp_vecsupport.h"
|
||||||
|
@ -49,7 +49,9 @@
|
||||||
// Unfortunately, most compilers don't act on this pragma yet.
|
// Unfortunately, most compilers don't act on this pragma yet.
|
||||||
#pragma STDC FP_CONTRACT ON
|
#pragma STDC FP_CONTRACT ON
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
|
inline double creal(const dcmplx &v) {return v.real(); }
|
||||||
|
inline double cimag(const dcmplx &v) {return v.imag(); }
|
||||||
|
|
||||||
#define nv0 (128/VLEN)
|
#define nv0 (128/VLEN)
|
||||||
#define nvx (64/VLEN)
|
#define nvx (64/VLEN)
|
||||||
|
@ -224,10 +226,10 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
||||||
{
|
{
|
||||||
for (; l<=lmax-2; il+=2, l+=4)
|
for (; l<=lmax-2; il+=2, l+=4)
|
||||||
{
|
{
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
Tv ar1=vload(alm[l ].real()), ai1=vload(alm[l ].imag());
|
||||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
Tv ar2=vload(alm[l+1].real()), ai2=vload(alm[l+1].imag());
|
||||||
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
|
Tv ar3=vload(alm[l+2].real()), ai3=vload(alm[l+2].imag());
|
||||||
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
|
Tv ar4=vload(alm[l+3].real()), ai4=vload(alm[l+3].imag());
|
||||||
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
|
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
|
||||||
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
|
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
|
||||||
for (int i=0; i<nv0; ++i)
|
for (int i=0; i<nv0; ++i)
|
||||||
|
@ -963,8 +965,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
d.s.p2r[i]*=cth_[tgt];
|
d.s.p2r[i]*=cth_[tgt];
|
||||||
d.s.p2i[i]*=cth_[tgt];
|
d.s.p2i[i]*=cth_[tgt];
|
||||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||||
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
|
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
|
||||||
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
|
r2(d.s.p2r[i], d.s.p2i[i]);
|
||||||
job->phase[phas_idx] = r1+r2;
|
job->phase[phas_idx] = r1+r2;
|
||||||
if (ispair[tgt])
|
if (ispair[tgt])
|
||||||
job->phase[phas_idx+1] = r1-r2;
|
job->phase[phas_idx+1] = r1-r2;
|
||||||
|
@ -1027,10 +1029,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
{
|
{
|
||||||
int tgt=itgt[i];
|
int tgt=itgt[i];
|
||||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||||
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
|
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
|
||||||
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
|
q2(d.s.p2pr[i], d.s.p2pi[i]),
|
||||||
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
|
u1(d.s.p1mr[i], d.s.p1mi[i]),
|
||||||
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
|
u2(d.s.p2mr[i], d.s.p2mi[i]);
|
||||||
job->phase[phas_idx ] = q1+q2;
|
job->phase[phas_idx ] = q1+q2;
|
||||||
job->phase[phas_idx+2] = u1+u2;
|
job->phase[phas_idx+2] = u1+u2;
|
||||||
if (ispair[tgt])
|
if (ispair[tgt])
|
|
@ -28,14 +28,12 @@
|
||||||
#ifndef SHARP2_INTERNAL_H
|
#ifndef SHARP2_INTERNAL_H
|
||||||
#define SHARP2_INTERNAL_H
|
#define SHARP2_INTERNAL_H
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#include <complex>
|
||||||
#error This header file cannot be included from C++, only from C
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <complex.h>
|
|
||||||
#include "libsharp2/sharp.h"
|
#include "libsharp2/sharp.h"
|
||||||
#include "libsharp2/sharp_ylmgen_c.h"
|
#include "libsharp2/sharp_ylmgen_c.h"
|
||||||
|
|
||||||
|
using std::complex;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
sharp_jobtype type;
|
sharp_jobtype type;
|
||||||
|
@ -45,9 +43,9 @@ typedef struct
|
||||||
void **map;
|
void **map;
|
||||||
void **alm;
|
void **alm;
|
||||||
int s_m, s_th; // strides in m and theta direction
|
int s_m, s_th; // strides in m and theta direction
|
||||||
complex double *phase;
|
complex<double> *phase;
|
||||||
double *norm_l;
|
double *norm_l;
|
||||||
complex double *almtmp;
|
complex<double> *almtmp;
|
||||||
const sharp_geom_info *ginfo;
|
const sharp_geom_info *ginfo;
|
||||||
const sharp_alm_info *ainfo;
|
const sharp_alm_info *ainfo;
|
||||||
double time;
|
double time;
|
||||||
|
|
|
@ -28,7 +28,9 @@
|
||||||
#ifndef SHARP2_VECSUPPORT_H
|
#ifndef SHARP2_VECSUPPORT_H
|
||||||
#define SHARP2_VECSUPPORT_H
|
#define SHARP2_VECSUPPORT_H
|
||||||
|
|
||||||
#include <math.h>
|
#include <cmath>
|
||||||
|
#include <complex>
|
||||||
|
using std::complex;
|
||||||
|
|
||||||
#ifndef VLEN
|
#ifndef VLEN
|
||||||
|
|
||||||
|
@ -73,8 +75,8 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
||||||
#define vallTrue(a) (a)
|
#define vallTrue(a) (a)
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
_Complex double * restrict cc)
|
complex<double> * restrict cc)
|
||||||
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
|
{ cc[0] += complex<double>(a,b); cc[1] += complex<double>(c,d); }
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -121,9 +123,9 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
||||||
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
|
||||||
Tv d, _Complex double * restrict cc)
|
Tv d, complex<double> * restrict cc)
|
||||||
{
|
{
|
||||||
union {Tv v; _Complex double c; } u1, u2;
|
union {Tv v; complex<double> c; } u1, u2;
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
||||||
#else
|
#else
|
||||||
|
@ -167,13 +169,13 @@ typedef __m256d Tm;
|
||||||
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
_Complex double * restrict cc)
|
complex<double> * restrict cc)
|
||||||
{
|
{
|
||||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
||||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
||||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
||||||
tmp1=tmp3+tmp4;
|
tmp1=tmp3+tmp4;
|
||||||
union {Tv v; _Complex double c[2]; } u;
|
union {Tv v; complex<double> c[2]; } u;
|
||||||
u.v=tmp1;
|
u.v=tmp1;
|
||||||
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
||||||
}
|
}
|
||||||
|
@ -209,7 +211,7 @@ typedef __mmask8 Tm;
|
||||||
#define vallTrue(a) (a==255)
|
#define vallTrue(a) (a==255)
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
_Complex double * restrict cc)
|
complex<double> * restrict cc)
|
||||||
{
|
{
|
||||||
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
||||||
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
||||||
|
|
|
@ -26,7 +26,8 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <complex.h>
|
#include <complex>
|
||||||
|
using std::complex;
|
||||||
#ifdef USE_MPI
|
#ifdef USE_MPI
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
#include "libsharp2/sharp_mpi.h"
|
#include "libsharp2/sharp_mpi.h"
|
||||||
|
@ -38,7 +39,7 @@
|
||||||
#include "libsharp2/sharp_geomhelpers.h"
|
#include "libsharp2/sharp_geomhelpers.h"
|
||||||
#include "libsharp2/sharp_almhelpers.h"
|
#include "libsharp2/sharp_almhelpers.h"
|
||||||
#include "libsharp2/sharp_utils.h"
|
#include "libsharp2/sharp_utils.h"
|
||||||
#include "libsharp2/sharp_utils.c"
|
#include "libsharp2/sharp_utils.cc"
|
||||||
#include "test/memusage.h"
|
#include "test/memusage.h"
|
||||||
|
|
||||||
static void OpenMP_status(void)
|
static void OpenMP_status(void)
|
||||||
|
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
|
|
||||||
int ntasks, mytask;
|
int ntasks, mytask;
|
||||||
|
|
||||||
|
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
|
||||||
{
|
{
|
||||||
double rv = drand(-1,1,&state);
|
double rv = drand(-1,1,&state);
|
||||||
double iv = (m==0) ? 0 : drand(-1,1,&state);
|
double iv = (m==0) ? 0 : drand(-1,1,&state);
|
||||||
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
|
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
|
||||||
sqsum[i]=0;
|
sqsum[i]=0;
|
||||||
for (ptrdiff_t j=0; j<nalms; ++j)
|
for (ptrdiff_t j=0; j<nalms; ++j)
|
||||||
{
|
{
|
||||||
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
|
sqsum[i]+=norm(alm[i][j]);
|
||||||
+cimag(alm[i][j])*cimag(alm[i][j]);
|
|
||||||
alm[i][j]=-alm[i][j];
|
alm[i][j]=-alm[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
|
||||||
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
|
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
|
||||||
for (ptrdiff_t j=0; j<nalms; ++j)
|
for (ptrdiff_t j=0; j<nalms; ++j)
|
||||||
{
|
{
|
||||||
double sqr=creal(alm[i][j])*creal(alm[i][j])
|
double sqr=norm(alm[i][j]);
|
||||||
+cimag(alm[i][j])*cimag(alm[i][j]);
|
|
||||||
sum+=sqr;
|
sum+=sqr;
|
||||||
if (sqr>maxdiff) maxdiff=sqr;
|
if (sqr>maxdiff) maxdiff=sqr;
|
||||||
}
|
}
|
||||||
|
@ -414,7 +413,7 @@ static void check_sign_scale(void)
|
||||||
ALLOC2D(alm,dcmplx,2,nalms);
|
ALLOC2D(alm,dcmplx,2,nalms);
|
||||||
for (int i=0; i<2; ++i)
|
for (int i=0; i<2; ++i)
|
||||||
for (int j=0; j<nalms; ++j)
|
for (int j=0; j<nalms; ++j)
|
||||||
alm[i][j]=1.+_Complex_I;
|
alm[i][j]=dcmplx(1.,1.);
|
||||||
|
|
||||||
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
||||||
NULL,NULL);
|
NULL,NULL);
|
Loading…
Add table
Add a link
Reference in a new issue