Merge branch 'multiarch' into 'master'
Multiarch See merge request mtr/libsharp!17
This commit is contained in:
commit
48e8268036
10 changed files with 48 additions and 104 deletions
26
COMPILE
26
COMPILE
|
@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with
|
||||||
Runtime CPU selection with gcc
|
Runtime CPU selection with gcc
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
||||||
When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
|
When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
|
||||||
machinery will compile the time-critical functions for several different
|
with versions 6 and 7) on an x86_64 platform, the build machinery can compile
|
||||||
architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
|
the time-critical functions for several different architectures (SSE2, AVX,
|
||||||
implementation will be selected at runtime.
|
AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected
|
||||||
This only happens if you do _not_ explicitly specify a target architecture via
|
at runtime.
|
||||||
the compiler flags. I.e., please do _not_ specify "-march=native" or
|
This is enabled by passing "-DMULTIARCH" as part of the CFLAGS.
|
||||||
"-mtarget=avx" or similar if you want a portable binary that will run
|
If this is enabled, please do _not_ specify "-march=native" or
|
||||||
efficiently on different x86_64 CPUs.
|
"-mavx" or similar!
|
||||||
If you are compiling libsharp for a particular target CPU only, or if you are
|
If you are compiling libsharp for a particular target CPU only, or if you are
|
||||||
using a different compiler, however, "-march=native" should be used. The
|
using a different compiler, however, "-march=native" should be used. The
|
||||||
resulting binary will most likely not run on other computers, though.
|
resulting binary will most likely not run on other computers, though.
|
||||||
|
@ -65,16 +65,16 @@ Example configure invocations
|
||||||
=============================
|
=============================
|
||||||
|
|
||||||
GCC, OpenMP, portable binary:
|
GCC, OpenMP, portable binary:
|
||||||
CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
|
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
|
||||||
|
|
||||||
GCC, no OpenMP, portable binary:
|
GCC, no OpenMP, portable binary:
|
||||||
CFLAGS="-std=c99 -O3 -ffast-math" ./configure
|
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
|
||||||
|
|
||||||
Clang, OpenMP, nonportable binary:
|
Clang, OpenMP, portable binary:
|
||||||
CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
|
CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
|
||||||
|
|
||||||
Intel C compiler, OpenMP, nonportable binary:
|
Intel C compiler, OpenMP, nonportable binary:
|
||||||
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
|
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
|
||||||
|
|
||||||
MPI support, nonportable binary:
|
MPI support, nonportable binary:
|
||||||
CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
|
CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
|
||||||
|
|
26
Makefile.am
26
Makefile.am
|
@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4
|
||||||
|
|
||||||
lib_LTLIBRARIES = libsharp.la
|
lib_LTLIBRARIES = libsharp.la
|
||||||
|
|
||||||
src_sharp = \
|
libsharp_la_SOURCES = \
|
||||||
c_utils/c_utils.c \
|
c_utils/c_utils.c \
|
||||||
c_utils/c_utils.h \
|
c_utils/c_utils.h \
|
||||||
pocketfft/pocketfft.c \
|
pocketfft/pocketfft.c \
|
||||||
|
@ -10,11 +10,6 @@ src_sharp = \
|
||||||
libsharp/sharp.c \
|
libsharp/sharp.c \
|
||||||
libsharp/sharp_almhelpers.c \
|
libsharp/sharp_almhelpers.c \
|
||||||
libsharp/sharp_core.c \
|
libsharp/sharp_core.c \
|
||||||
libsharp/sharp_core_avx.c \
|
|
||||||
libsharp/sharp_core_avx2.c \
|
|
||||||
libsharp/sharp_core_fma.c \
|
|
||||||
libsharp/sharp_core_fma4.c \
|
|
||||||
libsharp/sharp_core_avx512f.c \
|
|
||||||
libsharp/sharp_geomhelpers.c \
|
libsharp/sharp_geomhelpers.c \
|
||||||
libsharp/sharp_legendre_roots.c \
|
libsharp/sharp_legendre_roots.c \
|
||||||
libsharp/sharp_ylmgen_c.c \
|
libsharp/sharp_ylmgen_c.c \
|
||||||
|
@ -23,6 +18,16 @@ src_sharp = \
|
||||||
libsharp/sharp_vecsupport.h \
|
libsharp/sharp_vecsupport.h \
|
||||||
libsharp/sharp_ylmgen_c.h
|
libsharp/sharp_ylmgen_c.h
|
||||||
|
|
||||||
|
libavx_la_SOURCES = libsharp/sharp_core_inc.c
|
||||||
|
libavx2_la_SOURCES = libsharp/sharp_core_inc.c
|
||||||
|
libfma_la_SOURCES = libsharp/sharp_core_inc.c
|
||||||
|
libfma4_la_SOURCES = libsharp/sharp_core_inc.c
|
||||||
|
libavx512f_la_SOURCES = libsharp/sharp_core_inc.c
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
|
libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
include_HEADERS = \
|
include_HEADERS = \
|
||||||
libsharp/sharp.h \
|
libsharp/sharp.h \
|
||||||
libsharp/sharp_geomhelpers.h \
|
libsharp/sharp_geomhelpers.h \
|
||||||
|
@ -30,11 +35,8 @@ include_HEADERS = \
|
||||||
libsharp/sharp_cxx.h
|
libsharp/sharp_cxx.h
|
||||||
|
|
||||||
EXTRA_DIST = \
|
EXTRA_DIST = \
|
||||||
libsharp/sharp_core_inc.c \
|
|
||||||
runtest.sh
|
runtest.sh
|
||||||
|
|
||||||
libsharp_la_SOURCES = $(src_sharp)
|
|
||||||
|
|
||||||
check_PROGRAMS = sharp_testsuite
|
check_PROGRAMS = sharp_testsuite
|
||||||
sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
|
sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
|
||||||
sharp_testsuite_LDADD = libsharp.la
|
sharp_testsuite_LDADD = libsharp.la
|
||||||
|
@ -43,6 +45,12 @@ TESTS = runtest.sh
|
||||||
|
|
||||||
AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
|
AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
|
||||||
|
|
||||||
|
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
|
||||||
|
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
|
||||||
|
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
|
||||||
|
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
|
||||||
|
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
|
||||||
|
|
||||||
pkgconfigdir = $(libdir)/pkgconfig
|
pkgconfigdir = $(libdir)/pkgconfig
|
||||||
nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
|
nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ personal communication).
|
||||||
|
|
||||||
These improvements reduce the fraction of CPU time spent on evaluating the
|
These improvements reduce the fraction of CPU time spent on evaluating the
|
||||||
recurrences for Y_lm coefficients, which means that computing multiple
|
recurrences for Y_lm coefficients, which means that computing multiple
|
||||||
simultaneous SHTs no longer have a big performance advantage compared to SHTs
|
simultaneous SHTs no longer has a big performance advantage compared to SHTs
|
||||||
done one after the other.
|
done one after the other.
|
||||||
As a consequence, libsharp support for simultaneous SHTs was dropped, making
|
As a consequence, libsharp support for simultaneous SHTs was dropped, making
|
||||||
its interface much simpler.
|
its interface much simpler.
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH default
|
#define ARCH default
|
||||||
|
#define GENERIC_ARCH
|
||||||
#include "sharp_core_inc.c"
|
#include "sharp_core_inc.c"
|
||||||
|
#undef GENERIC_ARCH
|
||||||
#undef ARCH
|
#undef ARCH
|
||||||
|
|
||||||
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
|
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
|
||||||
|
@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
|
||||||
static t_max_nvec max_nvec_ = NULL;
|
static t_max_nvec max_nvec_ = NULL;
|
||||||
static t_architecture architecture_ = NULL;
|
static t_architecture architecture_ = NULL;
|
||||||
|
|
||||||
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
#ifdef MULTIARCH
|
||||||
|
|
||||||
|
#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
|
||||||
|
defined(__AVX2__) || defined(__AVX__))
|
||||||
|
#error MULTIARCH specified but platform-specific flags detected
|
||||||
|
#endif
|
||||||
|
|
||||||
#define DECL(arch) \
|
#define DECL(arch) \
|
||||||
static int XCONCATX2(have,arch)(void) \
|
static int XCONCATX2(have,arch)(void) \
|
||||||
|
@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
|
||||||
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
|
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
|
||||||
const char *XCONCATX2(sharp_architecture,arch) (void);
|
const char *XCONCATX2(sharp_architecture,arch) (void);
|
||||||
|
|
||||||
#if (!defined(__AVX512F__))
|
|
||||||
DECL(avx512f)
|
DECL(avx512f)
|
||||||
#endif
|
|
||||||
#if (!defined(__FMA4__))
|
|
||||||
DECL(fma4)
|
DECL(fma4)
|
||||||
#endif
|
|
||||||
#if (!defined(__FMA__))
|
|
||||||
DECL(fma)
|
DECL(fma)
|
||||||
#endif
|
|
||||||
#if (!defined(__AVX2__))
|
|
||||||
DECL(avx2)
|
DECL(avx2)
|
||||||
#endif
|
|
||||||
#if (!defined(__AVX__))
|
|
||||||
DECL(avx)
|
DECL(avx)
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void assign_funcs(void)
|
static void assign_funcs(void)
|
||||||
{
|
{
|
||||||
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
#ifdef MULTIARCH
|
||||||
#define DECL2(arch) \
|
#define DECL2(arch) \
|
||||||
if (XCONCATX2(have,arch)()) \
|
if (XCONCATX2(have,arch)()) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -69,21 +62,11 @@ static void assign_funcs(void)
|
||||||
architecture_ = XCONCATX2(sharp_architecture,arch); \
|
architecture_ = XCONCATX2(sharp_architecture,arch); \
|
||||||
return; \
|
return; \
|
||||||
}
|
}
|
||||||
#if (!defined(__AVX512F__))
|
|
||||||
DECL2(avx512f)
|
DECL2(avx512f)
|
||||||
#endif
|
|
||||||
#if (!defined(__FMA4__))
|
|
||||||
DECL2(fma4)
|
DECL2(fma4)
|
||||||
#endif
|
|
||||||
#if (!defined(__FMA__))
|
|
||||||
DECL2(fma)
|
DECL2(fma)
|
||||||
#endif
|
|
||||||
#if (!defined(__AVX2__))
|
|
||||||
DECL2(avx2)
|
DECL2(avx2)
|
||||||
#endif
|
|
||||||
#if (!defined(__AVX__))
|
|
||||||
DECL2(avx)
|
DECL2(avx)
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
inner_loop_ = inner_loop_default;
|
inner_loop_ = inner_loop_default;
|
||||||
veclen_ = sharp_veclen_default;
|
veclen_ = sharp_veclen_default;
|
||||||
|
|
|
@ -1,11 +0,0 @@
|
||||||
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
|
||||||
|
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH avx
|
|
||||||
#pragma GCC target("avx")
|
|
||||||
#include "sharp_core_inc.c"
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,11 +0,0 @@
|
||||||
#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
|
||||||
|
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH avx2
|
|
||||||
#pragma GCC target("avx2")
|
|
||||||
#include "sharp_core_inc.c"
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,11 +0,0 @@
|
||||||
#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
|
||||||
|
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH avx512f
|
|
||||||
#pragma GCC target("avx512f")
|
|
||||||
#include "sharp_core_inc.c"
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,11 +0,0 @@
|
||||||
#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
|
||||||
|
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH fma
|
|
||||||
#pragma GCC target("fma")
|
|
||||||
#include "sharp_core_inc.c"
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,11 +0,0 @@
|
||||||
#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
|
||||||
|
|
||||||
#define XCONCATX(a,b) a##_##b
|
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
|
||||||
|
|
||||||
#define ARCH fma4
|
|
||||||
#pragma GCC target("fma4")
|
|
||||||
#include "sharp_core_inc.c"
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -29,6 +29,12 @@
|
||||||
* \author Martin Reinecke
|
* \author Martin Reinecke
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
|
||||||
|
|
||||||
|
#define XCONCATX(a,b) a##_##b
|
||||||
|
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||||
|
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||||
|
|
||||||
#include <complex.h>
|
#include <complex.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void)
|
||||||
{
|
{
|
||||||
return xstr(ARCH);
|
return xstr(ARCH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue