Merge branch 'multiarch' into 'master'

Multiarch

See merge request mtr/libsharp!17
This commit is contained in:
Martin Reinecke 2019-01-21 12:08:42 +01:00
commit 48e8268036
10 changed files with 48 additions and 104 deletions

26
COMPILE
View file

@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with
Runtime CPU selection with gcc Runtime CPU selection with gcc
------------------------------ ------------------------------
When using a recent gcc (6.0 and newer) on an x86_64 platform, the build When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
machinery will compile the time-critical functions for several different with versions 6 and 7) on an x86_64 platform, the build machinery can compile
architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate the time-critical functions for several different architectures (SSE2, AVX,
implementation will be selected at runtime. AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected
This only happens if you do _not_ explicitly specify a target architecture via at runtime.
the compiler flags. I.e., please do _not_ specify "-march=native" or This is enabled by passing "-DMULTIARCH" as part of the CFLAGS.
"-mavx" or similar if you want a portable binary that will run If this is enabled, please do _not_ specify "-march=native" or
efficiently on different x86_64 CPUs. "-mavx" or similar!
If you are compiling libsharp for a particular target CPU only, or if you are If you are compiling libsharp for a particular target CPU only, or if you are
using a different compiler, however, "-march=native" should be used. The using a different compiler, however, "-march=native" should be used. The
resulting binary will most likely not run on other computers, though. resulting binary will most likely not run on other computers, though.
@ -65,16 +65,16 @@ Example configure invocations
============================= =============================
GCC, OpenMP, portable binary: GCC, OpenMP, portable binary:
CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
GCC, no OpenMP, portable binary: GCC, no OpenMP, portable binary:
CFLAGS="-std=c99 -O3 -ffast-math" ./configure CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
Clang, OpenMP, nonportable binary: Clang, OpenMP, portable binary:
CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
Intel C compiler, OpenMP, nonportable binary: Intel C compiler, OpenMP, nonportable binary:
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
MPI support, nonportable binary: MPI support, nonportable binary:
CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure

View file

@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp.la lib_LTLIBRARIES = libsharp.la
src_sharp = \ libsharp_la_SOURCES = \
c_utils/c_utils.c \ c_utils/c_utils.c \
c_utils/c_utils.h \ c_utils/c_utils.h \
pocketfft/pocketfft.c \ pocketfft/pocketfft.c \
@ -10,11 +10,6 @@ src_sharp = \
libsharp/sharp.c \ libsharp/sharp.c \
libsharp/sharp_almhelpers.c \ libsharp/sharp_almhelpers.c \
libsharp/sharp_core.c \ libsharp/sharp_core.c \
libsharp/sharp_core_avx.c \
libsharp/sharp_core_avx2.c \
libsharp/sharp_core_fma.c \
libsharp/sharp_core_fma4.c \
libsharp/sharp_core_avx512f.c \
libsharp/sharp_geomhelpers.c \ libsharp/sharp_geomhelpers.c \
libsharp/sharp_legendre_roots.c \ libsharp/sharp_legendre_roots.c \
libsharp/sharp_ylmgen_c.c \ libsharp/sharp_ylmgen_c.c \
@ -23,6 +18,16 @@ src_sharp = \
libsharp/sharp_vecsupport.h \ libsharp/sharp_vecsupport.h \
libsharp/sharp_ylmgen_c.h libsharp/sharp_ylmgen_c.h
libavx_la_SOURCES = libsharp/sharp_core_inc.c
libavx2_la_SOURCES = libsharp/sharp_core_inc.c
libfma_la_SOURCES = libsharp/sharp_core_inc.c
libfma4_la_SOURCES = libsharp/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp/sharp_core_inc.c
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
include_HEADERS = \ include_HEADERS = \
libsharp/sharp.h \ libsharp/sharp.h \
libsharp/sharp_geomhelpers.h \ libsharp/sharp_geomhelpers.h \
@ -30,11 +35,8 @@ include_HEADERS = \
libsharp/sharp_cxx.h libsharp/sharp_cxx.h
EXTRA_DIST = \ EXTRA_DIST = \
libsharp/sharp_core_inc.c \
runtest.sh runtest.sh
libsharp_la_SOURCES = $(src_sharp)
check_PROGRAMS = sharp_testsuite check_PROGRAMS = sharp_testsuite
sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
sharp_testsuite_LDADD = libsharp.la sharp_testsuite_LDADD = libsharp.la
@ -43,6 +45,12 @@ TESTS = runtest.sh
AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@ AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
pkgconfigdir = $(libdir)/pkgconfig pkgconfigdir = $(libdir)/pkgconfig
nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc

View file

@ -18,7 +18,7 @@ personal communication).
These improvements reduce the fraction of CPU time spent on evaluating the These improvements reduce the fraction of CPU time spent on evaluating the
recurrences for Y_lm coefficients, which means that computing multiple recurrences for Y_lm coefficients, which means that computing multiple
simultaneous SHTs no longer have a big performance advantage compared to SHTs simultaneous SHTs no longer has a big performance advantage compared to SHTs
done one after the other. done one after the other.
As a consequence, libsharp support for simultaneous SHTs was dropped, making As a consequence, libsharp support for simultaneous SHTs was dropped, making
its interface much simpler. its interface much simpler.

View file

@ -1,9 +1,7 @@
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH default #define ARCH default
#define GENERIC_ARCH
#include "sharp_core_inc.c" #include "sharp_core_inc.c"
#undef GENERIC_ARCH
#undef ARCH #undef ARCH
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair, typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
static t_max_nvec max_nvec_ = NULL; static t_max_nvec max_nvec_ = NULL;
static t_architecture architecture_ = NULL; static t_architecture architecture_ = NULL;
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) #ifdef MULTIARCH
/* With MULTIARCH, the SIMD variants come from separately compiled objects
   (each built with its own -m<arch> flag), and this translation unit supplies
   the "default" fallback. Reject builds in which an instruction-set flag was
   also applied globally (e.g. via -march=native or -mavx in CFLAGS). */
#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
     defined(__AVX2__) || defined(__AVX__))
#error MULTIARCH specified but platform-specific flags detected
#endif
#define DECL(arch) \ #define DECL(arch) \
static int XCONCATX2(have,arch)(void) \ static int XCONCATX2(have,arch)(void) \
@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
int XCONCATX2(sharp_max_nvec,arch) (int spin); \ int XCONCATX2(sharp_max_nvec,arch) (int spin); \
const char *XCONCATX2(sharp_architecture,arch) (void); const char *XCONCATX2(sharp_architecture,arch) (void);
#if (!defined(__AVX512F__))
DECL(avx512f) DECL(avx512f)
#endif
#if (!defined(__FMA4__))
DECL(fma4) DECL(fma4)
#endif
#if (!defined(__FMA__))
DECL(fma) DECL(fma)
#endif
#if (!defined(__AVX2__))
DECL(avx2) DECL(avx2)
#endif
#if (!defined(__AVX__))
DECL(avx) DECL(avx)
#endif
#endif #endif
static void assign_funcs(void) static void assign_funcs(void)
{ {
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) #ifdef MULTIARCH
#define DECL2(arch) \ #define DECL2(arch) \
if (XCONCATX2(have,arch)()) \ if (XCONCATX2(have,arch)()) \
{ \ { \
@ -69,21 +62,11 @@ static void assign_funcs(void)
architecture_ = XCONCATX2(sharp_architecture,arch); \ architecture_ = XCONCATX2(sharp_architecture,arch); \
return; \ return; \
} }
#if (!defined(__AVX512F__))
DECL2(avx512f) DECL2(avx512f)
#endif
#if (!defined(__FMA4__))
DECL2(fma4) DECL2(fma4)
#endif
#if (!defined(__FMA__))
DECL2(fma) DECL2(fma)
#endif
#if (!defined(__AVX2__))
DECL2(avx2) DECL2(avx2)
#endif
#if (!defined(__AVX__))
DECL2(avx) DECL2(avx)
#endif
#endif #endif
inner_loop_ = inner_loop_default; inner_loop_ = inner_loop_default;
veclen_ = sharp_veclen_default; veclen_ = sharp_veclen_default;

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx
#pragma GCC target("avx")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx2
#pragma GCC target("avx2")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx512f
#pragma GCC target("avx512f")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma
#pragma GCC target("fma")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma4
#pragma GCC target("fma4")
#include "sharp_core_inc.c"
#endif

View file

@ -29,6 +29,12 @@
* \author Martin Reinecke * \author Martin Reinecke
*/ */
#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h> #include <complex.h>
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void)
{ {
return xstr(ARCH); return xstr(ARCH);
} }
#endif