Merge branch 'multiarch' into 'master'

Multiarch

See merge request mtr/libsharp!17
This commit is contained in:
Martin Reinecke 2019-01-21 12:08:42 +01:00
commit 48e8268036
10 changed files with 48 additions and 104 deletions

26
COMPILE
View file

@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with
Runtime CPU selection with gcc
------------------------------
When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
machinery will compile the time-critical functions for several different
architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
implementation will be selected at runtime.
This only happens if you do _not_ explicitly specify a target architecture via
the compiler flags. I.e., please do _not_ specify "-march=native" or
"-mtarget=avx" or similar if you want a portable binary that will run
efficiently on different x86_64 CPUs.
When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
with versions 6 and 7) on an x86_64 platform, the build machinery can compile
the time-critical functions for several different architectures (SSE2, AVX,
AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected
at runtime.
This is enabled by passing "-DMULTIARCH" as part of the CFLAGS.
If this is enabled, please do _not_ specify "-march=native" or
"-mavx" or similar!
If you are compiling libsharp for a particular target CPU only, or if you are
using a different compiler, however, "-march=native" should be used. The
resulting binary will most likely not run on other computers, though.
@ -65,16 +65,16 @@ Example configure invocations
=============================
GCC, OpenMP, portable binary:
CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
GCC, no OpenMP, portable binary:
CFLAGS="-std=c99 -O3 -ffast-math" ./configure
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
Clang, OpenMP, nonportable binary:
CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
Clang, OpenMP, portable binary:
CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
Intel C compiler, OpenMP, nonportable binary:
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
MPI support, nonportable binary:
CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure

View file

@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp.la
src_sharp = \
libsharp_la_SOURCES = \
c_utils/c_utils.c \
c_utils/c_utils.h \
pocketfft/pocketfft.c \
@ -10,11 +10,6 @@ src_sharp = \
libsharp/sharp.c \
libsharp/sharp_almhelpers.c \
libsharp/sharp_core.c \
libsharp/sharp_core_avx.c \
libsharp/sharp_core_avx2.c \
libsharp/sharp_core_fma.c \
libsharp/sharp_core_fma4.c \
libsharp/sharp_core_avx512f.c \
libsharp/sharp_geomhelpers.c \
libsharp/sharp_legendre_roots.c \
libsharp/sharp_ylmgen_c.c \
@ -23,6 +18,16 @@ src_sharp = \
libsharp/sharp_vecsupport.h \
libsharp/sharp_ylmgen_c.h
libavx_la_SOURCES = libsharp/sharp_core_inc.c
libavx2_la_SOURCES = libsharp/sharp_core_inc.c
libfma_la_SOURCES = libsharp/sharp_core_inc.c
libfma4_la_SOURCES = libsharp/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp/sharp_core_inc.c
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
include_HEADERS = \
libsharp/sharp.h \
libsharp/sharp_geomhelpers.h \
@ -30,11 +35,8 @@ include_HEADERS = \
libsharp/sharp_cxx.h
EXTRA_DIST = \
libsharp/sharp_core_inc.c \
runtest.sh
libsharp_la_SOURCES = $(src_sharp)
check_PROGRAMS = sharp_testsuite
sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
sharp_testsuite_LDADD = libsharp.la
@ -43,6 +45,12 @@ TESTS = runtest.sh
AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
pkgconfigdir = $(libdir)/pkgconfig
nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc

View file

@ -18,7 +18,7 @@ personal communication).
These improvements reduce the fraction of CPU time spent on evaluating the
recurrences for Y_lm coefficients, which means that computing multiple
simultaneous SHTs no longer have a big performance advantage compared to SHTs
simultaneous SHTs no longer has a big performance advantage compared to SHTs
done one after the other.
As a consequence, libsharp support for simultaneous SHTs was dropped, making
its interface much simpler.

View file

@ -1,9 +1,7 @@
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH default
#define GENERIC_ARCH
#include "sharp_core_inc.c"
#undef GENERIC_ARCH
#undef ARCH
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
static t_max_nvec max_nvec_ = NULL;
static t_architecture architecture_ = NULL;
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#ifdef MULTIARCH
/* With MULTIARCH the architecture-specific kernels are compiled separately
   (each with its own -m<isa> flag) and picked at runtime, so this file must
   be built without AVX/AVX2/FMA/FMA4/AVX-512F enabled. Each test below uses
   the compiler's predefined macro for the corresponding instruction set. */
#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
     defined(__AVX2__) || defined(__AVX__))
#error MULTIARCH specified but platform-specific flags detected
#endif
#define DECL(arch) \
static int XCONCATX2(have,arch)(void) \
@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
const char *XCONCATX2(sharp_architecture,arch) (void);
#if (!defined(__AVX512F__))
DECL(avx512f)
#endif
#if (!defined(__FMA4__))
DECL(fma4)
#endif
#if (!defined(__FMA__))
DECL(fma)
#endif
#if (!defined(__AVX2__))
DECL(avx2)
#endif
#if (!defined(__AVX__))
DECL(avx)
#endif
#endif
static void assign_funcs(void)
{
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#ifdef MULTIARCH
#define DECL2(arch) \
if (XCONCATX2(have,arch)()) \
{ \
@ -69,21 +62,11 @@ static void assign_funcs(void)
architecture_ = XCONCATX2(sharp_architecture,arch); \
return; \
}
#if (!defined(__AVX512F__))
DECL2(avx512f)
#endif
#if (!defined(__FMA4__))
DECL2(fma4)
#endif
#if (!defined(__FMA__))
DECL2(fma)
#endif
#if (!defined(__AVX2__))
DECL2(avx2)
#endif
#if (!defined(__AVX__))
DECL2(avx)
#endif
#endif
inner_loop_ = inner_loop_default;
veclen_ = sharp_veclen_default;

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx
#pragma GCC target("avx")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx2
#pragma GCC target("avx2")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx512f
#pragma GCC target("avx512f")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma
#pragma GCC target("fma")
#include "sharp_core_inc.c"
#endif

View file

@ -1,11 +0,0 @@
#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma4
#pragma GCC target("fma4")
#include "sharp_core_inc.c"
#endif

View file

@ -29,6 +29,12 @@
* \author Martin Reinecke
*/
#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h>
#include <math.h>
#include <string.h>
@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void)
{
return xstr(ARCH);
}
#endif