Merge branch 'multiarch' into 'master'
Multiarch See merge request mtr/libsharp!17
This commit is contained in:
commit
48e8268036
10 changed files with 48 additions and 104 deletions
26
COMPILE
26
COMPILE
|
@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with
|
|||
Runtime CPU selection with gcc
|
||||
------------------------------
|
||||
|
||||
When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
|
||||
machinery will compile the time-critical functions for several different
|
||||
architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
|
||||
implementation will be selected at runtime.
|
||||
This only happens if you do _not_ explicitly specify a target architecture via
|
||||
the compiler flags. I.e., please do _not_ specify "-march=native" or
|
||||
"-mtarget=avx" or similar if you want a portable binary that will run
|
||||
efficiently on different x86_64 CPUs.
|
||||
When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
|
||||
with versions 6 and 7) on an x86_64 platform, the build machinery can compile
|
||||
the time-critical functions for several different architectures (SSE2, AVX,
|
||||
AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected
|
||||
at runtime.
|
||||
This is enabled by passing "-DMULTIARCH" as part of the CFLAGS.
|
||||
If this is enabled, please do _not_ specify "-march=native" or
|
||||
"-mavx" or similar!
|
||||
If you are compiling libsharp for a particular target CPU only, or if you are
|
||||
using a different compiler, however, "-march=native" should be used. The
|
||||
resulting binary will most likely not run on other computers, though.
|
||||
|
@ -65,16 +65,16 @@ Example configure invocations
|
|||
=============================
|
||||
|
||||
GCC, OpenMP, portable binary:
|
||||
CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
|
||||
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
|
||||
|
||||
GCC, no OpenMP, portable binary:
|
||||
CFLAGS="-std=c99 -O3 -ffast-math" ./configure
|
||||
CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
|
||||
|
||||
Clang, OpenMP, nonportable binary:
|
||||
CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
|
||||
Clang, OpenMP, portable binary:
|
||||
CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
|
||||
|
||||
Intel C compiler, OpenMP, nonportable binary:
|
||||
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
|
||||
CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
|
||||
|
||||
MPI support, nonportable binary:
|
||||
CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
|
||||
|
|
26
Makefile.am
26
Makefile.am
|
@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4
|
|||
|
||||
lib_LTLIBRARIES = libsharp.la
|
||||
|
||||
src_sharp = \
|
||||
libsharp_la_SOURCES = \
|
||||
c_utils/c_utils.c \
|
||||
c_utils/c_utils.h \
|
||||
pocketfft/pocketfft.c \
|
||||
|
@ -10,11 +10,6 @@ src_sharp = \
|
|||
libsharp/sharp.c \
|
||||
libsharp/sharp_almhelpers.c \
|
||||
libsharp/sharp_core.c \
|
||||
libsharp/sharp_core_avx.c \
|
||||
libsharp/sharp_core_avx2.c \
|
||||
libsharp/sharp_core_fma.c \
|
||||
libsharp/sharp_core_fma4.c \
|
||||
libsharp/sharp_core_avx512f.c \
|
||||
libsharp/sharp_geomhelpers.c \
|
||||
libsharp/sharp_legendre_roots.c \
|
||||
libsharp/sharp_ylmgen_c.c \
|
||||
|
@ -23,6 +18,16 @@ src_sharp = \
|
|||
libsharp/sharp_vecsupport.h \
|
||||
libsharp/sharp_ylmgen_c.h
|
||||
|
||||
libavx_la_SOURCES = libsharp/sharp_core_inc.c
|
||||
libavx2_la_SOURCES = libsharp/sharp_core_inc.c
|
||||
libfma_la_SOURCES = libsharp/sharp_core_inc.c
|
||||
libfma4_la_SOURCES = libsharp/sharp_core_inc.c
|
||||
libavx512f_la_SOURCES = libsharp/sharp_core_inc.c
|
||||
|
||||
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||
|
||||
libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||
|
||||
include_HEADERS = \
|
||||
libsharp/sharp.h \
|
||||
libsharp/sharp_geomhelpers.h \
|
||||
|
@ -30,11 +35,8 @@ include_HEADERS = \
|
|||
libsharp/sharp_cxx.h
|
||||
|
||||
EXTRA_DIST = \
|
||||
libsharp/sharp_core_inc.c \
|
||||
runtest.sh
|
||||
|
||||
libsharp_la_SOURCES = $(src_sharp)
|
||||
|
||||
check_PROGRAMS = sharp_testsuite
|
||||
sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
|
||||
sharp_testsuite_LDADD = libsharp.la
|
||||
|
@ -43,6 +45,12 @@ TESTS = runtest.sh
|
|||
|
||||
AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
|
||||
|
||||
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
|
||||
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
|
||||
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
|
||||
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
|
||||
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
|
||||
|
||||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ personal communication).
|
|||
|
||||
These improvements reduce the fraction of CPU time spent on evaluating the
|
||||
recurrences for Y_lm coefficients, which means that computing multiple
|
||||
simultaneous SHTs no longer have a big performance advantage compared to SHTs
|
||||
simultaneous SHTs no longer has a big performance advantage compared to SHTs
|
||||
done one after the other.
|
||||
As a consequence, libsharp support for simultaneous SHTs was dropped, making
|
||||
its interface much simpler.
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH default
|
||||
#define GENERIC_ARCH
|
||||
#include "sharp_core_inc.c"
|
||||
#undef GENERIC_ARCH
|
||||
#undef ARCH
|
||||
|
||||
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
|
||||
|
@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
|
|||
static t_max_nvec max_nvec_ = NULL;
|
||||
static t_architecture architecture_ = NULL;
|
||||
|
||||
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
#ifdef MULTIARCH
|
||||
|
||||
/* With MULTIARCH the build compiles the per-ISA kernels separately, each with
   its own -m<isa> flag, and picks one at runtime.  Reject a build in which an
   ISA extension is already enabled globally (e.g. via -march=native).
   Note: the compiler macro is __AVX512F__ (two leading underscores). */
#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
     defined(__AVX2__) || defined(__AVX__))
#error MULTIARCH specified but platform-specific flags detected
#endif
|
||||
|
||||
#define DECL(arch) \
|
||||
static int XCONCATX2(have,arch)(void) \
|
||||
|
@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
|
|||
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
|
||||
const char *XCONCATX2(sharp_architecture,arch) (void);
|
||||
|
||||
#if (!defined(__AVX512F__))
|
||||
DECL(avx512f)
|
||||
#endif
|
||||
#if (!defined(__FMA4__))
|
||||
DECL(fma4)
|
||||
#endif
|
||||
#if (!defined(__FMA__))
|
||||
DECL(fma)
|
||||
#endif
|
||||
#if (!defined(__AVX2__))
|
||||
DECL(avx2)
|
||||
#endif
|
||||
#if (!defined(__AVX__))
|
||||
DECL(avx)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
static void assign_funcs(void)
|
||||
{
|
||||
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
#ifdef MULTIARCH
|
||||
#define DECL2(arch) \
|
||||
if (XCONCATX2(have,arch)()) \
|
||||
{ \
|
||||
|
@ -69,21 +62,11 @@ static void assign_funcs(void)
|
|||
architecture_ = XCONCATX2(sharp_architecture,arch); \
|
||||
return; \
|
||||
}
|
||||
#if (!defined(__AVX512F__))
|
||||
DECL2(avx512f)
|
||||
#endif
|
||||
#if (!defined(__FMA4__))
|
||||
DECL2(fma4)
|
||||
#endif
|
||||
#if (!defined(__FMA__))
|
||||
DECL2(fma)
|
||||
#endif
|
||||
#if (!defined(__AVX2__))
|
||||
DECL2(avx2)
|
||||
#endif
|
||||
#if (!defined(__AVX__))
|
||||
DECL2(avx)
|
||||
#endif
|
||||
#endif
|
||||
inner_loop_ = inner_loop_default;
|
||||
veclen_ = sharp_veclen_default;
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH avx
|
||||
#pragma GCC target("avx")
|
||||
#include "sharp_core_inc.c"
|
||||
|
||||
#endif
|
|
@ -1,11 +0,0 @@
|
|||
#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH avx2
|
||||
#pragma GCC target("avx2")
|
||||
#include "sharp_core_inc.c"
|
||||
|
||||
#endif
|
|
@ -1,11 +0,0 @@
|
|||
#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH avx512f
|
||||
#pragma GCC target("avx512f")
|
||||
#include "sharp_core_inc.c"
|
||||
|
||||
#endif
|
|
@ -1,11 +0,0 @@
|
|||
#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH fma
|
||||
#pragma GCC target("fma")
|
||||
#include "sharp_core_inc.c"
|
||||
|
||||
#endif
|
|
@ -1,11 +0,0 @@
|
|||
#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#define ARCH fma4
|
||||
#pragma GCC target("fma4")
|
||||
#include "sharp_core_inc.c"
|
||||
|
||||
#endif
|
|
@ -29,6 +29,12 @@
|
|||
* \author Martin Reinecke
|
||||
*/
|
||||
|
||||
#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
|
||||
|
||||
#define XCONCATX(a,b) a##_##b
|
||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||
|
||||
#include <complex.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void)
|
|||
{
|
||||
return xstr(ARCH);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue