From 1871f8500c748a941954fe46f2a0503ebd5e0c72 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Jan 2019 09:35:07 +0100 Subject: [PATCH 1/5] typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c993dd1..e135b44 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ personal communication). These improvements reduce the fraction of CPU time spent on evaluating the recurrences for Y_lm coefficients, which means that computing multiple -simultaneous SHTs no longer have a big performance advantage compared to SHTs +simultaneous SHTs no longer has a big performance advantage compared to SHTs done one after the other. As a consequence, libsharp support for simultaneous SHTs was dropped, making its interface much simpler. From 90d2444bf1799e89577efa00ab53407a96c384c9 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Jan 2019 15:19:45 +0100 Subject: [PATCH 2/5] multiarch experiment --- Makefile.am | 25 +++++++++++++++++-------- libsharp/sharp_core.c | 4 ++-- libsharp/sharp_core_avx.c | 5 +---- libsharp/sharp_core_avx2.c | 5 +---- libsharp/sharp_core_avx512f.c | 5 +---- libsharp/sharp_core_fma.c | 5 +---- libsharp/sharp_core_fma4.c | 5 +---- 7 files changed, 24 insertions(+), 30 deletions(-) diff --git a/Makefile.am b/Makefile.am index bcf53ff..a51f568 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp.la -src_sharp = \ +libsharp_la_SOURCES = \ c_utils/c_utils.c \ c_utils/c_utils.h \ pocketfft/pocketfft.c \ @@ -10,11 +10,6 @@ src_sharp = \ libsharp/sharp.c \ libsharp/sharp_almhelpers.c \ libsharp/sharp_core.c \ - libsharp/sharp_core_avx.c \ - libsharp/sharp_core_avx2.c \ - libsharp/sharp_core_fma.c \ - libsharp/sharp_core_fma4.c \ - libsharp/sharp_core_avx512f.c \ libsharp/sharp_geomhelpers.c \ libsharp/sharp_legendre_roots.c \ libsharp/sharp_ylmgen_c.c \ @@ -23,6 +18,16 @@ src_sharp = \ libsharp/sharp_vecsupport.h \ 
libsharp/sharp_ylmgen_c.h +libavx_la_SOURCES = libsharp/sharp_core_avx.c +libavx2_la_SOURCES = libsharp/sharp_core_avx2.c +libfma_la_SOURCES = libsharp/sharp_core_fma.c +libfma4_la_SOURCES = libsharp/sharp_core_fma4.c +libavx512f_la_SOURCES = libsharp/sharp_core_avx512f.c + +noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la + +libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la + include_HEADERS = \ libsharp/sharp.h \ libsharp/sharp_geomhelpers.h \ @@ -33,8 +38,6 @@ EXTRA_DIST = \ libsharp/sharp_core_inc.c \ runtest.sh -libsharp_la_SOURCES = $(src_sharp) - check_PROGRAMS = sharp_testsuite sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h sharp_testsuite_LDADD = libsharp.la @@ -43,6 +46,12 @@ TESTS = runtest.sh AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@ +libavx_la_CFLAGS = ${AM_CFLAGS} -mavx +libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 +libfma_la_CFLAGS = ${AM_CFLAGS} -mfma +libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 +libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f + pkgconfigdir = $(libdir)/pkgconfig nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 036a8ed..e4c1059 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -18,7 +18,7 @@ static t_veclen veclen_ = NULL; static t_max_nvec max_nvec_ = NULL; static t_architecture architecture_ = NULL; -#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +#ifdef MULTIARCH #define DECL(arch) \ static int XCONCATX2(have,arch)(void) \ @@ -59,7 +59,7 @@ DECL(avx) static void assign_funcs(void) { -#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +#ifdef MULTIARCH #define DECL2(arch) \ if (XCONCATX2(have,arch)()) \ { \ diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c index 724e629..053dfba 100644 --- a/libsharp/sharp_core_avx.c +++ b/libsharp/sharp_core_avx.c @@ 
-1,11 +1,8 @@ -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - +#ifdef MULTIARCH #define XCONCATX(a,b) a##_##b #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) #define ARCH avx -#pragma GCC target("avx") #include "sharp_core_inc.c" - #endif diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c index a7ab0a7..ca0a3b9 100644 --- a/libsharp/sharp_core_avx2.c +++ b/libsharp/sharp_core_avx2.c @@ -1,11 +1,8 @@ -#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - +#ifdef MULTIARCH #define XCONCATX(a,b) a##_##b #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) #define ARCH avx2 -#pragma GCC target("avx2") #include "sharp_core_inc.c" - #endif diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c index 7f17429..5781e3c 100644 --- a/libsharp/sharp_core_avx512f.c +++ b/libsharp/sharp_core_avx512f.c @@ -1,11 +1,8 @@ -#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - +#ifdef MULTIARCH #define XCONCATX(a,b) a##_##b #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) #define ARCH avx512f -#pragma GCC target("avx512f") #include "sharp_core_inc.c" - #endif diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c index 793151f..bb0af2c 100644 --- a/libsharp/sharp_core_fma.c +++ b/libsharp/sharp_core_fma.c @@ -1,11 +1,8 @@ -#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - +#ifdef MULTIARCH #define XCONCATX(a,b) a##_##b #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) #define ARCH fma -#pragma GCC target("fma") #include "sharp_core_inc.c" - #endif diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c index d71de74..9b7f67b 100644 --- a/libsharp/sharp_core_fma4.c +++ b/libsharp/sharp_core_fma4.c @@ -1,11 +1,8 @@ -#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && 
(__GNUC__>=6) - +#ifdef MULTIARCH #define XCONCATX(a,b) a##_##b #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) #define ARCH fma4 -#pragma GCC target("fma4") #include "sharp_core_inc.c" - #endif From e231a0e184a23aea0c2a40b5253f4ad8f59bc665 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 21 Jan 2019 11:14:19 +0100 Subject: [PATCH 3/5] better documentation --- COMPILE | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/COMPILE b/COMPILE index 8a5f3cd..5b1c5b2 100644 --- a/COMPILE +++ b/COMPILE @@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with Runtime CPU selection with gcc ------------------------------ -When using a recent gcc (6.0 and newer) on an x86_64 platform, the build -machinery will compile the time-critical functions for several different -architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate -implementation will be selected at runtime. -This only happens if you do _not_ explicitly specify a target architecture via -the compiler flags. I.e., please do _not_ specify "-march=native" or -"-mtarget=avx" or similar if you want a portable binary that will run -efficiently on different x86_64 CPUs. +When using a recent gcc (6.0 and newer) or a recent clang (successfully tested +with versions 6 and 7) on an x86_64 platform, the build machinery can compile +the time-critical functions for several different architectures (SSE2, AVX, +AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected +at runtime. +This is enabled by passing "-DMULTIARCH" as part of the CFLAGS. +If this is enabled, please do _not_ specify "-march=native" or +"-mavx" or similar! If you are compiling libsharp for a particular target CPU only, or if you are using a different compiler, however, "-march-native" should be used. The resulting binary will most likely not run on other computers, though. 
@@ -65,16 +65,16 @@ Example configure invocations ============================= GCC, OpenMP, portable binary: -CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure +CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure GCC, no OpenMP, portable binary: -CFLAGS="-std=c99 -O3 -ffast-math" ./configure +CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure -Clang, OpenMP, nonportable binary: -CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure +Clang, OpenMP, portable binary: +CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure Intel C compiler, OpenMP, nonportable binary: -CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure +CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure MPI support, nonportable binary: CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure From 7ef585e3bb164b02d7a6116b26456d8f97dd571c Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 21 Jan 2019 11:30:05 +0100 Subject: [PATCH 4/5] cleanup --- Makefile.am | 21 ++++++++++----------- libsharp/sharp_core.c | 6 ++---- libsharp/sharp_core_avx.c | 8 -------- libsharp/sharp_core_avx2.c | 8 -------- libsharp/sharp_core_avx512f.c | 8 -------- libsharp/sharp_core_fma.c | 8 -------- libsharp/sharp_core_fma4.c | 8 -------- libsharp/sharp_core_inc.c | 8 ++++++++ 8 files changed, 20 insertions(+), 55 deletions(-) delete mode 100644 libsharp/sharp_core_avx.c delete mode 100644 libsharp/sharp_core_avx2.c delete mode 100644 libsharp/sharp_core_avx512f.c delete mode 100644 libsharp/sharp_core_fma.c delete mode 100644 libsharp/sharp_core_fma4.c diff --git a/Makefile.am b/Makefile.am index a51f568..b0b09ee 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,11 +18,11 @@ libsharp_la_SOURCES = \ libsharp/sharp_vecsupport.h \ libsharp/sharp_ylmgen_c.h -libavx_la_SOURCES = libsharp/sharp_core_avx.c -libavx2_la_SOURCES = libsharp/sharp_core_avx2.c 
-libfma_la_SOURCES = libsharp/sharp_core_fma.c -libfma4_la_SOURCES = libsharp/sharp_core_fma4.c -libavx512f_la_SOURCES = libsharp/sharp_core_avx512f.c +libavx_la_SOURCES = libsharp/sharp_core_inc.c +libavx2_la_SOURCES = libsharp/sharp_core_inc.c +libfma_la_SOURCES = libsharp/sharp_core_inc.c +libfma4_la_SOURCES = libsharp/sharp_core_inc.c +libavx512f_la_SOURCES = libsharp/sharp_core_inc.c noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la @@ -35,7 +35,6 @@ include_HEADERS = \ libsharp/sharp_cxx.h EXTRA_DIST = \ - libsharp/sharp_core_inc.c \ runtest.sh check_PROGRAMS = sharp_testsuite @@ -46,11 +45,11 @@ TESTS = runtest.sh AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@ -libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f +libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx +libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 +libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma +libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 +libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f pkgconfigdir = $(libdir)/pkgconfig nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index e4c1059..6cfb694 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -1,9 +1,7 @@ -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - #define ARCH default +#define GENERIC_ARCH #include "sharp_core_inc.c" +#undef GENERIC_ARCH #undef ARCH typedef void (*t_inner_loop) (sharp_job *job, const int *ispair, diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c deleted file mode 100644 index 053dfba..0000000 --- a/libsharp/sharp_core_avx.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MULTIARCH -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) 
XCONCATX2(a,ARCH) - -#define ARCH avx -#include "sharp_core_inc.c" -#endif diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c deleted file mode 100644 index ca0a3b9..0000000 --- a/libsharp/sharp_core_avx2.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MULTIARCH -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH avx2 -#include "sharp_core_inc.c" -#endif diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c deleted file mode 100644 index 5781e3c..0000000 --- a/libsharp/sharp_core_avx512f.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MULTIARCH -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH avx512f -#include "sharp_core_inc.c" -#endif diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c deleted file mode 100644 index bb0af2c..0000000 --- a/libsharp/sharp_core_fma.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MULTIARCH -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH fma -#include "sharp_core_inc.c" -#endif diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c deleted file mode 100644 index 9b7f67b..0000000 --- a/libsharp/sharp_core_fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MULTIARCH -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH fma4 -#include "sharp_core_inc.c" -#endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index d229a49..96981f6 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -29,6 +29,12 @@ * \author Martin Reinecke */ +#if (defined(MULTIARCH) || defined(GENERIC_ARCH)) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + #include #include #include @@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void) { return 
xstr(ARCH); } + +#endif From 4db6d3d8251e0f2c3e1b67f1273af56a455989fb Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 21 Jan 2019 11:49:50 +0100 Subject: [PATCH 5/5] better diagnostics --- libsharp/sharp_core.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 6cfb694..f54a058 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -18,6 +18,11 @@ static t_architecture architecture_ = NULL; #ifdef MULTIARCH +#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \ + defined(__AVX2__) || defined(__AVX__)) +#error MULTIARCH specified but platform-specific flags detected +#endif + #define DECL(arch) \ static int XCONCATX2(have,arch)(void) \ { \ @@ -37,21 +42,11 @@ int XCONCATX2(sharp_veclen,arch) (void); \ int XCONCATX2(sharp_max_nvec,arch) (int spin); \ const char *XCONCATX2(sharp_architecture,arch) (void); -#if (!defined(__AVX512F__)) DECL(avx512f) -#endif -#if (!defined(__FMA4__)) DECL(fma4) -#endif -#if (!defined(__FMA__)) DECL(fma) -#endif -#if (!defined(__AVX2__)) DECL(avx2) -#endif -#if (!defined(__AVX__)) DECL(avx) -#endif #endif @@ -67,21 +62,11 @@ static void assign_funcs(void) architecture_ = XCONCATX2(sharp_architecture,arch); \ return; \ } -#if (!defined(__AVX512F__)) DECL2(avx512f) -#endif -#if (!defined(__FMA4__)) DECL2(fma4) -#endif -#if (!defined(__FMA__)) DECL2(fma) -#endif -#if (!defined(__AVX2__)) DECL2(avx2) -#endif -#if (!defined(__AVX__)) DECL2(avx) -#endif #endif inner_loop_ = inner_loop_default; veclen_ = sharp_veclen_default;