heavy tweaking

This commit is contained in:
Martin Reinecke 2018-10-26 10:34:02 +02:00
parent 2affc388ad
commit f30d99cb2f
8 changed files with 2370 additions and 237 deletions

View file

@ -5,10 +5,13 @@ lib_LTLIBRARIES = libsharp.la
src_sharp = \
c_utils/c_utils.c \
c_utils/c_utils.h \
pocketfft/pocketfft.c \
pocketfft/pocketfft.h \
libsharp/sharp.c \
libsharp/sharp_almhelpers.c \
libsharp/sharp_announce.c \
libsharp/sharp_core.c \
libsharp/sharp_core_avx.c \
libsharp/sharp_geomhelpers.c \
libsharp/sharp_legendre_roots.c \
libsharp/sharp_ylmgen_c.c \
@ -30,6 +33,7 @@ include_HEADERS = \
libsharp/sharp_cxx.h
EXTRA_DIST = \
libsharp/sharp_core_inc0.c \
libsharp/sharp_core_inc.c \
libsharp/sharp_core_inc2.c \
libsharp/sharp_core_inchelper.c

View file

@ -2,6 +2,8 @@ AC_INIT([libsharp], [1.0.0])
AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
AM_MAINTAINER_MODE([enable])
AC_OPENMP
dnl
dnl Needed for linking on Windows.
dnl Protect with m4_ifdef because AM_PROG_AR is required in
@ -68,40 +70,10 @@ AX_CHECK_COMPILE_FLAG([-fno-trapping-math],[CFLAGS="$CFLAGS -fno-trapping-math"]
AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"])
AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"])
AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"])
CFLAGS="$CFLAGS $OPENMP_CFLAGS"
# adding the lib to the files to link
LIBS="-lm"
LIBS="-lpocketfft $LIBS"
# introduce the optional configure parameter for a non-standard install prefix of pocketfft
AC_ARG_WITH([pocketfft],
[AS_HELP_STRING([--with-pocketfft=prefix],
[try this for a non-standard install prefix of the pocketfft library])],
[POCKETFFTPATHSET=1],
[POCKETFFTPATHSET=0])
# if optional parameter used, extend path flags for compiler and linker
if test $POCKETFFTPATHSET = 1 ; then
# extend the compiler and linker flags according to the path set
AM_CFLAGS="$AM_CFLAGS -I$with_pocketfft/include"
AM_LDFLAGS="$AM_LDFLAGS -L$with_pocketfft/lib"
fi
##########################################################################
# check for pocketfft
##########################################################################
OLD_CFLAGS=$CFLAGS;
OLD_LDFLAGS=$LDFLAGS;
CFLAGS="$AM_CFLAGS $CFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
AC_CHECK_HEADERS([pocketfft/pocketfft.h],
[pocketfft_header_found=yes; break;])
AS_IF([test "x$pocketfft_header_found" != "xyes"],
[AC_MSG_ERROR([Unable to find pocketfft header])])
AC_SEARCH_LIBS([make_rfft_plan],[pocketfft],,AC_MSG_ERROR([pocketfft not found]))
CFLAGS=$OLD_CFLAGS
LDFLAGS=$OLD_LDFLAGS
AC_PROG_LIBTOOL

View file

@ -29,212 +29,25 @@
* \author Martin Reinecke
*/
#include <complex.h>
#include <math.h>
#include <string.h>
#include "sharp_vecsupport.h"
#include "sharp_complex_hacks.h"
#include "sharp_ylmgen_c.h"
#include "sharp.h"
#include "sharp_core.h"
#include "c_utils.h"
// Instantiate the dispatcher template from sharp_core_inc0.c for the baseline
// build flags. The template names its function by pasting ARCH onto
// "inner_loop", so with ARCH set to _default it is emitted as
// inner_loop_default.
#define ARCH _default
#include "sharp_core_inc0.c"
#undef ARCH
typedef complex double dcmplx;
// must be in the range [0;6]
#define MAXJOB_SPECIAL 2
#define XCONCAT2(a,b) a##_##b
#define CONCAT2(a,b) XCONCAT2(a,b)
#define XCONCAT3(a,b,c) a##_##b##_##c
#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
#define nvec 1
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 2
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 3
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 4
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 5
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 6
#include "sharp_core_inchelper.c"
#undef nvec
// Prototype of the AVX variant of the dispatcher. It is only declared when
// this translation unit is itself compiled without AVX by GCC >= 6 on x86_64;
// the same condition guards the contents of sharp_core_avx.c, which
// instantiates sharp_core_inc0.c with ARCH set to _avx.
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *mlim);
#endif
/* Entry point of the computational core: forwards its arguments unchanged to
 * one of the inner_loop_* kernels.
 *
 * NOTE(review): this span is taken from a diff hunk whose header reports 212
 * old lines being replaced by 25 new ones, and the +/- markers are missing.
 * Lines of the removed hand-written dispatch (the switch statements) and of
 * the new runtime AVX dispatch therefore appear interleaved, and the
 * preprocessor conditionals near the end do not balance as shown. Confirm
 * against the committed sharp_core.c before relying on this text. */
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *mlim)
{
// njobs: number of simultaneous transforms; nv: vector count taken from the
// job flags (masked with SHARP_NVMAX).
int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
if (njobs<=MAXJOB_SPECIAL)
{
// Selector packs both values into one integer: high nibble = njobs,
// low nibble = nv, hence the hexadecimal case labels 0x<njobs><nv>.
switch (njobs*16+nv)
{
#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
case 0x11:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x12:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x13:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x14:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x15:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x16:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
case 0x21:
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x22:
CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x23:
CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x24:
CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x25:
CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x26:
CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
case 0x31:
CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x32:
CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x33:
CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x34:
CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x35:
CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x36:
CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
case 0x41:
CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x42:
CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x43:
CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x44:
CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x45:
CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x46:
CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
case 0x51:
CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x52:
CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x53:
CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x54:
CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x55:
CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x56:
CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
case 0x61:
CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x62:
CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x63:
CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x64:
CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x65:
CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
case 0x66:
CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
return;
#endif
}
}
#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
// Runtime CPU detection via the GCC builtins: when the processor reports AVX
// support, hand over to the separately compiled AVX variant.
__builtin_cpu_init();
if (__builtin_cpu_supports("avx"))
inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
else
{
// Generic kernels: selected by vector count only, the number of transforms
// is passed as an extra run-time argument.
switch (nv)
{
case 1:
CONCAT2(inner_loop,1)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 2:
CONCAT2(inner_loop,2)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 3:
CONCAT2(inner_loop,3)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 4:
CONCAT2(inner_loop,4)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 5:
CONCAT2(inner_loop,5)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
case 6:
CONCAT2(inner_loop,6)
(job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
return;
}
}
#endif
UTIL_FAIL("Incorrect vector parameters");
// Baseline variant, generated from sharp_core_inc0.c with ARCH set to
// _default.
inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
}

14
libsharp/sharp_core_avx.c Normal file
View file

@ -0,0 +1,14 @@
// Builds a second copy of the computational core with AVX code generation
// enabled. The whole file is empty unless the compiler is GCC >= 6 targeting
// x86_64 and AVX is not already part of the baseline build flags.
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
// if we arrive here, we can benefit from an additional AVX version
// NOTE(review): this diagnostic is emitted on every build that takes this
// branch - confirm it is meant to stay.
#warning entering gcc and x86_64 specific code branch
// The included template pastes ARCH onto "inner_loop", so the dispatcher
// defined below is named inner_loop_avx.
#define ARCH _avx
// NOTE(review): __AVX__ is a compiler-reserved macro defined by hand here,
// presumably so that the headers pulled in by the template select their AVX
// code paths - confirm whether the target pragma below already provides it.
#define __AVX__
// Save the current option state, switch the target to AVX for the code
// included below, then restore the saved state.
#pragma GCC push_options
#pragma GCC target("avx")
#include "sharp_core_inc0.c"
#pragma GCC pop_options
#undef __AVX__
#undef ARCH
#endif

242
libsharp/sharp_core_inc0.c Normal file
View file

@ -0,0 +1,242 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc0.c
* Computational core
*
* Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke
*/
#include <complex.h>
#include <math.h>
#include <string.h>
#include "sharp_vecsupport.h"
#include "sharp_complex_hacks.h"
#include "sharp_ylmgen_c.h"
#include "sharp.h"
#include "sharp_core.h"
#include "c_utils.h"
// Shorthand for a double-precision complex number.
typedef complex double dcmplx;
// Largest number of simultaneous transforms for which the dispatcher below
// calls a kernel specialised on the job count (inner_loop_<nvec>_<njobs>);
// larger counts go to the generic inner_loop_<nvec> kernels.
// must be in the range [0;6]
#define MAXJOB_SPECIAL 2
// Token-pasting helpers. Each comes as a pair so that macro arguments (such
// as ARCH) are expanded before the ## operator is applied.
// CONCATX joins without a separator, CONCAT2/CONCAT3 join with underscores.
#define XCONCATX(a,b) a##b
#define CONCATX(a,b) XCONCATX(a,b)
#define XCONCAT2(a,b) a##_##b
#define CONCAT2(a,b) XCONCAT2(a,b)
#define XCONCAT3(a,b,c) a##_##b##_##c
#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
// Include the helper file once for every vector count from 1 to 6.
// NOTE(review): presumably each pass defines the inner_loop_<nvec> and
// inner_loop_<nvec>_<njobs> kernels that the dispatcher below calls - the
// helper file is not shown here, confirm.
#define nvec 1
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 2
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 3
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 4
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 5
#include "sharp_core_inchelper.c"
#undef nvec
#define nvec 6
#include "sharp_core_inchelper.c"
#undef nvec
/* Argument list that every kernel receives unchanged from the dispatcher. */
#define SHARP_KERNEL_ARGS job, ispair,cth,sth,llim,ulim,gen,mi,mlim

/* One switch case for the kernel specialised on both the vector count and the
   job count. The label equals the selector value njobs*16+nvec. */
#define SHARP_CASE_SPECIAL(vecs,jobs) \
  case (jobs)*16+(vecs): \
    CONCAT3(inner_loop,vecs,jobs) (SHARP_KERNEL_ARGS); \
    return;

/* The cases for all six vector counts belonging to one job count. */
#define SHARP_CASES_SPECIAL(jobs) \
  SHARP_CASE_SPECIAL(1,jobs) \
  SHARP_CASE_SPECIAL(2,jobs) \
  SHARP_CASE_SPECIAL(3,jobs) \
  SHARP_CASE_SPECIAL(4,jobs) \
  SHARP_CASE_SPECIAL(5,jobs) \
  SHARP_CASE_SPECIAL(6,jobs)

/* One switch case for the generic kernel of a given vector count; the number
   of transforms is handed over as an additional run-time argument. */
#define SHARP_CASE_GENERIC(vecs) \
  case vecs: \
    CONCAT2(inner_loop,vecs) (SHARP_KERNEL_ARGS,job->ntrans); \
    return;

/* Dispatcher of the computational core for the architecture named by ARCH
 * (the function is emitted as inner_loop<ARCH>).
 *
 * Reads the number of transforms (job->ntrans) and the vector count
 * (job->flags masked with SHARP_NVMAX) and forwards all arguments unchanged
 * to the matching kernel:
 *  - up to MAXJOB_SPECIAL transforms: kernel inner_loop_<nvec>_<njobs>;
 *  - more transforms (only if SHARP_MAXTRANS allows them): kernel
 *    inner_loop_<nvec>, with the transform count as an extra argument.
 * If no kernel matches, UTIL_FAIL is invoked. */
void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
  const int *mlim)
  {
  int num_jobs=job->ntrans;
  int vec_count=job->flags&SHARP_NVMAX;

  if (num_jobs<=MAXJOB_SPECIAL)
    {
    /* high nibble: job count, low nibble: vector count */
    switch (num_jobs*16+vec_count)
      {
#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
      SHARP_CASES_SPECIAL(1)
#endif
#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
      SHARP_CASES_SPECIAL(2)
#endif
#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
      SHARP_CASES_SPECIAL(3)
#endif
#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
      SHARP_CASES_SPECIAL(4)
#endif
#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
      SHARP_CASES_SPECIAL(5)
#endif
#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
      SHARP_CASES_SPECIAL(6)
#endif
      }
    }
#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
  else
    {
    switch (vec_count)
      {
      SHARP_CASE_GENERIC(1)
      SHARP_CASE_GENERIC(2)
      SHARP_CASE_GENERIC(3)
      SHARP_CASE_GENERIC(4)
      SHARP_CASE_GENERIC(5)
      SHARP_CASE_GENERIC(6)
      }
    }
#endif
  /* only reached when none of the cases above matched */
  UTIL_FAIL("Incorrect vector parameters");
  }

#undef SHARP_CASE_GENERIC
#undef SHARP_CASES_SPECIAL
#undef SHARP_CASE_SPECIAL
#undef SHARP_KERNEL_ARGS

View file

@ -46,12 +46,6 @@
#endif
#if (VLEN==1)
#define VLEN_s 1
#else
#define VLEN_s (2*VLEN)
#endif
#ifndef USE_FMA4
#ifdef __FMA4__
#define USE_FMA4 1

2060
pocketfft/pocketfft.c Normal file

File diff suppressed because it is too large Load diff

34
pocketfft/pocketfft.h Normal file
View file

@ -0,0 +1,34 @@
/*
* This file is part of pocketfft.
* Licensed under a 3-clause BSD style license - see LICENSE.md
*/
/*! \file pocketfft.h
* Public interface of the pocketfft library
*
* Copyright (C) 2008-2018 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef POCKETFFT_H
#define POCKETFFT_H
#include <stdlib.h>

/* Give the declarations C linkage when the header is pulled into a C++
   translation unit; this has no effect for C compilers. */
#ifdef __cplusplus
extern "C" {
#endif

/* NOTE(review): the descriptions below are inferred from the declarations
   alone - the implementation (pocketfft.c) is not visible here. Confirm the
   data layout of c[], the meaning of fct and the return codes there. */

/* Opaque plan handle for the cfft_* functions (presumably FFTs of complex
   data). The structure is only declared here. */
struct cfft_plan_i;
typedef struct cfft_plan_i * cfft_plan;
/* Creates a plan for the given length; release it with destroy_cfft_plan. */
cfft_plan make_cfft_plan (size_t length);
void destroy_cfft_plan (cfft_plan plan);
/* Backward/forward transform of c using the plan. fct is presumably a scaling
   factor applied to the result; the int return is presumably a status code. */
int cfft_backward(cfft_plan plan, double c[], double fct);
int cfft_forward(cfft_plan plan, double c[], double fct);
/* Returns a length associated with the plan (presumably the one it was
   created with). */
size_t cfft_length(cfft_plan plan);

/* Opaque plan handle for the rfft_* functions (presumably FFTs of real
   data). The structure is only declared here. */
struct rfft_plan_i;
typedef struct rfft_plan_i * rfft_plan;
/* Creates a plan for the given length; release it with destroy_rfft_plan. */
rfft_plan make_rfft_plan (size_t length);
void destroy_rfft_plan (rfft_plan plan);
/* Backward/forward transform of c using the plan; see the notes on the
   cfft_* counterparts above. */
int rfft_backward(rfft_plan plan, double c[], double fct);
int rfft_forward(rfft_plan plan, double c[], double fct);
/* Returns a length associated with the plan (presumably the one it was
   created with). */
size_t rfft_length(rfft_plan plan);

#ifdef __cplusplus
}
#endif
#endif