From f30d99cb2fe46ee2df55843b308386878681b13b Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 26 Oct 2018 10:34:02 +0200 Subject: [PATCH] heavy tweaking --- Makefile.am | 4 + configure.ac | 34 +- libsharp/sharp_core.c | 213 +--- libsharp/sharp_core_avx.c | 14 + libsharp/sharp_core_inc0.c | 242 +++++ libsharp/sharp_vecutil.h | 6 - pocketfft/pocketfft.c | 2060 ++++++++++++++++++++++++++++++++++++ pocketfft/pocketfft.h | 34 + 8 files changed, 2370 insertions(+), 237 deletions(-) create mode 100644 libsharp/sharp_core_avx.c create mode 100644 libsharp/sharp_core_inc0.c create mode 100644 pocketfft/pocketfft.c create mode 100644 pocketfft/pocketfft.h diff --git a/Makefile.am b/Makefile.am index 3050371..5cd60d4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,10 +5,13 @@ lib_LTLIBRARIES = libsharp.la src_sharp = \ c_utils/c_utils.c \ c_utils/c_utils.h \ + pocketfft/pocketfft.c \ + pocketfft/pocketfft.h \ libsharp/sharp.c \ libsharp/sharp_almhelpers.c \ libsharp/sharp_announce.c \ libsharp/sharp_core.c \ + libsharp/sharp_core_avx.c \ libsharp/sharp_geomhelpers.c \ libsharp/sharp_legendre_roots.c \ libsharp/sharp_ylmgen_c.c \ @@ -30,6 +33,7 @@ include_HEADERS = \ libsharp/sharp_cxx.h EXTRA_DIST = \ + libsharp/sharp_core_inc0.c \ libsharp/sharp_core_inc.c \ libsharp/sharp_core_inc2.c \ libsharp/sharp_core_inchelper.c diff --git a/configure.ac b/configure.ac index acad8ef..34626bc 100644 --- a/configure.ac +++ b/configure.ac @@ -2,6 +2,8 @@ AC_INIT([libsharp], [1.0.0]) AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror]) AM_MAINTAINER_MODE([enable]) +AC_OPENMP + dnl dnl Needed for linking on Windows. dnl Protect with m4_ifdef because AM_PROG_AR is required in @@ -68,40 +70,10 @@ AX_CHECK_COMPILE_FLAG([-fno-trapping-math],[CFLAGS="$CFLAGS -fno-trapping-math"] AX_CHECK_COMPILE_FLAG([-fno-rounding-math],[CFLAGS="$CFLAGS -fno-rounding-math"]) AX_CHECK_COMPILE_FLAG([-fno-signaling-nans],[CFLAGS="$CFLAGS -fno-signaling-nans"]) AX_CHECK_COMPILE_FLAG([-fcx-limited-range],[CFLAGS="$CFLAGS -fcx-limited-range"]) +CFLAGS="$CFLAGS $OPENMP_CFLAGS" # adding the lib to the files to link LIBS="-lm" -LIBS="-lpocketfft $LIBS" -# introduce the optional configure parameter for a non-standard install prefix of XXX -AC_ARG_WITH([pocketfft], - [AS_HELP_STRING([--with-pocketfft=prefix], - [try this for a non-standard install prefix of the pocketfft library])], - [POCKETFFTPATHSET=1], - [POCKETFFTPATHSET=0]) - -# if optional parameter used, extend path flags for compliler and linker -if test $POCKETFFTPATHSET = 1 ; then - # extend the compiler and linker flags according to the path set - AM_CFLAGS="$AM_CFLAGS -I$with_pocketfft/include" - AM_LDFLAGS="$AM_LDFLAGS -L$with_pocketfft/lib" -fi - -########################################################################## -# check for pocketfft -########################################################################## -OLD_CFLAGS=$CFLAGS; -OLD_LDFLAGS=$LDFLAGS; -CFLAGS="$AM_CFLAGS $CFLAGS" -LDFLAGS="$AM_LDFLAGS $LDFLAGS" -AC_CHECK_HEADERS([pocketfft/pocketfft.h], - [pocketfft_header_found=yes; break;]) - -AS_IF([test "x$pocketfft_header_found" != "xyes"], - [AC_MSG_ERROR([Unable to find pocketfft header])]) - -AC_SEARCH_LIBS([make_rfft_plan],[pocketfft],,AC_MSG_ERROR([pocketfft not found])) -CFLAGS=$OLD_CFLAGS -LDFLAGS=$OLD_LDFLAGS AC_PROG_LIBTOOL diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 7cd2f17..f052555 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -29,212 +29,25 @@ * \author Martin Reinecke */ -#include -#include -#include -#include "sharp_vecsupport.h" -#include "sharp_complex_hacks.h" -#include "sharp_ylmgen_c.h" -#include "sharp.h" -#include "sharp_core.h" -#include "c_utils.h" +#define ARCH _default +#include "sharp_core_inc0.c" +#undef ARCH -typedef complex double dcmplx; - -// must be in the range [0;6] -#define MAXJOB_SPECIAL 2 - -#define XCONCAT2(a,b) a##_##b -#define CONCAT2(a,b) XCONCAT2(a,b) -#define XCONCAT3(a,b,c) a##_##b##_##c -#define CONCAT3(a,b,c) XCONCAT3(a,b,c) - -#define nvec 1 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 2 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 3 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 4 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 5 -#include "sharp_core_inchelper.c" -#undef nvec - -#define nvec 6 -#include "sharp_core_inchelper.c" -#undef nvec +#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +void inner_loop_avx (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, + const int *mlim); +#endif void inner_loop (sharp_job *job, const int *ispair,const double *cth, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX; - if (njobs<=MAXJOB_SPECIAL) - { - switch (njobs*16+nv) - { -#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1)) - case 0x11: - CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x12: - CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x13: - CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x14: - CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x15: - CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x16: - CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif -#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2)) - case 0x21: - CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x22: - CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x23: - CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x24: - CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x25: - CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x26: - CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif -#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3)) - case 0x31: - CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x32: - CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x33: - CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x34: - CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x35: - CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x36: - CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif -#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4)) - case 0x41: - CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x42: - CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x43: - CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x44: - CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x45: - CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x46: - CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif -#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5)) - case 0x51: - CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x52: - CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x53: - CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x54: - CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x55: - CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x56: - CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif -#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6)) - case 0x61: - CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x62: - CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x63: - CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x64: - CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x65: - CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; - case 0x66: - CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); - return; -#endif - } - } -#if (SHARP_MAXTRANS>MAXJOB_SPECIAL) +#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx")) + inner_loop_avx (job, ispair, cth, sth, llim, ulim, gen, mi, mlim); else - { - switch (nv) - { - case 1: - CONCAT2(inner_loop,1) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - case 2: - CONCAT2(inner_loop,2) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - case 3: - CONCAT2(inner_loop,3) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - case 4: - CONCAT2(inner_loop,4) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - case 5: - CONCAT2(inner_loop,5) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - case 6: - CONCAT2(inner_loop,6) - (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); - return; - } - } #endif - UTIL_FAIL("Incorrect vector parameters"); + inner_loop_default (job, ispair, cth, sth, llim, ulim, gen, mi, mlim); } diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c new file mode 100644 index 0000000..dc6ee48 --- /dev/null +++ b/libsharp/sharp_core_avx.c @@ -0,0 +1,14 @@ +#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +// if we arrive here, we can benefit from an additional AVX version +#warning entering gcc and x86_64 specific code branch + +#define ARCH _avx +#define __AVX__ +#pragma GCC push_options +#pragma GCC target("avx") +#include "sharp_core_inc0.c" +#pragma GCC pop_options +#undef __AVX__ +#undef ARCH + +#endif diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c new file mode 100644 index 0000000..8590d2c --- /dev/null +++ b/libsharp/sharp_core_inc0.c @@ -0,0 +1,242 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_core_inc0.c + * Computational core + * + * Copyright (C) 2012-2013 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include +#include +#include "sharp_vecsupport.h" +#include "sharp_complex_hacks.h" +#include "sharp_ylmgen_c.h" +#include "sharp.h" +#include "sharp_core.h" +#include "c_utils.h" + +typedef complex double dcmplx; + +// must be in the range [0;6] +#define MAXJOB_SPECIAL 2 + +#define XCONCATX(a,b) a##b +#define CONCATX(a,b) XCONCATX(a,b) +#define XCONCAT2(a,b) a##_##b +#define CONCAT2(a,b) XCONCAT2(a,b) +#define XCONCAT3(a,b,c) a##_##b##_##c +#define CONCAT3(a,b,c) XCONCAT3(a,b,c) + +#define nvec 1 +#include "sharp_core_inchelper.c" +#undef nvec + +#define nvec 2 +#include "sharp_core_inchelper.c" +#undef nvec + +#define nvec 3 +#include "sharp_core_inchelper.c" +#undef nvec + +#define nvec 4 +#include "sharp_core_inchelper.c" +#undef nvec + +#define nvec 5 +#include "sharp_core_inchelper.c" +#undef nvec + +#define nvec 6 +#include "sharp_core_inchelper.c" +#undef nvec + +void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, + const int *mlim) + { + int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX; + if (njobs<=MAXJOB_SPECIAL) + { + switch (njobs*16+nv) + { +#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1)) + case 0x11: + CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x12: + CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x13: + CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x14: + CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x15: + CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x16: + CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif +#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2)) + case 0x21: + CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x22: + CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x23: + CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x24: + CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x25: + CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x26: + CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif +#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3)) + case 0x31: + CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x32: + CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x33: + CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x34: + CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x35: + CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x36: + CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif +#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4)) + case 0x41: + CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x42: + CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x43: + CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x44: + CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x45: + CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x46: + CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif +#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5)) + case 0x51: + CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x52: + CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x53: + CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x54: + CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x55: + CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x56: + CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif +#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6)) + case 0x61: + CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x62: + CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x63: + CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x64: + CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x65: + CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; + case 0x66: + CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + return; +#endif + } + } +#if (SHARP_MAXTRANS>MAXJOB_SPECIAL) + else + { + switch (nv) + { + case 1: + CONCAT2(inner_loop,1) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + case 2: + CONCAT2(inner_loop,2) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + case 3: + CONCAT2(inner_loop,3) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + case 4: + CONCAT2(inner_loop,4) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + case 5: + CONCAT2(inner_loop,5) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + case 6: + CONCAT2(inner_loop,6) + (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans); + return; + } + } +#endif + UTIL_FAIL("Incorrect vector parameters"); + } diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index f6161ca..24a2e94 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -46,12 +46,6 @@ #endif -#if (VLEN==1) -#define VLEN_s 1 -#else -#define VLEN_s (2*VLEN) -#endif - #ifndef USE_FMA4 #ifdef __FMA4__ #define USE_FMA4 1 diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c new file mode 100644 index 0000000..562ebc9 --- /dev/null +++ b/pocketfft/pocketfft.c @@ -0,0 +1,2060 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/* + * Main implementation file. + * + * Copyright (C) 2004-2018 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include + +#include "pocketfft.h" + +#define RALLOC(type,num) \ + ((type *)malloc((num)*sizeof(type))) +#define DEALLOC(ptr) \ + do { free(ptr); (ptr)=NULL; } while(0) + +#define SWAP(a,b,type) \ + do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) + +#ifdef __GNUC__ +#define NOINLINE __attribute__((noinline)) +#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) +#else +#define NOINLINE +#define WARN_UNUSED_RESULT +#endif + +#if 0 +static void fracsincos(size_t m, size_t n, double *restrict res) + { + static const long double twopi=6.283185307179586476925286766559006L; + long double arg = twopi*(long double)m/((long double)n); + res[0] = (double)cosl(arg); res[1] = (double)sinl(arg); + } +#endif + +// adapted from https://stackoverflow.com/questions/42792939/ +// CAUTION: this function only works for arguments in the range [-0.25; 0.25]! +static void my_sincosm1pi (double a, double *restrict res) + { + double s = a * a; + /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ + double r = -1.0369917389758117e-4; + r = fma (r, s, 1.9294935641298806e-3); + r = fma (r, s, -2.5806887942825395e-2); + r = fma (r, s, 2.3533063028328211e-1); + r = fma (r, s, -1.3352627688538006e+0); + r = fma (r, s, 4.0587121264167623e+0); + r = fma (r, s, -4.9348022005446790e+0); + double c = r*s; + /* Approximate sin(pi*x) for x in [-0.25,0.25] */ + r = 4.6151442520157035e-4; + r = fma (r, s, -7.3700183130883555e-3); + r = fma (r, s, 8.2145868949323936e-2); + r = fma (r, s, -5.9926452893214921e-1); + r = fma (r, s, 2.5501640398732688e+0); + r = fma (r, s, -5.1677127800499516e+0); + s = s * a; + r = r * s; + s = fma (a, 3.1415926535897931e+0, r); + res[0] = c; + res[1] = s; + } + +NOINLINE static void calc_first_octant(size_t den, double * restrict res) + { + size_t n = (den+4)>>3; + if (n==0) return; + res[0]=1.; res[1]=0.; + if (n==1) return; + size_t l1=(size_t)sqrt(n); + for (size_t i=1; in) end = n-start; + for (size_t i=1; i>2; + size_t i=0, idx1=0, idx2=2*ndone-2; + for (; i+1>1; + double * p = res+n-1; + calc_first_octant(n<<2, p); + int i4=0, in=n, i=0; + for (; i4<=in-i4; ++i, i4+=4) // octant 0 + { + res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; + } + for (; i4-in <= 0; ++i, i4+=4) // octant 1 + { + int xm = in-i4; + res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 + { + int xm = i4-in; + res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i>2; + if ((n&7)==0) + res[quart] = res[quart+1] = hsqt2; + for (size_t i=2, j=2*quart-2; i>1; + if ((n&3)==0) + { res[half] = 0.; res[half+1] = 1.; } + for (size_t i=2, j=2*half-2; i>1))<<1)==n) + { res=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while (((tmp=(n/x))*x)==n) + { + res=x; + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) res=n; + + return res; + } + +static double cost_guess (size_t n) + { + const double lfp=1.1; // penalty for non-hardcoded larger factors + size_t ni=n; + double result=0.; + size_t tmp; + while (((tmp=(n>>1))<<1)==n) + { result+=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while ((tmp=(n/x))*x==n) + { + result+= (x<=5) ? x : lfp*x; // penalize larger prime factors + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) result+=(n<=5) ? n : lfp*n; + + return result*ni; + } + +/* returns the smallest composite of 2, 3 and 5 which is >= n */ +static size_t good_size(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f2=1; f2=n) bestfac=f235711; + return bestfac; + } + +typedef struct cmplx { + double r,i; +} cmplx; + +#define NFCT 25 +typedef struct cfftp_fctdata + { + size_t fct; + cmplx *tw, *tws; + } cfftp_fctdata; + +typedef struct cfftp_plan_i + { + size_t length, nfct; + cmplx *mem; + cfftp_fctdata fct[NFCT]; + } cfftp_plan_i; +typedef struct cfftp_plan_i * cfftp_plan; + +#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } +#define MPC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; } +#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } +#define SCALEC(a,b) { a.r*=b; a.i*=b; } +#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] +#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] +#define WA(x,i) wa[(i)-1+(x)*(ido-1)] +/* a = b*c */ +#define MULPMC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } +#define MULMPC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } + +#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; } +/* a = b*c */ +#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; } +/* a *= b */ +#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; } + +NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc, + cmplx * restrict ch, const cmplx * restrict wa, const int sign) + { + const size_t cdim=2; + + if (ido==1) + for (size_t k=0; k0) + for (size_t k=0; k0) + for (size_t i=1; iip) iwal-=ip; + cmplx xwal=wal[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + cmplx xwal2=wal[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + cmplx xwal=wal[iwal]; + for (size_t ik=0; iklength==1) return 0; + size_t len=plan->length; + size_t l1=1, nf=plan->nfct; + cmplx *ch = RALLOC(cmplx, len); + if (!ch) return -1; + cmplx *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k1].fct; + size_t l2=ip*l1; + size_t ido = len/l2; + if (ip==4) pass4 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==2) pass2 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==3) pass3 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==5) pass5 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign); + else + { + if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign)) + { DEALLOC(ch); return -1; } + SWAP(p1,p2,cmplx *); + } + SWAP(p1,p2,cmplx *); + l1=l2; + } + if (p1!=c) + { + if (fct!=1.) + for (size_t i=0; ilength; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +static size_t cfftp_twsize (cfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>11) + twsize+=ip; + l1*=ip; + } + return twsize; + } + +WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn(length, twid); + size_t l1=1; + size_t memofs=0; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= length/(l1*ip); + plan->fct[k].tw=plan->mem+memofs; + memofs+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1]; + } + if (ip>11) + { + plan->fct[k].tws=plan->mem+memofs; + memofs+=ip; + for (size_t j=0; jfct[k].tws[j].r = twid[2*j*l1*ido]; + plan->fct[k].tws[j].i = twid[2*j*l1*ido+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +static cfftp_plan make_cfftp_plan (size_t length) + { + if (length==0) return NULL; + cfftp_plan plan = RALLOC(cfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + for (size_t i=0; ifct[i]=(cfftp_fctdata){0,0,0}; + plan->mem=0; + if (length==1) return plan; + if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=cfftp_twsize(plan); + plan->mem=RALLOC(cmplx,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (cfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +static void destroy_cfftp_plan (cfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct rfftp_fctdata + { + size_t fct; + double *tw, *tws; + } rfftp_fctdata; + +typedef struct rfftp_plan_i + { + size_t length, nfct; + double *mem; + rfftp_fctdata fct[NFCT]; + } rfftp_plan_i; +typedef struct rfftp_plan_i * rfftp_plan; + +#define WA(x,i) wa[(i)+(x)*(ido-1)] +#define PM(a,b,c,d) { a=c+d; b=c-d; } +/* (a+ib) = conj(c+id) * (e+if) */ +#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } + +#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] +#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] + +NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc, + double * restrict ch, const double * restrict wa) + { + const size_t cdim=2; + + for (size_t k=0; k1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; iklength==1) return 0; + size_t n=plan->length; + size_t l1=n, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k].fct; + size_t ido=n / l1; + l1 /= ip; + if(ip==4) + radf4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radf2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radf3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radf5(ido, l1, p1, p2, plan->fct[k].tw); + else + { + radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + } + SWAP (p1,p2,double *); + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_backward(rfftp_plan plan, double c[], double fct) + { + if (plan->length==1) return 0; + size_t n=plan->length; + size_t l1=1, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k=0; kfct[k].fct, + ido= n/(ip*l1); + if(ip==4) + radb4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radb2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radb3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radb5(ido, l1, p1, p2, plan->fct[k].tw); + else + radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + l1*=ip; + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_factorize (rfftp_plan plan) + { + size_t length=plan->length; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +static size_t rfftp_twsize(rfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>5) twsize+=2*ip; + l1*=ip; + } + return twsize; + return 0; + } + +WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn(length, twid); + size_t l1=1; + double *ptr=plan->mem; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido=length/(l1*ip); + if (knfct-1) // last factor doesn't need twiddles + { + plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + } + } + if (ip>5) // special factors required by *g functions + { + plan->fct[k].tws=ptr; ptr+=2*ip; + for (size_t i=0; ifct[k].tws[2*i ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +static rfftp_plan make_rfftp_plan (size_t length) + { + if (length==0) return NULL; + rfftp_plan plan = RALLOC(rfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + plan->mem=NULL; + for (size_t i=0; ifct[i]=(rfftp_fctdata){0,0,0}; + if (length==1) return plan; + if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=rfftp_twsize(plan); + plan->mem=RALLOC(double,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (rfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +static void destroy_rfftp_plan (rfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct fftblue_plan_i + { + size_t n, n2; + cfftp_plan plan; + double *mem; + double *bk, *bkf; + } fftblue_plan_i; +typedef struct fftblue_plan_i * fftblue_plan; + +static fftblue_plan make_fftblue_plan (size_t length) + { + fftblue_plan plan = RALLOC(fftblue_plan_i,1); + if (!plan) return NULL; + plan->n = length; + plan->n2 = good_size(plan->n*2-1); + plan->mem = RALLOC(double, 2*plan->n+2*plan->n2); + if (!plan->mem) { DEALLOC(plan); return NULL; } + plan->bk = plan->mem; + plan->bkf = plan->bk+2*plan->n; + +/* initialize b_k */ + double *tmp = RALLOC(double,4*plan->n); + if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + sincos_2pibyn(2*plan->n,tmp); + plan->bk[0] = 1; + plan->bk[1] = 0; + + size_t coeff=0; + for (size_t m=1; mn; ++m) + { + coeff+=2*m-1; + if (coeff>=2*plan->n) coeff-=2*plan->n; + plan->bk[2*m ] = tmp[2*coeff ]; + plan->bk[2*m+1] = tmp[2*coeff+1]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ + double xn2 = 1./plan->n2; + plan->bkf[0] = plan->bk[0]*xn2; + plan->bkf[1] = plan->bk[1]*xn2; + for (size_t m=2; m<2*plan->n; m+=2) + { + plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2; + plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2; + } + for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m) + plan->bkf[m]=0.; + plan->plan=make_cfftp_plan(plan->n2); + if (!plan->plan) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + if (cfftp_forward(plan->plan,plan->bkf,1.)!=0) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + DEALLOC(tmp); + + return plan; + } + +static void destroy_fftblue_plan (fftblue_plan plan) + { + DEALLOC(plan->mem); + destroy_cfftp_plan(plan->plan); + DEALLOC(plan); + } + +WARN_UNUSED_RESULT +static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct) + { + size_t n=plan->n; + size_t n2=plan->n2; + double *bk = plan->bk; + double *bkf = plan->bkf; + double *akf = RALLOC(double, 2*n2); + if (!akf) return -1; + +/* initialize a_k and FFT it */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1]; + akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1]; + akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m]; + } + for (size_t m=2*n; m<2*n2; ++m) + akf[m]=0; + + if (cfftp_forward (plan->plan,akf,fct)!=0) + { DEALLOC(akf); return -1; } + +/* do the convolution */ + if (isign>0) + for (size_t m=0; m<2*n2; m+=2) + { + double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + else + for (size_t m=0; m<2*n2; m+=2) + { + double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + +/* inverse FFT */ + if (cfftp_backward (plan->plan,akf,1.)!=0) + { DEALLOC(akf); return -1; } + +/* multiply by b_k */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1]; + c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1]; + c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + DEALLOC(akf); + return 0; + } + +WARN_UNUSED_RESULT +static int cfftblue_backward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,1,fct); } + +WARN_UNUSED_RESULT +static int cfftblue_forward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,-1,fct); } + +WARN_UNUSED_RESULT +static int rfftblue_backward(fftblue_plan plan, double c[], double fct) + { + size_t n=plan->n; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + tmp[0]=c[0]; + tmp[1]=0.; + memcpy (tmp+2,c+1, (n-1)*sizeof(double)); + if ((n&1)==0) tmp[n+1]=0.; + for (size_t m=2; mn; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + for (size_t m=0; mblueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void destroy_cfft_plan (cfft_plan plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_cfftp_plan(plan->packplan); + DEALLOC(plan); + } + +WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_backward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_forward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_forward(plan->blueplan,c,fct); + } + +typedef struct rfft_plan_i + { + rfftp_plan packplan; + fftblue_plan blueplan; + } rfft_plan_i; + +rfft_plan make_rfft_plan (size_t length) + { + if (length==0) return NULL; + rfft_plan plan = RALLOC(rfft_plan_i,1); + if (!plan) return NULL; + plan->blueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = 0.5*cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void destroy_rfft_plan (rfft_plan plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_rfftp_plan(plan->packplan); + DEALLOC(plan); + } + +size_t rfft_length(rfft_plan plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +size_t cfft_length(cfft_plan plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_backward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_forward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_forward(plan->blueplan,c,fct); + } diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h new file mode 100644 index 0000000..9eb3985 --- /dev/null +++ b/pocketfft/pocketfft.h @@ -0,0 +1,34 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/*! \file pocketfft.h + * Public interface of the pocketfft library + * + * Copyright (C) 2008-2018 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef POCKETFFT_H +#define POCKETFFT_H + +#include + +struct cfft_plan_i; +typedef struct cfft_plan_i * cfft_plan; +cfft_plan make_cfft_plan (size_t length); +void destroy_cfft_plan (cfft_plan plan); +int cfft_backward(cfft_plan plan, double c[], double fct); +int cfft_forward(cfft_plan plan, double c[], double fct); +size_t cfft_length(cfft_plan plan); + +struct rfft_plan_i; +typedef struct rfft_plan_i * rfft_plan; +rfft_plan make_rfft_plan (size_t length); +void destroy_rfft_plan (rfft_plan plan); +int rfft_backward(rfft_plan plan, double c[], double fct); +int rfft_forward(rfft_plan plan, double c[], double fct); +size_t rfft_length(rfft_plan plan); + +#endif