From 351180baf46d8ad051fd58150dd2c905672aebfb Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Wed, 22 Apr 2015 13:16:50 +0200 Subject: [PATCH 1/6] Moved over Legendre transforms from Commander project --- Makefile | 8 + libsharp/planck.make | 2 +- libsharp/sharp.h | 1 + libsharp/sharp_legendre.c | 1305 ++++++++++++++++++++++++++++++++++ libsharp/sharp_legendre.c.in | 163 +++++ libsharp/sharp_legendre.h | 62 ++ libsharp/sharp_vecsupport.h | 37 + libsharp/sharp_vecutil.h | 10 + runjinja.py | 16 + 9 files changed, 1603 insertions(+), 1 deletion(-) create mode 100644 libsharp/sharp_legendre.c create mode 100644 libsharp/sharp_legendre.c.in create mode 100644 libsharp/sharp_legendre.h create mode 100755 runjinja.py diff --git a/Makefile b/Makefile index 5f2d9c8..dd9931e 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,11 @@ perftest: compile_all $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1 + +# Jinja templates + +%.c: %.c.in + ./runjinja.py < $< > $@ + +genclean: + rm libsharp/sharp_legendre.c || exit 0 diff --git a/libsharp/planck.make b/libsharp/planck.make index 3397422..fcf51df 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD) HDR_$(PKG):=$(SD)/*.h LIB_$(PKG):=$(LIBDIR)/libsharp.a BIN:=sharp_testsuite -LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o +LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o ALLOBJ:=$(LIBOBJ) sharp_testsuite.o LIBOBJ:=$(LIBOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%) diff --git a/libsharp/sharp.h b/libsharp/sharp.h index 9c5dd57..da6fc4a 100644 --- a/libsharp/sharp.h +++ b/libsharp/sharp.h @@ -39,5 +39,6 @@ #include #include "sharp_lowlevel.h" +#include "sharp_legendre.h" #endif diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c new file mode 100644 index 0000000..75518d6 --- /dev/null +++ b/libsharp/sharp_legendre.c @@ -0,0 +1,1305 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + + + +static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(1) * VLEN], + double out[(1) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + +} + +static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(2) * VLEN], + double out[(2) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + +} + +static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(3) * VLEN], + double out[(3) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + 
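+      /* identical recurrence step, applied to the next vector-width block of x values */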
Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + +} + +static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(4) * VLEN], + double out[(4) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + +} + +static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(5) * VLEN], + double out[(5) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = 
P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, P_4, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + +} + +static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(6) * VLEN], + double out[(6) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv P_5, Pm1_5, Pm2_5, x5, y5; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + x5 = vloadu(xarr + 5 * VLEN); + Pm1_5 = vload(1.0); + P_5 = x5; + b = vbroadcast(bl); + y5 = vmul(Pm1_5, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + vfmaeq(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul(x5, Pm1_5); + W2 = W1; + W2 = vsub(W2, Pm2_5); + vfmaeq(W1, W2, R); + P_5 = W1; + vfmaeq(y5, P_5, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + + vstoreu(out + 5 * VLEN, y5); + +} + + + +static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(1) * VLEN_s], + float out[(1) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + + 
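+    /* y0 now holds bl[0]*P_0; the l = 1 term is added next, and the loop below handles l >= 2 */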
b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + +} + +static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(2) * VLEN_s], + float out[(2) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + +} + +static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(3) * VLEN_s], + float out[(3) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + +} + +static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(4) * VLEN_s], + float out[(4) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = 
vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + +} + +static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(5) * VLEN_s], + float out[(5) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 
* VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + +} + +static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(6) * VLEN_s], + float out[(6) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s P_5, Pm1_5, Pm2_5, x5, y5; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + x5 = vloadu_s(xarr + 5 * VLEN_s); + Pm1_5 = vload_s(1.0); + P_5 = x5; + b = vbroadcast_s(bl); + y5 = vmul_s(Pm1_5, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + vfmaeq_s(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul_s(x5, Pm1_5); + W2 = W1; + W2 = vsub_s(W2, Pm2_5); + vfmaeq_s(W1, W2, R); + P_5 = W1; + vfmaeq_s(y5, P_5, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + + vstoreu_s(out + 5 * VLEN_s, y5); + +} + + + + + +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (double)(l - 1) / (double)l; + } +} + +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (float)(l - 1) / (float)l; + } +} + + +/* + Compute sum_l b_l P_l(x_i) for all i. 
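+
+   The evaluation uses the standard three-term Legendre recurrence,
+
+       P_0(x) = 1,   P_1(x) = x,
+       P_l(x) = x P_{l-1}(x) + ((l-1)/l) (x P_{l-1}(x) - P_{l-2}(x)),
+
+   with the (l-1)/l factors supplied by recfacs[]; the sum y += b_l P_l(x)
+   is accumulated on the fly, so no P_l values are ever stored.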
+ */ + + + +#define CS 4 +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + + +void sharp_legendre_transform(double *bl, + double *recfac, + ptrdiff_t lmax, + double *x, double *out, ptrdiff_t nx) { + double xchunk[LEN], outchunk[LEN]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(double) * (lmax + 1)); + sharp_legendre_transform_recfac(recfac, lmax); + } + + for (j = 0; j != LEN; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN) { + len = (i + (LEN) <= nx) ? (LEN) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN - 1) / VLEN) { + case 6: legendre_transform_vec6(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + +void sharp_legendre_transform_s(float *bl, + float *recfac, + ptrdiff_t lmax, + float *x, float *out, ptrdiff_t nx) { + float xchunk[LEN_s], outchunk[LEN_s]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(float) * (lmax + 1)); + sharp_legendre_transform_recfac_s(recfac, lmax); + } + + for (j = 0; j != LEN_s; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN_s) { + len = (i + (LEN_s) <= nx) ? (LEN_s) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN_s - 1) / VLEN_s) { + case 6: legendre_transform_vec6_s(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5_s(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4_s(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3_s(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2_s(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1_s(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in new file mode 100644 index 0000000..9862127 --- /dev/null +++ b/libsharp/sharp_legendre.c.in @@ -0,0 +1,163 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +/*{ for cs in range(1, 7) }*/ +static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax, + {{scalar}} xarr[({{cs}}) * VLEN{{T}}], + {{scalar}} out[({{cs}}) * VLEN{{T}}]) { + /*{ for i in range(cs) }*/ + Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}}; + /*{ endfor }*/ + Tv{{T}} W1, W2, b, R; + ptrdiff_t l; + + /*{ for i in range(cs) }*/ + x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}}); + Pm1_{{i}} = vload{{T}}(1.0); + P_{{i}} = x{{i}}; + b = vbroadcast{{T}}(bl); + y{{i}} = vmul{{T}}(Pm1_{{i}}, b); + /*{ endfor }*/ + + b = vbroadcast{{T}}(bl + 1); + /*{ for i in range(cs) }*/ + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast{{T}}(bl + l); + R = vbroadcast{{T}}(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + /*{ for i in range(cs) }*/ + Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}}; + W1 = vmul{{T}}(x{{i}}, Pm1_{{i}}); + W2 = W1; + W2 = vsub{{T}}(W2, Pm2_{{i}}); + vfmaeq{{T}}(W1, W2, R); + P_{{i}} = W1; + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + } + /*{ for i in range(cs) }*/ + vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}}); + /*{ endfor }*/ +} +/*{ endfor }*/ +/*{ endfor }*/ + + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = ({{scalar}})(l - 1) / ({{scalar}})l; + } +} +/*{ endfor }*/ + +/* + Compute sum_l b_l P_l(x_i) for all i. + */ + +/*{set cs=4}*/ + +#define CS {{cs}} +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform{{T}}({{scalar}} *bl, + {{scalar}} *recfac, + ptrdiff_t lmax, + {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) { + {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1)); + sharp_legendre_transform_recfac{{T}}(recfac, lmax); + } + + for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN{{T}}) { + len = (i + (LEN{{T}}) <= nx) ? 
(LEN{{T}}) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) { + case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} +/*{ endfor }*/ + diff --git a/libsharp/sharp_legendre.h b/libsharp/sharp_legendre.h new file mode 100644 index 0000000..cfd8aee --- /dev/null +++ b/libsharp/sharp_legendre.h @@ -0,0 +1,62 @@ +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.h + * Interface for the Legendre transform parts of the spherical transform library. 
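+ *
+ * Minimal usage sketch (arrays allocated and filled by the caller; passing
+ * recfac == NULL makes the library compute the recurrence factors itself):
+ *
+ *     sharp_legendre_transform(bl, NULL, lmax, x, out, nx);
+ *
+ * which computes out[i] = sum_l bl[l] * P_l(x[i]) for each of the nx values.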
+ * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#ifndef SHARP_LEGENDRE_H +#define SHARP_LEGENDRE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NO_LEGENDRE + +void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x, + double *out, ptrdiff_t nx); +void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x, + float *out, ptrdiff_t nx); +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax); +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index b26a539..28073ac 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -40,23 +40,31 @@ typedef double Ts; #if (VLEN==1) typedef double Tv; +typedef float Tv_s; typedef int Tm; #define vadd(a,b) ((a)+(b)) +#define vadd_s(a,b) ((a)+(b)) #define vaddeq(a,b) ((a)+=(b)) #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b); #define vsub(a,b) ((a)-(b)) +#define vsub_s(a,b) ((a)-(b)) #define vsubeq(a,b) ((a)-=(b)) #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b); #define vmul(a,b) ((a)*(b)) +#define vmul_s(a,b) ((a)*(b)) #define vmuleq(a,b) ((a)*=(b)) #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b); #define vfmaeq(a,b,c) ((a)+=(b)*(c)) +#define vfmaeq_s(a,b,c) ((a)+=(b)*(c)) #define vfmseq(a,b,c) ((a)-=(b)*(c)) #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e)) #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e)) #define vneg(a) (-(a)) #define vload(a) (a) +#define vload_s(a) (a) +#define vloadu(p) (*p) +#define vloadu_s(p) (*p) #define vabs(a) fabs(a) #define vsqrt(a) sqrt(a) #define vlt(a,b) ((a)<(b)) @@ -64,6 +72,10 @@ typedef int Tm; #define vge(a,b) ((a)>=(b)) #define vne(a,b) ((a)!=(b)) #define vand_mask(a,b) ((a)&&(b)) +#define vbroadcast(p) (*p) +#define vbroadcast_s(p) (*p) +#define vstoreu(p, a) (*(p)=a) +#define vstoreu_s(p, a) (*(p)=a) static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } @@ -87,6 +99,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } #endif typedef __m128d Tv; +typedef __m128 Tv_s; typedef __m128d Tm; #if defined(__SSE4_1__) @@ -99,15 +112,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vone _mm_set1_pd(1.) 
#define vadd(a,b) _mm_add_pd(a,b) +#define vadd_s(a,b) _mm_add_ps(a,b) #define vaddeq(a,b) a=_mm_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm_sub_pd(a,b) +#define vsub_s(a,b) _mm_sub_ps(a,b) #define vsubeq(a,b) a=_mm_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm_mul_pd(a,b) +#define vmul_s(a,b) _mm_mul_ps(a,b) #define vmuleq(a,b) a=_mm_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone)) #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c)) +#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c)) #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c)) #define vfmaaeq(a,b,c,d,e) \ a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) @@ -115,6 +132,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) #define vload(a) _mm_set1_pd(a) +#define vload_s(a) _mm_set1_ps(a) #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) #define vsqrt(a) _mm_sqrt_pd(a) #define vlt(a,b) _mm_cmplt_pd(a,b) @@ -126,6 +144,12 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vallTrue(a) (_mm_movemask_pd(a)==3) +#define vloadu(p) _mm_loadu_pd(p) +#define vloadu_s(p) _mm_loadu_ps(p) +#define vstoreu(p, v) _mm_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm_storeu_ps(p, v) +#define vbroadcast(p) _mm_set_pd(*p, *p) +#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p) #endif @@ -137,6 +161,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #endif typedef __m256d Tv; +typedef __m256 Tv_s; typedef __m256d Tm; #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) @@ -144,16 +169,20 @@ typedef __m256d Tm; #define vone _mm256_set1_pd(1.) 
#define vadd(a,b) _mm256_add_pd(a,b) +#define vadd_s(a,b) _mm256_add_ps(a,b) #define vaddeq(a,b) a=_mm256_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm256_sub_pd(a,b) +#define vsub_s(a,b) _mm256_sub_ps(a,b) #define vsubeq(a,b) a=_mm256_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm256_mul_pd(a,b) +#define vmul_s(a,b) _mm256_mul_ps(a,b) #define vmuleq(a,b) a=_mm256_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone)) #ifdef __FMA4__ #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) +#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) @@ -167,6 +196,7 @@ typedef __m256d Tm; #endif #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vload(a) _mm256_set1_pd(a) +#define vload_s(a) _mm256_set1_ps(a) #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) #define vsqrt(a) _mm256_sqrt_pd(a) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) @@ -179,6 +209,13 @@ typedef __m256d Tm; #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vallTrue(a) (_mm256_movemask_pd(a)==15) +#define vloadu(p) _mm256_loadu_pd(p) +#define vloadu_s(p) _mm256_loadu_ps(p) +#define vstoreu(p, v) _mm256_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm256_storeu_ps(p, v) +#define vbroadcast(p) _mm256_broadcast_sd(p) +#define vbroadcast_s(p) _mm256_broadcast_ss(p) + #endif #if (VLEN==8) diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index 16bfa13..0aee4ae 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -32,6 +32,8 @@ #ifndef SHARP_VECUTIL_H #define SHARP_VECUTIL_H +#ifndef VLEN + #if (defined (__MIC__)) #define VLEN 8 #elif (defined (__AVX__)) @@ -43,3 +45,11 @@ #endif #endif + +#if (VLEN==1) +#define VLEN_s 1 +#else +#define VLEN_s (2*VLEN) +#endif + +#endif diff --git a/runjinja.py b/runjinja.py new file mode 100755 index 0000000..5269c1a --- /dev/null +++ b/runjinja.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +""" +Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT. 
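+
+Usage, as invoked by the Makefile rule for %.c: %.c.in targets:
+
+    ./runjinja.py < libsharp/sharp_legendre.c.in > libsharp/sharp_legendre.c
+
+Requires the jinja2 package.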
+""" + +import sys +from jinja2 import Template, Environment + +env = Environment(block_start_string='/*{', + block_end_string='}*/', + variable_start_string='{{', + variable_end_string='}}') + +extra_vars = dict(len=len) +sys.stdout.write(env.from_string(sys.stdin.read()).render(**extra_vars)) From 0a0cad34aa9edd1f985f7e39d951e54593660a5a Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Wed, 22 Apr 2015 13:18:43 +0200 Subject: [PATCH 2/6] Python wrapper with test for Legendre transform --- .gitignore | 5 ++ Makefile | 7 ++ python/fake_pyrex/Pyrex/Distutils/__init__.py | 1 + .../fake_pyrex/Pyrex/Distutils/build_ext.py | 1 + python/fake_pyrex/Pyrex/__init__.py | 1 + python/fake_pyrex/README | 2 + python/libsharp/__init__.py | 1 + python/libsharp/libsharp.pyx | 41 ++++++++++ python/libsharp/tests/__init__.py | 1 + python/libsharp/tests/test_legendre.py | 29 +++++++ python/setup.py | 76 +++++++++++++++++++ 11 files changed, 165 insertions(+) create mode 100644 python/fake_pyrex/Pyrex/Distutils/__init__.py create mode 100644 python/fake_pyrex/Pyrex/Distutils/build_ext.py create mode 100644 python/fake_pyrex/Pyrex/__init__.py create mode 100644 python/fake_pyrex/README create mode 100644 python/libsharp/__init__.py create mode 100644 python/libsharp/libsharp.pyx create mode 100644 python/libsharp/tests/__init__.py create mode 100644 python/libsharp/tests/test_legendre.py create mode 100644 python/setup.py diff --git a/.gitignore b/.gitignore index cd0bf21..b555caa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.o +*.so #* *~ +*.pyc +*.pyo /auto /autom4te.cache @@ -9,3 +12,5 @@ /config/config.auto /configure /sharp_oracle.inc + +/python/libsharp/libsharp.c diff --git a/Makefile b/Makefile index dd9931e..f44ecfd 100644 --- a/Makefile +++ b/Makefile @@ -61,3 +61,10 @@ perftest: compile_all genclean: rm libsharp/sharp_legendre.c || exit 0 + +pytest: + rm python/libsharp/libsharp.so || exit 0 + cd python && LIBSHARP_INCLUDE=$(INCDIR) LIBSHARP_LIB=$(LIBDIR) python setup.py build_ext --inplace + cd python && nosetests libsharp + + diff --git a/python/fake_pyrex/Pyrex/Distutils/__init__.py b/python/fake_pyrex/Pyrex/Distutils/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/Pyrex/Distutils/build_ext.py b/python/fake_pyrex/Pyrex/Distutils/build_ext.py new file mode 100644 index 0000000..4f846f6 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/build_ext.py @@ -0,0 +1 @@ +build_ext = "yes, it's there!" diff --git a/python/fake_pyrex/Pyrex/__init__.py b/python/fake_pyrex/Pyrex/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/README b/python/fake_pyrex/README new file mode 100644 index 0000000..cf3f3ff --- /dev/null +++ b/python/fake_pyrex/README @@ -0,0 +1,2 @@ +This directory is here to fool setuptools into building .pyx files +even if Pyrex is not installed. See ../setup.py. 
\ No newline at end of file
diff --git a/python/libsharp/__init__.py b/python/libsharp/__init__.py
new file mode 100644
index 0000000..dd0fa41
--- /dev/null
+++ b/python/libsharp/__init__.py
@@ -0,0 +1 @@
+from .libsharp import *
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
new file mode 100644
index 0000000..0abfe69
--- /dev/null
+++ b/python/libsharp/libsharp.pyx
@@ -0,0 +1,41 @@
+import numpy as np
+
+cdef extern from "sharp.h":
+    ctypedef long ptrdiff_t
+
+    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                    float *out, ptrdiff_t nx)
+    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                                  double *out, ptrdiff_t nx)
+    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
+    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
+
+
+def legendre_transform(x, bl, out=None):
+    if out is None:
+        out = np.empty_like(x)
+    if x.dtype == np.float64:
+        if bl.dtype != np.float64:
+            bl = bl.astype(np.float64)
+        return _legendre_transform(x, bl, out=out)
+    elif x.dtype == np.float32:
+        if bl.dtype != np.float32:
+            bl = bl.astype(np.float32)
+        return _legendre_transform_s(x, bl, out=out)
+    else:
+        raise ValueError("unsupported dtype")
+
+
+def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
+
+def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
+    if out.shape[0] != x.shape[0]:
+        raise ValueError('x and out must have same shape')
+    sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
+    return np.asarray(out)
+
diff --git a/python/libsharp/tests/__init__.py b/python/libsharp/tests/__init__.py
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/python/libsharp/tests/__init__.py
@@ -0,0 +1 @@
+# empty
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
new file mode 100644
index 0000000..2e96822
--- /dev/null
+++ b/python/libsharp/tests/test_legendre.py
@@ -0,0 +1,29 @@
+import numpy as np
+from scipy.special import legendre
+import libsharp
+
+
+def test_legendre_transform():
+    lmax = 20
+    ntheta = 19
+
+    l = np.arange(lmax + 1)
+    bl = np.exp(-l*(l+1))
+    bl *= (2 * l + 1)
+
+    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
+    x = np.cos(theta)
+
+    # Compute truth using scipy.special.legendre
+    P = np.zeros((ntheta, lmax + 1))
+    for l in range(lmax + 1):
+        P[:, l] = legendre(l)(x)
+    y0 = np.dot(P, bl)
+
+    # double-precision
+    y = libsharp.legendre_transform(x, bl)
+    assert np.max(np.abs(y - y0) / np.abs(y)) < 1e-12
+
+    # single-precision
+    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
+    assert np.max(np.abs(y32 - y) / np.abs(y32)) < 1e-4
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..1500f05
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,76 @@
+#!
/usr/bin/env python
+
+descr = """Spherical Harmonic transforms package
+
+Python API for the libsharp spherical harmonic transforms library
+"""
+
+import os
+import sys
+
+DISTNAME = 'libsharp'
+DESCRIPTION = 'libsharp library for fast Spherical Harmonic Transforms'
+LONG_DESCRIPTION = descr
+MAINTAINER = 'Dag Sverre Seljebotn'
+MAINTAINER_EMAIL = 'd.s.seljebotn@astro.uio.no'
+URL = 'http://sourceforge.net/projects/libsharp/'
+LICENSE = 'GPL'
+DOWNLOAD_URL = "http://sourceforge.net/projects/libsharp/"
+VERSION = '0.1'
+
+# Add our fake Pyrex at the end of the Python search path
+# in order to fool setuptools into allowing compilation of
+# pyx files to C files. Importing Cython.Distutils then
+# makes Cython the tool of choice for this rather than
+# (the possibly nonexisting) Pyrex.
+project_path = os.path.split(__file__)[0]
+sys.path.append(os.path.join(project_path, 'fake_pyrex'))
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+import numpy as np
+
+libsharp = os.environ.get('LIBSHARP', None)
+libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
+libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
+
+if libsharp_include is None or libsharp_lib is None:
+    sys.stderr.write('Please set LIBSHARP environment variable to the install directory of libsharp; '
+                     'this script will refer to the lib and include sub-directories. Alternatively '
+                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
+    sys.exit(1)
+
+if __name__ == "__main__":
+    setup(install_requires = ['numpy'],
+          packages = find_packages(),
+          test_suite="nose.collector",
+          # Well, technically zipping the package will work, but since it's
+          # all compiled code it'll just get unzipped again at runtime, which
+          # is pointless:
+          zip_safe = False,
+          name = DISTNAME,
+          version = VERSION,
+          maintainer = MAINTAINER,
+          maintainer_email = MAINTAINER_EMAIL,
+          description = DESCRIPTION,
+          license = LICENSE,
+          url = URL,
+          download_url = DOWNLOAD_URL,
+          long_description = LONG_DESCRIPTION,
+          classifiers =
+            [ 'Development Status :: 3 - Alpha',
+              'Environment :: Console',
+              'Intended Audience :: Developers',
+              'Intended Audience :: Science/Research',
+              'License :: OSI Approved :: GNU General Public License (GPL)',
+              'Topic :: Scientific/Engineering'],
+          cmdclass = {"build_ext": build_ext},
+          ext_modules = [
+            Extension("libsharp.libsharp",
+                      ["libsharp/libsharp.pyx"],
+                      libraries=["sharp", "fftpack", "c_utils"],
+                      include_dirs=[libsharp_include],
+                      library_dirs=[libsharp_lib]
+                      ),
+          ],
+         )
From ea8671c2eccbae23d0e92cc20a1571b0b323286e Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Wed, 22 Apr 2015 14:23:39 +0200
Subject: [PATCH 3/6] Prevent invoking Jinja templating unnecessarily

---
 Makefile                     | 6 +++---
 libsharp/sharp_legendre.c    | 4 +++-
 libsharp/sharp_legendre.c.in | 1 -
 runjinja.py                  | 5 ++++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index f44ecfd..a621948 100644
--- a/Makefile
+++ b/Makefile
@@ -54,10 +54,10 @@ perftest: compile_all
 	$(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
 	$(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
 
-# Jinja templates
-
 %.c: %.c.in
-	./runjinja.py < $< > $@
+# Only do this if the md5sum changed, in order to avoid Python and Jinja
+# dependency when not modifying the c.in file
+	grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@
 
 genclean:
 	rm libsharp/sharp_legendre.c || exit 0
diff --git 
a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c index 75518d6..8b4fe4b 100644 --- a/libsharp/sharp_legendre.c +++ b/libsharp/sharp_legendre.c @@ -1,4 +1,4 @@ -/* +/* DO NOT EDIT. md5sum of source: 88eec944ab6fbfdc7f391fbb58df3bf1 *//* NOTE NOTE NOTE @@ -1303,3 +1303,5 @@ void sharp_legendre_transform_s(float *bl, } } + + diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in index 9862127..e30f787 100644 --- a/libsharp/sharp_legendre.c.in +++ b/libsharp/sharp_legendre.c.in @@ -160,4 +160,3 @@ void sharp_legendre_transform{{T}}({{scalar}} *bl, } } /*{ endfor }*/ - diff --git a/runjinja.py b/runjinja.py index 5269c1a..fd659fa 100755 --- a/runjinja.py +++ b/runjinja.py @@ -5,6 +5,7 @@ Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT. """ import sys +import hashlib from jinja2 import Template, Environment env = Environment(block_start_string='/*{', @@ -13,4 +14,6 @@ env = Environment(block_start_string='/*{', variable_end_string='}}') extra_vars = dict(len=len) -sys.stdout.write(env.from_string(sys.stdin.read()).render(**extra_vars)) +input = sys.stdin.read() +sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input).hexdigest()) +sys.stdout.write(env.from_string(input).render(**extra_vars)) From f2fe4f9ca2177803008d70e58553c91c183c4930 Mon Sep 17 00:00:00 2001 From: Dag Sverre Seljebotn Date: Thu, 23 Apr 2015 09:57:45 +0200 Subject: [PATCH 4/6] legendre transforms: Polishing and bugfixes vbroadcast didn't work properly for some reason, vload does.. --- libsharp/sharp_legendre.c | 274 +++++++++++++------------ libsharp/sharp_legendre.c.in | 38 ++-- libsharp/sharp_vecsupport.h | 15 +- libsharp/sharp_vecutil.h | 8 + python/libsharp/libsharp.pyx | 4 +- python/libsharp/tests/test_legendre.py | 26 ++- 6 files changed, 204 insertions(+), 161 deletions(-) diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c index 8b4fe4b..cd54101 100644 --- a/libsharp/sharp_legendre.c +++ b/libsharp/sharp_legendre.c @@ -1,4 +1,4 @@ -/* DO NOT EDIT. md5sum of source: 88eec944ab6fbfdc7f391fbb58df3bf1 *//* +/* DO NOT EDIT. 
   md5sum of source: d9499375a254cbf1e7903a249a8676ff
 *//*

   NOTE NOTE NOTE
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */

+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"

@@ -66,18 +81,18 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);


@@ -113,17 +128,17 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -131,8 +146,8 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -142,16 +157,16 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);


@@ -180,23 +195,23 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -206,8 +221,8 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -217,24 +232,24 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);


@@ -267,29 +282,29 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -301,8 +316,8 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -312,32 +327,32 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);


@@ -374,35 +389,35 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);

     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0);
     P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -416,8 +431,8 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -427,40 +442,40 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
         vfmaeq(y4, P_4, b);


@@ -501,41 +516,41 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0);
     P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);

     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0);
     P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);

     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0);
     P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);

     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0);
     P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);

     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0);
     P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);

     x5 = vloadu(xarr + 5 * VLEN);
     Pm1_5 = vload(1.0);
     P_5 = x5;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y5 = vmul(Pm1_5, b);


-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));

     vfmaeq(y0, P_0, b);

@@ -551,8 +566,8 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -562,48 +577,48 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
         vfmaeq(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
         vfmaeq(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
         vfmaeq(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
         vfmaeq(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
         vfmaeq(y4, P_4, b);

         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul(x5, Pm1_5);
         W2 = W1;
         W2 = vsub(W2, Pm2_5);
-        vfmaeq(W1, W2, R);
         P_5 = W1;
+        vfmaeq(P_5, W2, R);
         vfmaeq(y5, P_5, b);


@@ -638,18 +653,18 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -659,8 +674,8 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);


@@ -685,17 +700,17 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -703,8 +718,8 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -714,16 +729,16 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);


@@ -752,23 +767,23 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -778,8 +793,8 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -789,24 +804,24 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);


@@ -839,29 +854,29 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -873,8 +888,8 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -884,32 +899,32 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);


@@ -946,35 +961,35 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);

     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0);
     P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -988,8 +1003,8 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -999,40 +1014,40 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
         vfmaeq_s(y4, P_4, b);


@@ -1073,41 +1088,41 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0);
     P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);

     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0);
     P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);

     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0);
     P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);

     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0);
     P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);

     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0);
     P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);

     x5 = vloadu_s(xarr + 5 * VLEN_s);
     Pm1_5 = vload_s(1.0);
     P_5 = x5;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y5 = vmul_s(Pm1_5, b);


-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));

     vfmaeq_s(y0, P_0, b);

@@ -1123,8 +1138,8 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,


     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -1134,48 +1149,48 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
         vfmaeq_s(y0, P_0, b);

         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
         vfmaeq_s(y1, P_1, b);

         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
         vfmaeq_s(y2, P_2, b);

         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
         vfmaeq_s(y3, P_3, b);

         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
         vfmaeq_s(y4, P_4, b);

         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul_s(x5, Pm1_5);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_5);
-        vfmaeq_s(W1, W2, R);
         P_5 = W1;
+        vfmaeq_s(P_5, W2, R);
         vfmaeq_s(y5, P_5, b);


@@ -1224,24 +1239,21 @@ void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
 */
-
-
-#define CS 4
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)

 void sharp_legendre_transform(double *bl,
                               double *recfac,
                               ptrdiff_t lmax,
                               double *x,
                               double *out,
                               ptrdiff_t nx) {
-    double xchunk[LEN], outchunk[LEN];
+    double xchunk[MAX_CS * VLEN], outchunk[MAX_CS * VLEN];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(double) * (lmax + 1));
+        recfac = malloc(sizeof(double) * (lmax + 1));
         sharp_legendre_transform_recfac(recfac, lmax);
     }

@@ -1271,13 +1283,13 @@ void sharp_legendre_transform_s(float *bl,
                                 float *recfac,
                                 ptrdiff_t lmax,
                                 float *x,
                                 float *out,
                                 ptrdiff_t nx) {
-    float xchunk[LEN_s], outchunk[LEN_s];
+    float xchunk[MAX_CS * VLEN_s], outchunk[MAX_CS * VLEN_s];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(float) * (lmax + 1));
+        recfac = malloc(sizeof(float) * (lmax + 1));
         sharp_legendre_transform_recfac_s(recfac, lmax);
     }

@@ -1304,4 +1316,4 @@ void sharp_legendre_transform_s(float *bl,
 }


-
+#endif
\ No newline at end of file
diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in
index e30f787..63c84b7 100644
--- a/libsharp/sharp_legendre.c.in
+++ b/libsharp/sharp_legendre.c.in
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */

+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"

@@ -66,18 +81,18 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
     x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
     Pm1_{{i}} = vload{{T}}(1.0);
     P_{{i}} = x{{i}};
-    b = vbroadcast{{T}}(bl);
+    b = vload{{T}}(*bl);
     y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
     /*{ endfor }*/

-    b = vbroadcast{{T}}(bl + 1);
+    b = vload{{T}}(*(bl + 1));
     /*{ for i in range(cs) }*/
     vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
     /*{ endfor }*/

     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast{{T}}(bl + l);
-        R = vbroadcast{{T}}(recfacs + l);
+        b = vload{{T}}(*(bl + l));
+        R = vload{{T}}(*(recfacs + l));

         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
         W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
         W2 = W1;
         W2 = vsub{{T}}(W2, Pm2_{{i}});
-        vfmaeq{{T}}(W1, W2, R);
         P_{{i}} = W1;
+        vfmaeq{{T}}(P_{{i}}, W2, R);
         vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
         /*{ endfor }*/

@@ -117,24 +132,21 @@ void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
 */
-/*{set cs=4}*/
-
-#define CS {{cs}}
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)

 /*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
 void sharp_legendre_transform{{T}}({{scalar}} *bl,
                                    {{scalar}} *recfac,
                                    ptrdiff_t lmax,
                                    {{scalar}} *x,
                                    {{scalar}} *out,
                                    ptrdiff_t nx) {
-    {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}];
+    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * VLEN{{T}}];
     int compute_recfac;
     ptrdiff_t i, j, len;

     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1));
+        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
         sharp_legendre_transform_recfac{{T}}(recfac, lmax);
     }

@@ -160,3 +172,5 @@ void sharp_legendre_transform{{T}}({{scalar}} *bl,
     }
 }
 /*{ endfor }*/
+
+#endif
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 28073ac..ee4c5e7 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -63,8 +63,8 @@ typedef int Tm;
 #define vneg(a) (-(a))
 #define vload(a) (a)
 #define vload_s(a) (a)
-#define vloadu(p) (*p)
-#define vloadu_s(p) (*p)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -72,8 +72,6 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
-#define vbroadcast(p) (*p)
-#define vbroadcast_s(p) (*p)
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)

@@ -148,15 +146,13 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vloadu_s(p) _mm_loadu_ps(p)
 #define vstoreu(p, v) _mm_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm_storeu_ps(p, v)
-#define vbroadcast(p) _mm_set_pd(*p, *p)
-#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p)

 #endif

 #if (VLEN==4)

 #include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
 #include <x86intrin.h>
 #endif

@@ -180,7 +176,7 @@ typedef __m256d Tm;
 #define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#ifdef __FMA4__
+#if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
@@ -188,6 +184,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@@ -213,8 +210,6 @@ typedef __m256d Tm;
 #define vloadu_s(p) _mm256_loadu_ps(p)
 #define vstoreu(p, v) _mm256_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
-#define vbroadcast(p) _mm256_broadcast_sd(p)
-#define vbroadcast_s(p) _mm256_broadcast_ss(p)

 #endif

diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 0aee4ae..f6161ca 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -52,4 +52,12 @@
 #define VLEN_s (2*VLEN)
 #endif

+#ifndef USE_FMA4
+#ifdef __FMA4__
+#define USE_FMA4 1
+#else
+#define USE_FMA4 0
+#endif
+#endif
+
 #endif
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index 0abfe69..e622bb3 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -14,7 +14,9 @@ cdef extern from "sharp.h":
 def legendre_transform(x, bl, out=None):
     if out is None:
         out = np.empty_like(x)
-    if x.dtype == np.float64:
+    if out.shape[0] == 0:
+        return out
+    elif x.dtype == np.float64:
         if bl.dtype != np.float64:
             bl = bl.astype(np.float64)
         return _legendre_transform(x, bl, out=out)
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 2e96822..66b4c18 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -2,14 +2,17 @@ import numpy as np
 from scipy.special import legendre

 import libsharp
+from numpy.testing import assert_allclose


-def test_legendre_transform():
-    lmax = 20
-    ntheta = 19
+def check_legendre_transform(lmax, ntheta):
     l = np.arange(lmax + 1)
-    bl = np.exp(-l*(l+1))
-    bl *= (2 * l + 1)
+    if lmax >= 1:
+        sigma = -np.log(1e-3) / lmax / (lmax + 1)
+        bl = np.exp(-sigma*l*(l+1))
+        bl *= (2 * l + 1)
+    else:
+        bl = np.asarray([1], dtype=np.double)

     theta = np.linspace(0, np.pi, ntheta, endpoint=True)
     x = np.cos(theta)
@@ -20,10 +23,19 @@ def test_legendre_transform():
         P[:, l] = legendre(l)(x)

     y0 = np.dot(P, bl)
+
+    # double-precision
     y = libsharp.legendre_transform(x, bl)
-    assert np.max(np.abs(y - y) / np.abs(y)) < 1e-12
+
+    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)

     # single-precision
     y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
-    assert np.max(np.abs(y32 - y) / np.abs(y32)) < 1e-4
+    assert_allclose(y32, y0, rtol=1e-5, atol=1e-5)
+
+
+def test_legendre_transform():
+    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
+    for ntheta in nthetas_to_try:
+        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
+            yield check_legendre_transform, lmax, ntheta

From 765831ea2b4cea62a35ee47606b64dae2d6eb73d Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Thu, 23 Apr 2015 18:54:39 +0200
Subject: [PATCH 5/6] legendre transforms: Fortran wrapper

---
 fortran/sharp.f90      | 39 +++++++++++++++++++++++++++++++++++++++
 fortran/test_sharp.f90 | 27 +++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/fortran/sharp.f90 b/fortran/sharp.f90
index 7dc815d..fb80c3c 100644
--- a/fortran/sharp.f90
+++ b/fortran/sharp.f90
@@ -103,12 +103,32 @@ module sharp
        type(c_ptr), intent(in) :: alm(*), map(*)
      end subroutine c_sharp_execute_mpi

+     ! Legendre transforms
+     subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_double)              :: bl(lmax + 1), x(nx), out(nx)
+       real(c_double), optional    :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform
+
+     subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
+          bind(c, name='sharp_legendre_transform_s')
+       use iso_c_binding
+       integer(c_ptrdiff_t), value :: lmax, nx
+       real(c_float)               :: bl(lmax + 1), x(nx), out(nx)
+       real(c_float), optional     :: recfac(lmax + 1)
+     end subroutine c_sharp_legendre_transform_s
  end interface

  interface sharp_execute
     module procedure sharp_execute_d
  end interface

+ interface sharp_legendre_transform
+    module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
+ end interface sharp_legendre_transform
+
contains

  ! alm info
@@ -240,6 +260,25 @@ contains
     end if
   end subroutine sharp_execute_d

+  subroutine sharp_legendre_transform_d(bl, x, out)
+    use iso_c_binding
+    real(c_double) :: bl(:)
+    real(c_double) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+         x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_d
+
+  subroutine sharp_legendre_transform_s(bl, x, out)
+    use iso_c_binding
+    real(c_float) :: bl(:)
+    real(c_float) :: x(:), out(size(x))
+    !--
+    integer(c_ptrdiff_t) :: lmax, nx
+    call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), &
+         x=x, out=out, nx=int(size(x), c_ptrdiff_t))
+  end subroutine sharp_legendre_transform_s

 end module
diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90
index f42cf0c..0b7cce2 100644
--- a/fortran/test_sharp.f90
+++ b/fortran/test_sharp.f90
@@ -54,4 +54,31 @@ program test_sharp
   print *, 'DONE'

   call MPI_Finalize(ierr)
+
+  print *, 'LEGENDRE TRANSFORMS'
+
+  call test_legendre_transforms()
+
+contains
+  subroutine test_legendre_transforms()
+    integer, parameter :: lmax = 20, nx=10
+    real(c_double) :: bl(0:lmax)
+    real(c_double) :: x(nx), out(nx)
+    real(c_float)  :: out_s(nx)
+    !--
+    integer :: l, i
+
+    do l = 0, lmax
+       bl(l) = 1.0 / real(l + 1, c_double)
+    end do
+    do i = 1, nx
+       x(i) = 1 / real(i, c_double)
+    end do
+    out = 0
+    call sharp_legendre_transform(bl, x, out)
+    print *, out
+    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
+    print *, out_s
+  end subroutine test_legendre_transforms
+
+
 end program test_sharp

From 7ecd1ddc93281d9e58fa28c06bf6d65102ee03eb Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Fri, 24 Apr 2015 09:06:20 +0200
Subject: [PATCH 6/6] Expose gauss_legendre_tbl publicly as sharp_legendre_roots

---
 libsharp/planck.make                   |  2 +-
 libsharp/sharp.h                       |  1 +
 libsharp/sharp_geomhelpers.c           | 66 +-------------------------
 libsharp/sharp_legendre_roots.c        | 67 ++++++++++++++++++++++++++
 libsharp/sharp_legendre_roots.h        | 50 +++++++++++++++++++
 python/libsharp/libsharp.pyx           | 13 +++++
 python/libsharp/tests/test_legendre.py | 18 +++++++
 7 files changed, 152 insertions(+), 65 deletions(-)
 create mode 100644 libsharp/sharp_legendre_roots.c
 create mode 100644 libsharp/sharp_legendre_roots.h

diff --git a/libsharp/planck.make b/libsharp/planck.make
index fcf51df..8bbd7e1 100644
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD)
 HDR_$(PKG):=$(SD)/*.h
 LIB_$(PKG):=$(LIBDIR)/libsharp.a
 BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o
+LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
 ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
 LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
 ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index da6fc4a..4497dd4 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -40,5 +40,6 @@

 #include "sharp_lowlevel.h"
 #include "sharp_legendre.h"
+#include "sharp_legendre_roots.h"

 #endif
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index cd088b3..dbb44e0 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -32,6 +32,7 @@
 #include <math.h>
 #include "sharp_geomhelpers.h"
+#include "sharp_legendre_roots.h"
 #include "c_utils.h"
 #include "ls_fft.h"
 #include
@@ -106,69 +107,6 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
   sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info);
   }

-static inline double one_minus_x2 (double x)
-  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
-
-/* Function adapted from GNU GSL file glfixed.c
-   Original author: Pavel Holoborodko (http://www.holoborodko.com)
-
-   Adjustments by M. Reinecke
-    - adjusted interface (keep epsilon internal, return full number of points)
-    - removed precomputed tables
-    - tweaked Newton iteration to obtain higher accuracy */
-static void gauss_legendre_tbl(int n, double *x, double *w)
-  {
-  const double pi = 3.141592653589793238462643383279502884197;
-  const double eps = 3e-14;
-  int m = (n+1)>>1;
-
-  double t0 = 1 - (1-1./n) / (8.*n*n);
-  double t1 = 1./(4.*n+2.);
-
-#pragma omp parallel
-{
-  int i;
-#pragma omp for schedule(dynamic,100)
-  for (i=1; i<=m; ++i)
-    {
-    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
-
-    int dobreak=0;
-    int j=0;
-    double dpdx;
-    while(1)
-      {
-      double P_1 = 1.0;
-      double P0 = x0;
-      double dx, x1;
-
-      for (int k=2; k<=n; k++)
-        {
-        double P_2 = P_1;
-        P_1 = P0;
-//      P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
-        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
-        }
-
-      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
-
-      /* Newton step */
-      x1 = x0 - P0/dpdx;
-      dx = x0-x1;
-      x0 = x1;
-      if (dobreak) break;
-
-      if (fabs(dx)<=eps) dobreak=1;
-      UTIL_ASSERT(++j<100,"convergence problem");
-      }
-
-    x[i-1] = -x0;
-    x[n-i] = x0;
-    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
-    }
-} // end of parallel region
-  }
-
 void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
   int stride_lon, int stride_lat, sharp_geom_info **geom_info)
   {
@@ -181,7 +119,7 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
   ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
   int *stride_=RALLOC(int,nrings);

-  gauss_legendre_tbl(nrings,theta,weight);
+  sharp_legendre_roots(nrings,theta,weight);

   for (int m=0; m<nrings; ++m)
diff --git a/libsharp/sharp_legendre_roots.c b/libsharp/sharp_legendre_roots.c
new file mode 100644
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.c
@@ -0,0 +1,67 @@
+/* Function adapted from GNU GSL file glfixed.c
+   Original author: Pavel Holoborodko (http://www.holoborodko.com)
+
+   Adjustments by M. Reinecke
+    - adjusted interface (keep epsilon internal, return full number of points)
+    - removed precomputed tables
+    - tweaked Newton iteration to obtain higher accuracy */
+
+#include <math.h>
+#include "sharp_legendre_roots.h"
+#include "c_utils.h"
+
+static inline double one_minus_x2 (double x)
+  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
+
+void sharp_legendre_roots(int n, double *x, double *w)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double eps = 3e-14;
+  int m = (n+1)>>1;
+
+  double t0 = 1 - (1-1./n) / (8.*n*n);
+  double t1 = 1./(4.*n+2.);
+
+#pragma omp parallel
+{
+  int i;
+#pragma omp for schedule(dynamic,100)
+  for (i=1; i<=m; ++i)
+    {
+    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
+
+    int dobreak=0;
+    int j=0;
+    double dpdx;
+    while(1)
+      {
+      double P_1 = 1.0;
+      double P0 = x0;
+      double dx, x1;
+
+      for (int k=2; k<=n; k++)
+        {
+        double P_2 = P_1;
+        P_1 = P0;
+//      P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
+        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
+        }
+
+      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
+
+      /* Newton step */
+      x1 = x0 - P0/dpdx;
+      dx = x0-x1;
+      x0 = x1;
+      if (dobreak) break;
+
+      if (fabs(dx)<=eps) dobreak=1;
+      UTIL_ASSERT(++j<100,"convergence problem");
+      }
+
+    x[i-1] = -x0;
+    x[n-i] = x0;
+    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
+    }
+} // end of parallel region
+  }
diff --git a/libsharp/sharp_legendre_roots.h b/libsharp/sharp_legendre_roots.h
new file mode 100644
index 0000000..2a056b2
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.h
@@ -0,0 +1,50 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_legendre_roots.h
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef SHARP_LEGENDRE_ROOTS_H
+#define SHARP_LEGENDRE_ROOTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
+    of degree \a n.
+    \param n Degree of Legendre polynomial
+    \param x Array of length \a n for output (root position)
+    \param w Array of length \a n for output (weight for Gaussian quadrature)
+ */
+void sharp_legendre_roots(int n, double *x, double *w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index e622bb3..9e83d3d 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -1,5 +1,7 @@
 import numpy as np

+__all__ = ['legendre_transform', 'legendre_roots']
+
 cdef extern from "sharp.h":
     ctypedef long ptrdiff_t

@@ -9,6 +11,7 @@ cdef extern from "sharp.h":
                                     double *out, ptrdiff_t nx)
     void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
     void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
+    void sharp_legendre_roots(int n, double *x, double *w)


 def legendre_transform(x, bl, out=None):
@@ -41,3 +44,13 @@ def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
     sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0],
                                x.shape[0])
     return np.asarray(out)
+
+def legendre_roots(n):
+    x = np.empty(n, np.double)
+    w = np.empty(n, np.double)
+    cdef double[::1] x_buf = x, w_buf = w
+    if not (x_buf.shape[0] == w_buf.shape[0] == n):
+        raise AssertionError()
+    if n > 0:
+        sharp_legendre_roots(n, &x_buf[0], &w_buf[0])
+    return x, w
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 66b4c18..7860d2a 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.special import legendre
+from scipy.special import p_roots

 import libsharp
 from numpy.testing import assert_allclose
@@ -39,3 +40,20 @@ def test_legendre_transform():
     for ntheta in nthetas_to_try:
         for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
             yield check_legendre_transform, lmax, ntheta
+
+def check_legendre_roots(n):
+    xs, ws = ([], []) if n == 0 else p_roots(n)  # from SciPy
+    xl, wl = libsharp.legendre_roots(n)
+    assert_allclose(xs, xl)
+    assert_allclose(ws, wl)
+
+def test_legendre_roots():
+    """
+    Test the Legendre root-finding algorithm from libsharp by comparing it with
+    the SciPy version.
+    """
+    yield check_legendre_roots, 0
+    yield check_legendre_roots, 1
+    yield check_legendre_roots, 32
+    yield check_legendre_roots, 33
+    yield check_legendre_roots, 128
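
For reference, the Newton iteration in sharp_legendre_roots() follows directly from the standard Legendre derivative identity. With P0 = P_n(x0), P_1 = P_{n-1}(x0) and dpdx = P_n'(x0) as in the code, the update and the quadrature weight are

    \[
      (1 - x^2)\,P_n'(x) = n\,\bigl(P_{n-1}(x) - x\,P_n(x)\bigr), \qquad
      x_{k+1} = x_k - \frac{P_n(x_k)}{P_n'(x_k)}, \qquad
      w_i = \frac{2}{(1 - x_i^2)\,P_n'(x_i)^2},
    \]

which is exactly what the lines computing dpdx, the Newton step and w[i-1] implement; one_minus_x2() only evaluates 1 - x^2 in the numerically safer of two algebraically equal forms.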
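The vectorized kernels in sharp_legendre.c are easiest to check against a scalar reference. The following minimal sketch (illustrative only; legendre_transform_scalar is not part of the library) spells out the rearranged Bonnet recurrence the kernels implement, with recfac[l] = (l-1)/l as filled in by sharp_legendre_transform_recfac():

    #include <stddef.h>

    /* out[i] = sum_{l=0}^{lmax} bl[l] * P_l(x[i]), scalar reference */
    static void legendre_transform_scalar(const double *bl, ptrdiff_t lmax,
                                          const double *x, double *out,
                                          ptrdiff_t nx)
    {
      for (ptrdiff_t i = 0; i < nx; ++i) {
        double Pm1 = 1.0;          /* P_0(x) */
        double P = x[i];           /* P_1(x) */
        double y = bl[0] * Pm1;
        if (lmax >= 1) y += bl[1] * P;
        for (ptrdiff_t l = 2; l <= lmax; ++l) {
          double Pm2 = Pm1; Pm1 = P;
          /* P_l = x*P_{l-1} + ((l-1)/l) * (x*P_{l-1} - P_{l-2}) */
          P = x[i]*Pm1 + ((double)(l - 1)/(double)l) * (x[i]*Pm1 - Pm2);
          y += bl[l] * P;
        }
        out[i] = y;
      }
    }

The rearranged form costs one multiply, one subtract and two fused multiply-adds per l, which is why the kernels map it onto vmul/vsub/vfmaeq rather than using the textbook three-term recurrence directly.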
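A usage sketch for the double-precision entry point (a hypothetical standalone program; assumes the libsharp headers and library are on the compiler's search paths). Passing recfac == NULL makes sharp_legendre_transform() allocate and fill the recurrence factors itself, as the compute_recfac branch above shows; the inputs mirror the Fortran test:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "sharp.h"

    int main(void)
    {
      ptrdiff_t lmax = 20, nx = 10;
      double *bl  = malloc((lmax + 1) * sizeof(double));
      double *x   = malloc(nx * sizeof(double));
      double *out = malloc(nx * sizeof(double));
      for (ptrdiff_t l = 0; l <= lmax; ++l) bl[l] = 1.0 / (l + 1); /* b_l = 1/(l+1) */
      for (ptrdiff_t i = 0; i < nx; ++i)   x[i]  = 1.0 / (i + 1);  /* x_i = 1/i */
      sharp_legendre_transform(bl, NULL, lmax, x, out, nx); /* NULL: recfac computed internally */
      for (ptrdiff_t i = 0; i < nx; ++i) printf("%g\n", out[i]);
      free(bl); free(x); free(out);
      return 0;
    }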
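Similarly for the newly exposed sharp_legendre_roots(): an n-point Gauss-Legendre rule integrates polynomials up to degree 2n-1 exactly on [-1, 1], so n = 3 already handles x^4. A hypothetical check program, same assumptions as above (the exact integral is 2/5):

    #include <stdio.h>
    #include "sharp.h"

    int main(void)
    {
      enum { n = 3 };
      double x[n], w[n], sum = 0.0;
      sharp_legendre_roots(n, x, w);        /* nodes and weights on [-1, 1] */
      for (int i = 0; i < n; ++i)
        sum += w[i] * x[i]*x[i]*x[i]*x[i];  /* quadrature of f(x) = x^4 */
      printf("integral = %.15g (exact 0.4)\n", sum);
      return 0;
    }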