diff --git a/Makefile b/Makefile index 5f2d9c8..dd9931e 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,11 @@ perftest: compile_all $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1 + +# Jinja templates + +%.c: %.c.in + ./runjinja.py < $< > $@ + +genclean: + rm libsharp/sharp_legendre.c || exit 0 diff --git a/libsharp/planck.make b/libsharp/planck.make index 3397422..fcf51df 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD) HDR_$(PKG):=$(SD)/*.h LIB_$(PKG):=$(LIBDIR)/libsharp.a BIN:=sharp_testsuite -LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o +LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o ALLOBJ:=$(LIBOBJ) sharp_testsuite.o LIBOBJ:=$(LIBOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%) diff --git a/libsharp/sharp.h b/libsharp/sharp.h index 9c5dd57..da6fc4a 100644 --- a/libsharp/sharp.h +++ b/libsharp/sharp.h @@ -39,5 +39,6 @@ #include #include "sharp_lowlevel.h" +#include "sharp_legendre.h" #endif diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c new file mode 100644 index 0000000..75518d6 --- /dev/null +++ b/libsharp/sharp_legendre.c @@ -0,0 +1,1305 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! 
\file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + + + +static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(1) * VLEN], + double out[(1) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + +} + +static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(2) * VLEN], + double out[(2) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + +} + +static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(3) * VLEN], + double out[(3) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + +} + +static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(4) * VLEN], + double out[(4) * VLEN]) { + + Tv P_0, 
Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + +} + +static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(5) * VLEN], + double out[(5) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, 
P_4, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + +} + +static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(6) * VLEN], + double out[(6) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv P_5, Pm1_5, Pm2_5, x5, y5; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vbroadcast(bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vbroadcast(bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vbroadcast(bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vbroadcast(bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vbroadcast(bl); + y4 = vmul(Pm1_4, b); + + x5 = vloadu(xarr + 5 * VLEN); + Pm1_5 = vload(1.0); + P_5 = x5; + b = vbroadcast(bl); + y5 = vmul(Pm1_5, b); + + + b = vbroadcast(bl + 1); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + vfmaeq(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast(bl + l); + R = vbroadcast(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + vfmaeq(W1, W2, R); + P_0 = W1; + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + vfmaeq(W1, W2, R); + P_1 = W1; + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + vfmaeq(W1, W2, R); + P_2 = W1; + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + vfmaeq(W1, W2, R); + P_3 = W1; + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + vfmaeq(W1, W2, R); + P_4 = W1; + vfmaeq(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul(x5, Pm1_5); + W2 = W1; + W2 = vsub(W2, Pm2_5); + vfmaeq(W1, W2, R); + P_5 = W1; + vfmaeq(y5, P_5, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + + vstoreu(out + 5 * VLEN, y5); + +} + + + +static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(1) * VLEN_s], + float out[(1) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + +} + +static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(2) * VLEN_s], + float out[(2) * 
VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + +} + +static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(3) * VLEN_s], + float out[(3) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + +} + +static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(4) * VLEN_s], + float out[(4) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs 
+ l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + +} + +static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(5) * VLEN_s], + float out[(5) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + +} + +static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(6) * VLEN_s], + float out[(6) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s P_5, Pm1_5, Pm2_5, x5, y5; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = 
vload_s(1.0); + P_0 = x0; + b = vbroadcast_s(bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vbroadcast_s(bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vbroadcast_s(bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vbroadcast_s(bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vbroadcast_s(bl); + y4 = vmul_s(Pm1_4, b); + + x5 = vloadu_s(xarr + 5 * VLEN_s); + Pm1_5 = vload_s(1.0); + P_5 = x5; + b = vbroadcast_s(bl); + y5 = vmul_s(Pm1_5, b); + + + b = vbroadcast_s(bl + 1); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + vfmaeq_s(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast_s(bl + l); + R = vbroadcast_s(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + vfmaeq_s(W1, W2, R); + P_0 = W1; + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + vfmaeq_s(W1, W2, R); + P_1 = W1; + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + vfmaeq_s(W1, W2, R); + P_2 = W1; + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + vfmaeq_s(W1, W2, R); + P_3 = W1; + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + vfmaeq_s(W1, W2, R); + P_4 = W1; + vfmaeq_s(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul_s(x5, Pm1_5); + W2 = W1; + W2 = vsub_s(W2, Pm2_5); + vfmaeq_s(W1, W2, R); + P_5 = W1; + vfmaeq_s(y5, P_5, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + + vstoreu_s(out + 5 * VLEN_s, y5); + +} + + + + + +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (double)(l - 1) / (double)l; + } +} + +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (float)(l - 1) / (float)l; + } +} + + +/* + Compute sum_l b_l P_l(x_i) for all i. + */ + + + +#define CS 4 +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + + +void sharp_legendre_transform(double *bl, + double *recfac, + ptrdiff_t lmax, + double *x, double *out, ptrdiff_t nx) { + double xchunk[LEN], outchunk[LEN]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(double) * (lmax + 1)); + sharp_legendre_transform_recfac(recfac, lmax); + } + + for (j = 0; j != LEN; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN) { + len = (i + (LEN) <= nx) ? 
(LEN) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN - 1) / VLEN) { + case 6: legendre_transform_vec6(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + +void sharp_legendre_transform_s(float *bl, + float *recfac, + ptrdiff_t lmax, + float *x, float *out, ptrdiff_t nx) { + float xchunk[LEN_s], outchunk[LEN_s]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof(float) * (lmax + 1)); + sharp_legendre_transform_recfac_s(recfac, lmax); + } + + for (j = 0; j != LEN_s; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN_s) { + len = (i + (LEN_s) <= nx) ? (LEN_s) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN_s - 1) / VLEN_s) { + case 6: legendre_transform_vec6_s(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5_s(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4_s(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3_s(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2_s(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1_s(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in new file mode 100644 index 0000000..9862127 --- /dev/null +++ b/libsharp/sharp_legendre.c.in @@ -0,0 +1,163 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +/*{ for cs in range(1, 7) }*/ +static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax, + {{scalar}} xarr[({{cs}}) * VLEN{{T}}], + {{scalar}} out[({{cs}}) * VLEN{{T}}]) { + /*{ for i in range(cs) }*/ + Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}}; + /*{ endfor }*/ + Tv{{T}} W1, W2, b, R; + ptrdiff_t l; + + /*{ for i in range(cs) }*/ + x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}}); + Pm1_{{i}} = vload{{T}}(1.0); + P_{{i}} = x{{i}}; + b = vbroadcast{{T}}(bl); + y{{i}} = vmul{{T}}(Pm1_{{i}}, b); + /*{ endfor }*/ + + b = vbroadcast{{T}}(bl + 1); + /*{ for i in range(cs) }*/ + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + for (l = 2; l <= lmax; ++l) { + b = vbroadcast{{T}}(bl + l); + R = vbroadcast{{T}}(recfacs + l); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + /*{ for i in range(cs) }*/ + Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}}; + W1 = vmul{{T}}(x{{i}}, Pm1_{{i}}); + W2 = W1; + W2 = vsub{{T}}(W2, Pm2_{{i}}); + vfmaeq{{T}}(W1, W2, R); + P_{{i}} = W1; + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + } + /*{ for i in range(cs) }*/ + vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}}); + /*{ endfor }*/ +} +/*{ endfor }*/ +/*{ endfor }*/ + + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = ({{scalar}})(l - 1) / ({{scalar}})l; + } +} +/*{ endfor }*/ + +/* + Compute sum_l b_l P_l(x_i) for all i. + */ + +/*{set cs=4}*/ + +#define CS {{cs}} +#define LEN (CS * VLEN) +#define LEN_s (CS * VLEN_s) + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform{{T}}({{scalar}} *bl, + {{scalar}} *recfac, + ptrdiff_t lmax, + {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) { + {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1)); + sharp_legendre_transform_recfac{{T}}(recfac, lmax); + } + + for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN{{T}}) { + len = (i + (LEN{{T}}) <= nx) ? 
(LEN{{T}}) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) { + case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} +/*{ endfor }*/ + diff --git a/libsharp/sharp_legendre.h b/libsharp/sharp_legendre.h new file mode 100644 index 0000000..cfd8aee --- /dev/null +++ b/libsharp/sharp_legendre.h @@ -0,0 +1,62 @@ +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.h + * Interface for the Legendre transform parts of the spherical transform library. 
+ * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#ifndef SHARP_LEGENDRE_H +#define SHARP_LEGENDRE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NO_LEGENDRE + +void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x, + double *out, ptrdiff_t nx); +void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x, + float *out, ptrdiff_t nx); +void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax); +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index b26a539..28073ac 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -40,23 +40,31 @@ typedef double Ts; #if (VLEN==1) typedef double Tv; +typedef float Tv_s; typedef int Tm; #define vadd(a,b) ((a)+(b)) +#define vadd_s(a,b) ((a)+(b)) #define vaddeq(a,b) ((a)+=(b)) #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b); #define vsub(a,b) ((a)-(b)) +#define vsub_s(a,b) ((a)-(b)) #define vsubeq(a,b) ((a)-=(b)) #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b); #define vmul(a,b) ((a)*(b)) +#define vmul_s(a,b) ((a)*(b)) #define vmuleq(a,b) ((a)*=(b)) #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b); #define vfmaeq(a,b,c) ((a)+=(b)*(c)) +#define vfmaeq_s(a,b,c) ((a)+=(b)*(c)) #define vfmseq(a,b,c) ((a)-=(b)*(c)) #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e)) #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e)) #define vneg(a) (-(a)) #define vload(a) (a) +#define vload_s(a) (a) +#define vloadu(p) (*p) +#define vloadu_s(p) (*p) #define vabs(a) fabs(a) #define vsqrt(a) sqrt(a) #define vlt(a,b) ((a)<(b)) @@ -64,6 +72,10 @@ typedef int Tm; #define vge(a,b) ((a)>=(b)) #define vne(a,b) ((a)!=(b)) #define vand_mask(a,b) ((a)&&(b)) +#define vbroadcast(p) (*p) +#define vbroadcast_s(p) (*p) +#define vstoreu(p, a) (*(p)=a) +#define vstoreu_s(p, a) (*(p)=a) static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } @@ -87,6 +99,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } #endif typedef __m128d Tv; +typedef __m128 Tv_s; typedef __m128d Tm; #if defined(__SSE4_1__) @@ -99,15 +112,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vone _mm_set1_pd(1.) 
#define vadd(a,b) _mm_add_pd(a,b) +#define vadd_s(a,b) _mm_add_ps(a,b) #define vaddeq(a,b) a=_mm_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm_sub_pd(a,b) +#define vsub_s(a,b) _mm_sub_ps(a,b) #define vsubeq(a,b) a=_mm_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm_mul_pd(a,b) +#define vmul_s(a,b) _mm_mul_ps(a,b) #define vmuleq(a,b) a=_mm_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone)) #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c)) +#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c)) #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c)) #define vfmaaeq(a,b,c,d,e) \ a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) @@ -115,6 +132,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) #define vload(a) _mm_set1_pd(a) +#define vload_s(a) _mm_set1_ps(a) #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) #define vsqrt(a) _mm_sqrt_pd(a) #define vlt(a,b) _mm_cmplt_pd(a,b) @@ -126,6 +144,12 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vallTrue(a) (_mm_movemask_pd(a)==3) +#define vloadu(p) _mm_loadu_pd(p) +#define vloadu_s(p) _mm_loadu_ps(p) +#define vstoreu(p, v) _mm_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm_storeu_ps(p, v) +#define vbroadcast(p) _mm_set_pd(*p, *p) +#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p) #endif @@ -137,6 +161,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #endif typedef __m256d Tv; +typedef __m256 Tv_s; typedef __m256d Tm; #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) @@ -144,16 +169,20 @@ typedef __m256d Tm; #define vone _mm256_set1_pd(1.) 
#define vadd(a,b) _mm256_add_pd(a,b) +#define vadd_s(a,b) _mm256_add_ps(a,b) #define vaddeq(a,b) a=_mm256_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero)) #define vsub(a,b) _mm256_sub_pd(a,b) +#define vsub_s(a,b) _mm256_sub_ps(a,b) #define vsubeq(a,b) a=_mm256_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero)) #define vmul(a,b) _mm256_mul_pd(a,b) +#define vmul_s(a,b) _mm256_mul_ps(a,b) #define vmuleq(a,b) a=_mm256_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone)) #ifdef __FMA4__ #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) +#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) @@ -167,6 +196,7 @@ typedef __m256d Tm; #endif #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vload(a) _mm256_set1_pd(a) +#define vload_s(a) _mm256_set1_ps(a) #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) #define vsqrt(a) _mm256_sqrt_pd(a) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) @@ -179,6 +209,13 @@ typedef __m256d Tm; #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vallTrue(a) (_mm256_movemask_pd(a)==15) +#define vloadu(p) _mm256_loadu_pd(p) +#define vloadu_s(p) _mm256_loadu_ps(p) +#define vstoreu(p, v) _mm256_storeu_pd(p, v) +#define vstoreu_s(p, v) _mm256_storeu_ps(p, v) +#define vbroadcast(p) _mm256_broadcast_sd(p) +#define vbroadcast_s(p) _mm256_broadcast_ss(p) + #endif #if (VLEN==8) diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index 16bfa13..0aee4ae 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -32,6 +32,8 @@ #ifndef SHARP_VECUTIL_H #define SHARP_VECUTIL_H +#ifndef VLEN + #if (defined (__MIC__)) #define VLEN 8 #elif (defined (__AVX__)) @@ -43,3 +45,11 @@ #endif #endif + +#if (VLEN==1) +#define VLEN_s 1 +#else +#define VLEN_s (2*VLEN) +#endif + +#endif diff --git a/runjinja.py b/runjinja.py new file mode 100755 index 0000000..5269c1a --- /dev/null +++ b/runjinja.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +""" +Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT. +""" + +import sys +from jinja2 import Template, Environment + +env = Environment(block_start_string='/*{', + block_end_string='}*/', + variable_start_string='{{', + variable_end_string='}}') + +extra_vars = dict(len=len) +sys.stdout.write(env.from_string(sys.stdin.read()).render(**extra_vars))
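
As a quick illustration of the preprocessing step wired up above (the new `%.c: %.c.in` Makefile rule pipes templates through `./runjinja.py`): runjinja.py configures Jinja2 so that block tags look like C comments (`/*{ ... }*/`) and only `{{ ... }}` substitutions remain, which is why sharp_legendre.c.in still reads as C. The fragment below is illustrative only, not taken from the patch:

/* example.c.in -- illustrative fragment, not part of the patch */
/*{ for i in range(2) }*/
y{{i}} = vmul(x{{i}}, b);
/*{ endfor }*/

/* Running `./runjinja.py < example.c.in > example.c` (what the %.c: %.c.in rule
   does) produces, apart from the whitespace left where the block tags were: */
y0 = vmul(x0, b);
y1 = vmul(x1, b);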
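
The entry points declared in sharp_legendre.h can be exercised as in the minimal sketch below (not part of the patch; the lmax/nx values and coefficients are arbitrary choices for illustration). It cross-checks the vectorized kernels against a scalar version of the same recurrence, P_l = x*P_{l-1} + ((l-1)/l)*(x*P_{l-1} - P_{l-2}), which is exactly what the recfacs table encodes. Passing recfac == NULL makes the library build that table itself via sharp_legendre_transform_recfac(); when many transforms share the same lmax, the table can instead be precomputed once and passed in to avoid the per-call allocation.

/* Usage sketch, not part of the patch. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "sharp_legendre.h"

int main(void) {
  const ptrdiff_t lmax = 50, nx = 1000;   /* arbitrary example sizes */
  double *bl = malloc((lmax + 1) * sizeof(double));
  double *x = malloc(nx * sizeof(double));
  double *out = malloc(nx * sizeof(double));
  double maxerr = 0.0;
  ptrdiff_t l, i;

  for (l = 0; l <= lmax; ++l) bl[l] = 1.0 / (1.0 + l);        /* arbitrary b_l */
  for (i = 0; i < nx; ++i) x[i] = -1.0 + 2.0 * i / (nx - 1);  /* x_i in [-1, 1] */

  /* recfac == NULL: the library computes the (l-1)/l table internally. */
  sharp_legendre_transform(bl, NULL, lmax, x, out, nx);

  /* Scalar reference using the same recurrence as the vector kernels. */
  for (i = 0; i < nx; ++i) {
    double Pm2, Pm1 = 1.0, P = x[i];
    double y = bl[0] * Pm1 + bl[1] * P, err;
    for (l = 2; l <= lmax; ++l) {
      Pm2 = Pm1; Pm1 = P;
      P = x[i] * Pm1 + ((double)(l - 1) / l) * (x[i] * Pm1 - Pm2);
      y += bl[l] * P;
    }
    err = fabs(y - out[i]);
    if (err > maxerr) maxerr = err;
  }
  printf("max |vectorized - scalar| = %g\n", maxerr);

  free(bl); free(x); free(out);
  return 0;
}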