From f2fe4f9ca2177803008d70e58553c91c183c4930 Mon Sep 17 00:00:00 2001
From: Dag Sverre Seljebotn
Date: Thu, 23 Apr 2015 09:57:45 +0200
Subject: [PATCH] legendre transforms: Polishing and bugfixes

vbroadcast didn't work properly for some reason, vload does..
---
 libsharp/sharp_legendre.c              | 274 +++++++++++++------------
 libsharp/sharp_legendre.c.in           |  38 ++--
 libsharp/sharp_vecsupport.h            |  15 +-
 libsharp/sharp_vecutil.h               |   8 +
 python/libsharp/libsharp.pyx           |   4 +-
 python/libsharp/tests/test_legendre.py |  26 ++-
 6 files changed, 204 insertions(+), 161 deletions(-)
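[Note for readers: every legendre_transform_vecN kernel touched below evaluates f(x_i) = sum_{l=0}^{lmax} b_l P_l(x_i) with the recurrence quoted in the inline comments, P = x*Pm1 + recfacs[l]*(x*Pm1 - Pm2). A minimal scalar sketch of what a single SIMD lane computes; ref_legendre_transform is a hypothetical name, not part of the patch, and recfac[l] = (l-1)/l is assumed, which is the value that makes this form equal to the standard three-term Legendre recurrence.]

    #include <stddef.h>

    /* Hypothetical scalar reference for one abscissa x; the vectorized
       kernels in the patch run the same recurrence on VLEN lanes at once.
       Assumes recfac[l] = (l - 1) / l, so that
           P_l = x*P_{l-1} + recfac[l]*(x*P_{l-1} - P_{l-2})
       equals l*P_l = (2l-1)*x*P_{l-1} - (l-1)*P_{l-2}. */
    static double ref_legendre_transform(const double *bl, ptrdiff_t lmax, double x)
    {
        double Pm2, Pm1 = 1.0, P = x;   /* P_0(x) = 1, P_1(x) = x */
        double y = bl[0] * Pm1;
        if (lmax >= 1) y += bl[1] * P;
        for (ptrdiff_t l = 2; l <= lmax; ++l) {
            double recfac = (double)(l - 1) / (double)l;
            Pm2 = Pm1; Pm1 = P;
            P = x * Pm1 + recfac * (x * Pm1 - Pm2);  /* same form as the kernels */
            y += bl[l] * P;
        }
        return y;
    }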
diff --git a/libsharp/sharp_legendre.c b/libsharp/sharp_legendre.c
index 8b4fe4b..cd54101 100644
--- a/libsharp/sharp_legendre.c
+++ b/libsharp/sharp_legendre.c
@@ -1,4 +1,4 @@
-/* DO NOT EDIT. md5sum of source: 88eec944ab6fbfdc7f391fbb58df3bf1 *//*
+/* DO NOT EDIT. md5sum of source: d9499375a254cbf1e7903a249a8676ff *//*
 
 NOTE NOTE NOTE
 
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */
 
+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"
 
@@ -66,18 +81,18 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
            P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax,
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
@@ -113,17 +128,17 @@ static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);
 
     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0); P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
@@ -131,8 +146,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
            P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -142,16 +157,16 @@
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
 
         vfmaeq(y1, P_1, b);
 
@@ -180,23 +195,23 @@ static void legendre_transform_vec3(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);
 
     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0); P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);
 
     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0); P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
@@ -206,8 +221,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
            P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -217,24 +232,24 @@
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
 
         vfmaeq(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
 
         vfmaeq(y2, P_2, b);
@@ -267,29 +282,29 @@ static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);
 
     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0); P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);
 
     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0); P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);
 
     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0); P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
@@ -301,8 +316,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -312,32 +327,32 @@
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
 
         vfmaeq(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
 
         vfmaeq(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
 
         vfmaeq(y3, P_3, b);
 
@@ -374,35 +389,35 @@ static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
    y0 = vmul(Pm1_0, b);
 
     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0); P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);
 
     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0); P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);
 
     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0); P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);
 
     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0); P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
@@ -416,8 +431,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -427,40 +442,40 @@
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
 
         vfmaeq(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
 
         vfmaeq(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
 
         vfmaeq(y3, P_3, b);
 
         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
 
         vfmaeq(y4, P_4, b);
@@ -501,41 +516,41 @@ static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax,
     x0 = vloadu(xarr + 0 * VLEN);
     Pm1_0 = vload(1.0); P_0 = x0;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y0 = vmul(Pm1_0, b);
 
     x1 = vloadu(xarr + 1 * VLEN);
     Pm1_1 = vload(1.0); P_1 = x1;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y1 = vmul(Pm1_1, b);
 
     x2 = vloadu(xarr + 2 * VLEN);
     Pm1_2 = vload(1.0); P_2 = x2;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y2 = vmul(Pm1_2, b);
 
     x3 = vloadu(xarr + 3 * VLEN);
     Pm1_3 = vload(1.0); P_3 = x3;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y3 = vmul(Pm1_3, b);
 
     x4 = vloadu(xarr + 4 * VLEN);
     Pm1_4 = vload(1.0); P_4 = x4;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y4 = vmul(Pm1_4, b);
 
     x5 = vloadu(xarr + 5 * VLEN);
     Pm1_5 = vload(1.0); P_5 = x5;
-    b = vbroadcast(bl);
+    b = vload(*bl);
     y5 = vmul(Pm1_5, b);
 
-    b = vbroadcast(bl + 1);
+    b = vload(*(bl + 1));
     vfmaeq(y0, P_0, b);
 
@@ -551,8 +566,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast(bl + l);
-        R = vbroadcast(recfacs + l);
+        b = vload(*(bl + l));
+        R = vload(*(recfacs + l));
 
         /*
           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -562,48 +577,48 @@
         W1 = vmul(x0, Pm1_0);
         W2 = W1;
         W2 = vsub(W2, Pm2_0);
-        vfmaeq(W1, W2, R);
         P_0 = W1;
+        vfmaeq(P_0, W2, R);
 
         vfmaeq(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul(x1, Pm1_1);
         W2 = W1;
         W2 = vsub(W2, Pm2_1);
-        vfmaeq(W1, W2, R);
         P_1 = W1;
+        vfmaeq(P_1, W2, R);
 
         vfmaeq(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul(x2, Pm1_2);
         W2 = W1;
         W2 = vsub(W2, Pm2_2);
-        vfmaeq(W1, W2, R);
         P_2 = W1;
+        vfmaeq(P_2, W2, R);
 
         vfmaeq(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul(x3, Pm1_3);
         W2 = W1;
         W2 = vsub(W2, Pm2_3);
-        vfmaeq(W1, W2, R);
         P_3 = W1;
+        vfmaeq(P_3, W2, R);
 
         vfmaeq(y3, P_3, b);
 
         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul(x4, Pm1_4);
         W2 = W1;
         W2 = vsub(W2, Pm2_4);
-        vfmaeq(W1, W2, R);
         P_4 = W1;
+        vfmaeq(P_4, W2, R);
 
         vfmaeq(y4, P_4, b);
 
         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul(x5, Pm1_5);
         W2 = W1;
         W2 = vsub(W2, Pm2_5);
-        vfmaeq(W1, W2, R);
         P_5 = W1;
+        vfmaeq(P_5, W2, R);
 
         vfmaeq(y5, P_5, b);
@@ -638,18 +653,18 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -659,8 +674,8 @@ static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax,
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
@@ -685,17 +700,17 @@ static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0); P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
@@ -703,8 +718,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -714,16 +729,16 @@
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
 
         vfmaeq_s(y1, P_1, b);
 
@@ -752,23 +767,23 @@ static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0); P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);
 
     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0); P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
@@ -778,8 +793,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -789,24 +804,24 @@
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
 
         vfmaeq_s(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
 
         vfmaeq_s(y2, P_2, b);
@@ -839,29 +854,29 @@ static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0); P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);
 
     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0); P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);
 
     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0); P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
@@ -873,8 +888,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -884,32 +899,32 @@
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
 
         vfmaeq_s(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
 
         vfmaeq_s(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
 
         vfmaeq_s(y3, P_3, b);
 
@@ -946,35 +961,35 @@ static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0); P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);
 
     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0); P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);
 
     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0); P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);
 
     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0); P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
@@ -988,8 +1003,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -999,40 +1014,40 @@
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
 
         vfmaeq_s(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
 
         vfmaeq_s(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
 
         vfmaeq_s(y3, P_3, b);
 
         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
 
         vfmaeq_s(y4, P_4, b);
 
@@ -1073,41 +1088,41 @@ static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax,
     x0 = vloadu_s(xarr + 0 * VLEN_s);
     Pm1_0 = vload_s(1.0); P_0 = x0;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y0 = vmul_s(Pm1_0, b);
 
     x1 = vloadu_s(xarr + 1 * VLEN_s);
     Pm1_1 = vload_s(1.0); P_1 = x1;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y1 = vmul_s(Pm1_1, b);
 
     x2 = vloadu_s(xarr + 2 * VLEN_s);
     Pm1_2 = vload_s(1.0); P_2 = x2;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y2 = vmul_s(Pm1_2, b);
 
     x3 = vloadu_s(xarr + 3 * VLEN_s);
     Pm1_3 = vload_s(1.0); P_3 = x3;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y3 = vmul_s(Pm1_3, b);
 
     x4 = vloadu_s(xarr + 4 * VLEN_s);
     Pm1_4 = vload_s(1.0); P_4 = x4;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y4 = vmul_s(Pm1_4, b);
 
     x5 = vloadu_s(xarr + 5 * VLEN_s);
     Pm1_5 = vload_s(1.0); P_5 = x5;
-    b = vbroadcast_s(bl);
+    b = vload_s(*bl);
     y5 = vmul_s(Pm1_5, b);
 
-    b = vbroadcast_s(bl + 1);
+    b = vload_s(*(bl + 1));
     vfmaeq_s(y0, P_0, b);
 
@@ -1123,8 +1138,8 @@
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast_s(bl + l);
-        R = vbroadcast_s(recfacs + l);
+        b = vload_s(*(bl + l));
+        R = vload_s(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -1134,48 +1149,48 @@
         W1 = vmul_s(x0, Pm1_0);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_0);
-        vfmaeq_s(W1, W2, R);
         P_0 = W1;
+        vfmaeq_s(P_0, W2, R);
 
         vfmaeq_s(y0, P_0, b);
 
         Pm2_1 = Pm1_1; Pm1_1 = P_1;
         W1 = vmul_s(x1, Pm1_1);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_1);
-        vfmaeq_s(W1, W2, R);
         P_1 = W1;
+        vfmaeq_s(P_1, W2, R);
 
         vfmaeq_s(y1, P_1, b);
 
         Pm2_2 = Pm1_2; Pm1_2 = P_2;
         W1 = vmul_s(x2, Pm1_2);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_2);
-        vfmaeq_s(W1, W2, R);
         P_2 = W1;
+        vfmaeq_s(P_2, W2, R);
 
         vfmaeq_s(y2, P_2, b);
 
         Pm2_3 = Pm1_3; Pm1_3 = P_3;
         W1 = vmul_s(x3, Pm1_3);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_3);
-        vfmaeq_s(W1, W2, R);
         P_3 = W1;
+        vfmaeq_s(P_3, W2, R);
 
         vfmaeq_s(y3, P_3, b);
 
         Pm2_4 = Pm1_4; Pm1_4 = P_4;
         W1 = vmul_s(x4, Pm1_4);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_4);
-        vfmaeq_s(W1, W2, R);
         P_4 = W1;
+        vfmaeq_s(P_4, W2, R);
 
         vfmaeq_s(y4, P_4, b);
 
         Pm2_5 = Pm1_5; Pm1_5 = P_5;
         W1 = vmul_s(x5, Pm1_5);
         W2 = W1;
         W2 = vsub_s(W2, Pm2_5);
-        vfmaeq_s(W1, W2, R);
         P_5 = W1;
+        vfmaeq_s(P_5, W2, R);
 
         vfmaeq_s(y5, P_5, b);
@@ -1224,24 +1239,21 @@ void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
  */
-
-
-#define CS 4
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
 
 void sharp_legendre_transform(double *bl,
                               double *recfac,
                               ptrdiff_t lmax,
                               double *x,
                               double *out,
                               ptrdiff_t nx) {
-    double xchunk[LEN], outchunk[LEN];
+    double xchunk[MAX_CS * VLEN], outchunk[MAX_CS * LEN];
     int compute_recfac;
     ptrdiff_t i, j, len;
 
     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(double) * (lmax + 1));
+        recfac = malloc(sizeof(double) * (lmax + 1));
         sharp_legendre_transform_recfac(recfac, lmax);
     }
 
@@ -1271,13 +1283,13 @@ void sharp_legendre_transform_s(float *bl,
                                 float *recfac,
                                 ptrdiff_t lmax,
                                 float *x,
                                 float *out,
                                 ptrdiff_t nx) {
-    float xchunk[LEN_s], outchunk[LEN_s];
+    float xchunk[MAX_CS * VLEN_s], outchunk[MAX_CS * LEN_s];
     int compute_recfac;
     ptrdiff_t i, j, len;
 
     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof(float) * (lmax + 1));
+        recfac = malloc(sizeof(float) * (lmax + 1));
         sharp_legendre_transform_recfac_s(recfac, lmax);
     }
 
@@ -1304,4 +1316,4 @@ void sharp_legendre_transform_s(float *bl,
 
 }
 
-
+#endif
\ No newline at end of file
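[A usage sketch for the public entry point whose driver changed above. The prototype is the one visible in the hunks, assumed to be declared in sharp_legendre.h; the compute_recfac branch shows that passing recfac == NULL makes the function allocate and fill the recurrence factors itself. Hypothetical caller, error handling omitted.]

    #include <stddef.h>
    #include "sharp_legendre.h"

    /* Evaluate out[i] = sum_l bl[l] * P_l(x[i]) for i = 0..nx-1.
       recfac == NULL: factors are computed internally (see the
       compute_recfac branch in the driver above). */
    void example_transform(double *bl, ptrdiff_t lmax,
                           double *x, double *out, ptrdiff_t nx)
    {
        sharp_legendre_transform(bl, NULL, lmax, x, out, nx);
    }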
diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in
index e30f787..63c84b7 100644
--- a/libsharp/sharp_legendre.c.in
+++ b/libsharp/sharp_legendre.c.in
@@ -46,6 +46,21 @@
  * \author Dag Sverre Seljebotn
  */
 
+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
 #include "sharp_legendre.h"
 #include "sharp_vecsupport.h"
 
@@ -66,18 +81,18 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
     x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
     Pm1_{{i}} = vload{{T}}(1.0); P_{{i}} = x{{i}};
-    b = vbroadcast{{T}}(bl);
+    b = vload{{T}}(*bl);
     y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
     /*{ endfor }*/
 
-    b = vbroadcast{{T}}(bl + 1);
+    b = vload{{T}}(*(bl + 1));
     /*{ for i in range(cs) }*/
     vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
     /*{ endfor }*/
 
     for (l = 2; l <= lmax; ++l) {
-        b = vbroadcast{{T}}(bl + l);
-        R = vbroadcast{{T}}(recfacs + l);
+        b = vload{{T}}(*(bl + l));
+        R = vload{{T}}(*(recfacs + l));
 
         /*
          P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
@@ -87,8 +102,8 @@ static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *b
         W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
         W2 = W1;
         W2 = vsub{{T}}(W2, Pm2_{{i}});
-        vfmaeq{{T}}(W1, W2, R);
         P_{{i}} = W1;
+        vfmaeq{{T}}(P_{{i}}, W2, R);
 
         vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
         /*{ endfor }*/
 
@@ -117,24 +132,21 @@ void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
    Compute sum_l b_l P_l(x_i) for all i.
  */
-/*{set cs=4}*/
-
-#define CS {{cs}}
-#define LEN (CS * VLEN)
-#define LEN_s (CS * VLEN_s)
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
 
 /*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
 void sharp_legendre_transform{{T}}({{scalar}} *bl,
                                    {{scalar}} *recfac,
                                    ptrdiff_t lmax,
                                    {{scalar}} *x,
                                    {{scalar}} *out,
                                    ptrdiff_t nx) {
-    {{scalar}} xchunk[LEN{{T}}], outchunk[LEN{{T}}];
+    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
     int compute_recfac;
     ptrdiff_t i, j, len;
 
     compute_recfac = (recfac == NULL);
     if (compute_recfac) {
-        recfac = memalign(16, sizeof({{scalar}}) * (lmax + 1));
+        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
         sharp_legendre_transform_recfac{{T}}(recfac, lmax);
     }
 
@@ -160,3 +172,5 @@ void sharp_legendre_transform{{T}}({{scalar}} *bl,
     }
 }
 /*{ endfor }*/
+
+#endif
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 28073ac..ee4c5e7 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -63,8 +63,8 @@ typedef int Tm;
 #define vneg(a) (-(a))
 #define vload(a) (a)
 #define vload_s(a) (a)
-#define vloadu(p) (*p)
-#define vloadu_s(p) (*p)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -72,8 +72,6 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
-#define vbroadcast(p) (*p)
-#define vbroadcast_s(p) (*p)
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)
 
@@ -148,15 +146,13 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vloadu_s(p) _mm_loadu_ps(p)
 #define vstoreu(p, v) _mm_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm_storeu_ps(p, v)
-#define vbroadcast(p) _mm_set_pd(*p, *p)
-#define vbroadcast_s(p) _mm_set_ps(*p, *p, *p, *p)
 
 #endif
 
 #if (VLEN==4)
 
 #include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
 #include <x86intrin.h>
 #endif
 
@@ -180,7 +176,7 @@ typedef __m256d Tm;
 #define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#ifdef __FMA4__
+#if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
@@ -188,6 +184,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@@ -213,8 +210,6 @@ typedef __m256d Tm;
 #define vloadu_s(p) _mm256_loadu_ps(p)
 #define vstoreu(p, v) _mm256_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
-#define vbroadcast(p) _mm256_broadcast_sd(p)
-#define vbroadcast_s(p) _mm256_broadcast_ss(p)
 
 #endif
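[On the AVX path the removed vbroadcast was _mm256_broadcast_sd / _mm256_broadcast_ss, a load-and-broadcast from memory, as the hunk above shows; the vload(*p) form replacing it dereferences first and then splats the scalar. The two should produce identical vectors, which is why the commit message can only report that one of them "didn't work properly for some reason". A standalone sketch of the two forms for doubles, assuming vload maps to _mm256_set1_pd on the VLEN==4 path (its definition lies outside the hunks shown):]

    #include <immintrin.h>

    /* Both functions fill a __m256d with four copies of *p. */
    static __m256d splat_old(const double *p)
    {
        return _mm256_broadcast_sd(p);   /* removed: #define vbroadcast(p) ... */
    }

    static __m256d splat_new(const double *p)
    {
        return _mm256_set1_pd(*p);       /* what b = vload(*(bl + l)) expands to,
                                            assuming vload(a) is _mm256_set1_pd(a) */
    }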
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 0aee4ae..f6161ca 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -52,4 +52,12 @@
 #define VLEN_s (2*VLEN)
 #endif
 
+#ifndef USE_FMA4
+#ifdef __FMA4__
+#define USE_FMA4 1
+#else
+#define USE_FMA4 0
+#endif
+#endif
+
 #endif
diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx
index 0abfe69..e622bb3 100644
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@@ -14,7 +14,9 @@ cdef extern from "sharp.h":
 def legendre_transform(x, bl, out=None):
     if out is None:
         out = np.empty_like(x)
-    if x.dtype == np.float64:
+    if out.shape[0] == 0:
+        return out
+    elif x.dtype == np.float64:
         if bl.dtype != np.float64:
             bl = bl.astype(np.float64)
         return _legendre_transform(x, bl, out=out)
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
index 2e96822..66b4c18 100644
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@@ -2,14 +2,17 @@ import numpy as np
 from scipy.special import legendre
 import libsharp
+from numpy.testing import assert_allclose
 
-def test_legendre_transform():
-    lmax = 20
-    ntheta = 19
+def check_legendre_transform(lmax, ntheta):
     l = np.arange(lmax + 1)
-    bl = np.exp(-l*(l+1))
-    bl *= (2 * l + 1)
+    if lmax >= 1:
+        sigma = -np.log(1e-3) / lmax / (lmax + 1)
+        bl = np.exp(-sigma*l*(l+1))
+        bl *= (2 * l + 1)
+    else:
+        bl = np.asarray([1], dtype=np.double)
 
     theta = np.linspace(0, np.pi, ntheta, endpoint=True)
     x = np.cos(theta)
@@ -20,10 +23,19 @@
         P[:, l] = legendre(l)(x)
 
     y0 = np.dot(P, bl)
 
+    # double-precision
     y = libsharp.legendre_transform(x, bl)
-    assert np.max(np.abs(y - y) / np.abs(y)) < 1e-12
+
+    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)
 
     # single-precision
     y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
-    assert np.max(np.abs(y32 - y) / np.abs(y32)) < 1e-4
+    assert_allclose(y, y0, rtol=1e-5, atol=1e-5)
+
+
+def test_legendre_transform():
+    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
+    for ntheta in nthetas_to_try:
+        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
+            yield check_legendre_transform, lmax, ntheta
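[As an independent cross-check of the quantity the new test computes, NumPy's Legendre module evaluates the same sum directly: np.polynomial.legendre.legval(x, bl) returns sum_l bl[l] * P_l(x). A sketch mirroring the test's lmax = 20 case; not part of the patch:]

    import numpy as np
    import libsharp

    lmax, ntheta = 20, 19
    l = np.arange(lmax + 1)
    sigma = -np.log(1e-3) / lmax / (lmax + 1)
    bl = (2 * l + 1) * np.exp(-sigma * l * (l + 1))
    x = np.cos(np.linspace(0, np.pi, ntheta, endpoint=True))

    y = libsharp.legendre_transform(x, bl)
    y_ref = np.polynomial.legendre.legval(x, bl)  # sum_l bl[l] * P_l(x)
    np.testing.assert_allclose(y, y_ref, rtol=1e-12, atol=1e-12)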