diff --git a/.gitignore b/.gitignore index cd0bf21..b555caa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.o +*.so #* *~ +*.pyc +*.pyo /auto /autom4te.cache @@ -9,3 +12,5 @@ /config/config.auto /configure /sharp_oracle.inc + +/python/libsharp/libsharp.c diff --git a/Makefile b/Makefile index 5f2d9c8..a621948 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,18 @@ perftest: compile_all $(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \ $(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1 + +%.c: %.c.in +# Only do this if the md5sum changed, in order to avoid Python and Jinja +# dependency when not modifying the c.in file + grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@ + +genclean: + rm libsharp/sharp_legendre.c || exit 0 + +pytest: + rm python/libsharp/libsharp.so || exit 0 + cd python && LIBSHARP_INCLUDE=$(INCDIR) LIBSHARP_LIB=$(LIBDIR) python setup.py build_ext --inplace + cd python && nosetests libsharp + + diff --git a/fortran/sharp.f90 b/fortran/sharp.f90 index 7dc815d..fb80c3c 100644 --- a/fortran/sharp.f90 +++ b/fortran/sharp.f90 @@ -103,12 +103,32 @@ module sharp type(c_ptr), intent(in) :: alm(*), map(*) end subroutine c_sharp_execute_mpi + ! Legendre transforms + subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) & + bind(c, name='sharp_legendre_transform') + use iso_c_binding + integer(c_ptrdiff_t), value :: lmax, nx + real(c_double) :: bl(lmax + 1), x(nx), out(nx) + real(c_double), optional :: recfac(lmax + 1) + end subroutine c_sharp_legendre_transform + + subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) & + bind(c, name='sharp_legendre_transform_s') + use iso_c_binding + integer(c_ptrdiff_t), value :: lmax, nx + real(c_float) :: bl(lmax + 1), x(nx), out(nx) + real(c_float), optional :: recfac(lmax + 1) + end subroutine c_sharp_legendre_transform_s end interface interface sharp_execute module procedure sharp_execute_d end interface + interface sharp_legendre_transform + module procedure sharp_legendre_transform_d, sharp_legendre_transform_s + end interface sharp_legendre_transform + contains ! 
alm info @@ -240,6 +260,25 @@ contains end if end subroutine sharp_execute_d + subroutine sharp_legendre_transform_d(bl, x, out) + use iso_c_binding + real(c_double) :: bl(:) + real(c_double) :: x(:), out(size(x)) + !-- + integer(c_ptrdiff_t) :: lmax, nx + call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), & + x=x, out=out, nx=int(size(x), c_ptrdiff_t)) + end subroutine sharp_legendre_transform_d + + subroutine sharp_legendre_transform_s(bl, x, out) + use iso_c_binding + real(c_float) :: bl(:) + real(c_float) :: x(:), out(size(x)) + !-- + integer(c_ptrdiff_t) :: lmax, nx + call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_ptrdiff_t), & + x=x, out=out, nx=int(size(x), c_ptrdiff_t)) + end subroutine sharp_legendre_transform_s end module diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90 index f42cf0c..0b7cce2 100644 --- a/fortran/test_sharp.f90 +++ b/fortran/test_sharp.f90 @@ -54,4 +54,31 @@ program test_sharp print *, 'DONE' call MPI_Finalize(ierr) + print *, 'LEGENDRE TRANSFORMS' + + call test_legendre_transforms() + +contains + subroutine test_legendre_transforms() + integer, parameter :: lmax = 20, nx=10 + real(c_double) :: bl(0:lmax) + real(c_double) :: x(nx), out(nx) + real(c_float) :: out_s(nx) + !-- + integer :: l, i + + do l = 0, lmax + bl(l) = 1.0 / real(l + 1, c_double) + end do + do i = 1, nx + x(i) = 1 / real(i, c_double) + end do + out = 0 + call sharp_legendre_transform(bl, x, out) + print *, out + call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s) + print *, out_s + end subroutine test_legendre_transforms + + end program test_sharp diff --git a/libsharp/planck.make b/libsharp/planck.make index 3397422..8bbd7e1 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD) HDR_$(PKG):=$(SD)/*.h LIB_$(PKG):=$(LIBDIR)/libsharp.a BIN:=sharp_testsuite -LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o +LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o ALLOBJ:=$(LIBOBJ) sharp_testsuite.o LIBOBJ:=$(LIBOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%) diff --git a/libsharp/sharp.h b/libsharp/sharp.h index 9c5dd57..4497dd4 100644 --- a/libsharp/sharp.h +++ b/libsharp/sharp.h @@ -39,5 +39,7 @@ #include #include "sharp_lowlevel.h" +#include "sharp_legendre.h" +#include "sharp_legendre_roots.h" #endif diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c index cd088b3..dbb44e0 100644 --- a/libsharp/sharp_geomhelpers.c +++ b/libsharp/sharp_geomhelpers.c @@ -32,6 +32,7 @@ #include #include "sharp_geomhelpers.h" +#include "sharp_legendre_roots.h" #include "c_utils.h" #include "ls_fft.h" #include @@ -106,69 +107,6 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride, sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info); } -static inline double one_minus_x2 (double x) - { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; } - -/* Function adapted from GNU GSL file glfixed.c - Original author: Pavel Holoborodko (http://www.holoborodko.com) - - Adjustments by M. 
Reinecke - - adjusted interface (keep epsilon internal, return full number of points) - - removed precomputed tables - - tweaked Newton iteration to obtain higher accuracy */ -static void gauss_legendre_tbl(int n, double *x, double *w) - { - const double pi = 3.141592653589793238462643383279502884197; - const double eps = 3e-14; - int m = (n+1)>>1; - - double t0 = 1 - (1-1./n) / (8.*n*n); - double t1 = 1./(4.*n+2.); - -#pragma omp parallel -{ - int i; -#pragma omp for schedule(dynamic,100) - for (i=1; i<=m; ++i) - { - double x0 = cos(pi * ((i<<2)-1) * t1) * t0; - - int dobreak=0; - int j=0; - double dpdx; - while(1) - { - double P_1 = 1.0; - double P0 = x0; - double dx, x1; - - for (int k=2; k<=n; k++) - { - double P_2 = P_1; - P_1 = P0; -// P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k; - P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2); - } - - dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0); - - /* Newton step */ - x1 = x0 - P0/dpdx; - dx = x0-x1; - x0 = x1; - if (dobreak) break; - - if (fabs(dx)<=eps) dobreak=1; - UTIL_ASSERT(++j<100,"convergence problem"); - } - - x[i-1] = -x0; - x[n-i] = x0; - w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx); - } -} // end of parallel region - } - void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0, int stride_lon, int stride_lat, sharp_geom_info **geom_info) { @@ -181,7 +119,7 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0, ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings); int *stride_=RALLOC(int,nrings); - gauss_legendre_tbl(nrings,theta,weight); + sharp_legendre_roots(nrings,theta,weight); for (int m=0; m MAX_CS) +#error (SHARP_LEGENDRE_CS > MAX_CS) +#endif + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + + + +static void legendre_transform_vec1(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(1) * VLEN], + double out[(1) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + +} + +static void legendre_transform_vec2(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(2) * VLEN], + double out[(2) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vload(*bl); + y1 = vmul(Pm1_1, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + P_1 = W1; + vfmaeq(P_1, W2, R); + vfmaeq(y1, P_1, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + +} + +static void legendre_transform_vec3(double *recfacs, double *bl, 
ptrdiff_t lmax, + double xarr[(3) * VLEN], + double out[(3) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vload(*bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vload(*bl); + y2 = vmul(Pm1_2, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + P_1 = W1; + vfmaeq(P_1, W2, R); + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + P_2 = W1; + vfmaeq(P_2, W2, R); + vfmaeq(y2, P_2, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + +} + +static void legendre_transform_vec4(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(4) * VLEN], + double out[(4) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vload(*bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vload(*bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vload(*bl); + y3 = vmul(Pm1_3, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + P_1 = W1; + vfmaeq(P_1, W2, R); + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + P_2 = W1; + vfmaeq(P_2, W2, R); + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + P_3 = W1; + vfmaeq(P_3, W2, R); + vfmaeq(y3, P_3, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + +} + +static void legendre_transform_vec5(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(5) * VLEN], + double out[(5) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + x1 = 
vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vload(*bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vload(*bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vload(*bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vload(*bl); + y4 = vmul(Pm1_4, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + P_1 = W1; + vfmaeq(P_1, W2, R); + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + P_2 = W1; + vfmaeq(P_2, W2, R); + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + P_3 = W1; + vfmaeq(P_3, W2, R); + vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + P_4 = W1; + vfmaeq(P_4, W2, R); + vfmaeq(y4, P_4, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + +} + +static void legendre_transform_vec6(double *recfacs, double *bl, ptrdiff_t lmax, + double xarr[(6) * VLEN], + double out[(6) * VLEN]) { + + Tv P_0, Pm1_0, Pm2_0, x0, y0; + + Tv P_1, Pm1_1, Pm2_1, x1, y1; + + Tv P_2, Pm1_2, Pm2_2, x2, y2; + + Tv P_3, Pm1_3, Pm2_3, x3, y3; + + Tv P_4, Pm1_4, Pm2_4, x4, y4; + + Tv P_5, Pm1_5, Pm2_5, x5, y5; + + Tv W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu(xarr + 0 * VLEN); + Pm1_0 = vload(1.0); + P_0 = x0; + b = vload(*bl); + y0 = vmul(Pm1_0, b); + + x1 = vloadu(xarr + 1 * VLEN); + Pm1_1 = vload(1.0); + P_1 = x1; + b = vload(*bl); + y1 = vmul(Pm1_1, b); + + x2 = vloadu(xarr + 2 * VLEN); + Pm1_2 = vload(1.0); + P_2 = x2; + b = vload(*bl); + y2 = vmul(Pm1_2, b); + + x3 = vloadu(xarr + 3 * VLEN); + Pm1_3 = vload(1.0); + P_3 = x3; + b = vload(*bl); + y3 = vmul(Pm1_3, b); + + x4 = vloadu(xarr + 4 * VLEN); + Pm1_4 = vload(1.0); + P_4 = x4; + b = vload(*bl); + y4 = vmul(Pm1_4, b); + + x5 = vloadu(xarr + 5 * VLEN); + Pm1_5 = vload(1.0); + P_5 = x5; + b = vload(*bl); + y5 = vmul(Pm1_5, b); + + + b = vload(*(bl + 1)); + + vfmaeq(y0, P_0, b); + + vfmaeq(y1, P_1, b); + + vfmaeq(y2, P_2, b); + + vfmaeq(y3, P_3, b); + + vfmaeq(y4, P_4, b); + + vfmaeq(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload(*(bl + l)); + R = vload(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul(x0, Pm1_0); + W2 = W1; + W2 = vsub(W2, Pm2_0); + P_0 = W1; + vfmaeq(P_0, W2, R); + vfmaeq(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul(x1, Pm1_1); + W2 = W1; + W2 = vsub(W2, Pm2_1); + P_1 = W1; + vfmaeq(P_1, W2, R); + vfmaeq(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul(x2, Pm1_2); + W2 = W1; + W2 = vsub(W2, Pm2_2); + P_2 = W1; + vfmaeq(P_2, W2, R); + vfmaeq(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul(x3, Pm1_3); + W2 = W1; + W2 = vsub(W2, Pm2_3); + P_3 = W1; + vfmaeq(P_3, W2, R); + 
vfmaeq(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul(x4, Pm1_4); + W2 = W1; + W2 = vsub(W2, Pm2_4); + P_4 = W1; + vfmaeq(P_4, W2, R); + vfmaeq(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul(x5, Pm1_5); + W2 = W1; + W2 = vsub(W2, Pm2_5); + P_5 = W1; + vfmaeq(P_5, W2, R); + vfmaeq(y5, P_5, b); + + + } + + vstoreu(out + 0 * VLEN, y0); + + vstoreu(out + 1 * VLEN, y1); + + vstoreu(out + 2 * VLEN, y2); + + vstoreu(out + 3 * VLEN, y3); + + vstoreu(out + 4 * VLEN, y4); + + vstoreu(out + 5 * VLEN, y5); + +} + + + +static void legendre_transform_vec1_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(1) * VLEN_s], + float out[(1) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + +} + +static void legendre_transform_vec2_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(2) * VLEN_s], + float out[(2) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vload_s(*bl); + y1 = vmul_s(Pm1_1, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + P_1 = W1; + vfmaeq_s(P_1, W2, R); + vfmaeq_s(y1, P_1, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + +} + +static void legendre_transform_vec3_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(3) * VLEN_s], + float out[(3) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vload_s(*bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vload_s(*bl); + y2 = vmul_s(Pm1_2, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + 
P_1 = W1; + vfmaeq_s(P_1, W2, R); + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + P_2 = W1; + vfmaeq_s(P_2, W2, R); + vfmaeq_s(y2, P_2, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + +} + +static void legendre_transform_vec4_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(4) * VLEN_s], + float out[(4) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vload_s(*bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vload_s(*bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vload_s(*bl); + y3 = vmul_s(Pm1_3, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + P_1 = W1; + vfmaeq_s(P_1, W2, R); + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + P_2 = W1; + vfmaeq_s(P_2, W2, R); + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + P_3 = W1; + vfmaeq_s(P_3, W2, R); + vfmaeq_s(y3, P_3, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + +} + +static void legendre_transform_vec5_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(5) * VLEN_s], + float out[(5) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vload_s(*bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vload_s(*bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vload_s(*bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vload_s(*bl); + y4 = vmul_s(Pm1_4, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = 
vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + P_1 = W1; + vfmaeq_s(P_1, W2, R); + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + P_2 = W1; + vfmaeq_s(P_2, W2, R); + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + P_3 = W1; + vfmaeq_s(P_3, W2, R); + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + P_4 = W1; + vfmaeq_s(P_4, W2, R); + vfmaeq_s(y4, P_4, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + +} + +static void legendre_transform_vec6_s(float *recfacs, float *bl, ptrdiff_t lmax, + float xarr[(6) * VLEN_s], + float out[(6) * VLEN_s]) { + + Tv_s P_0, Pm1_0, Pm2_0, x0, y0; + + Tv_s P_1, Pm1_1, Pm2_1, x1, y1; + + Tv_s P_2, Pm1_2, Pm2_2, x2, y2; + + Tv_s P_3, Pm1_3, Pm2_3, x3, y3; + + Tv_s P_4, Pm1_4, Pm2_4, x4, y4; + + Tv_s P_5, Pm1_5, Pm2_5, x5, y5; + + Tv_s W1, W2, b, R; + ptrdiff_t l; + + + x0 = vloadu_s(xarr + 0 * VLEN_s); + Pm1_0 = vload_s(1.0); + P_0 = x0; + b = vload_s(*bl); + y0 = vmul_s(Pm1_0, b); + + x1 = vloadu_s(xarr + 1 * VLEN_s); + Pm1_1 = vload_s(1.0); + P_1 = x1; + b = vload_s(*bl); + y1 = vmul_s(Pm1_1, b); + + x2 = vloadu_s(xarr + 2 * VLEN_s); + Pm1_2 = vload_s(1.0); + P_2 = x2; + b = vload_s(*bl); + y2 = vmul_s(Pm1_2, b); + + x3 = vloadu_s(xarr + 3 * VLEN_s); + Pm1_3 = vload_s(1.0); + P_3 = x3; + b = vload_s(*bl); + y3 = vmul_s(Pm1_3, b); + + x4 = vloadu_s(xarr + 4 * VLEN_s); + Pm1_4 = vload_s(1.0); + P_4 = x4; + b = vload_s(*bl); + y4 = vmul_s(Pm1_4, b); + + x5 = vloadu_s(xarr + 5 * VLEN_s); + Pm1_5 = vload_s(1.0); + P_5 = x5; + b = vload_s(*bl); + y5 = vmul_s(Pm1_5, b); + + + b = vload_s(*(bl + 1)); + + vfmaeq_s(y0, P_0, b); + + vfmaeq_s(y1, P_1, b); + + vfmaeq_s(y2, P_2, b); + + vfmaeq_s(y3, P_3, b); + + vfmaeq_s(y4, P_4, b); + + vfmaeq_s(y5, P_5, b); + + + for (l = 2; l <= lmax; ++l) { + b = vload_s(*(bl + l)); + R = vload_s(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + + Pm2_0 = Pm1_0; Pm1_0 = P_0; + W1 = vmul_s(x0, Pm1_0); + W2 = W1; + W2 = vsub_s(W2, Pm2_0); + P_0 = W1; + vfmaeq_s(P_0, W2, R); + vfmaeq_s(y0, P_0, b); + + Pm2_1 = Pm1_1; Pm1_1 = P_1; + W1 = vmul_s(x1, Pm1_1); + W2 = W1; + W2 = vsub_s(W2, Pm2_1); + P_1 = W1; + vfmaeq_s(P_1, W2, R); + vfmaeq_s(y1, P_1, b); + + Pm2_2 = Pm1_2; Pm1_2 = P_2; + W1 = vmul_s(x2, Pm1_2); + W2 = W1; + W2 = vsub_s(W2, Pm2_2); + P_2 = W1; + vfmaeq_s(P_2, W2, R); + vfmaeq_s(y2, P_2, b); + + Pm2_3 = Pm1_3; Pm1_3 = P_3; + W1 = vmul_s(x3, Pm1_3); + W2 = W1; + W2 = vsub_s(W2, Pm2_3); + P_3 = W1; + vfmaeq_s(P_3, W2, R); + vfmaeq_s(y3, P_3, b); + + Pm2_4 = Pm1_4; Pm1_4 = P_4; + W1 = vmul_s(x4, Pm1_4); + W2 = W1; + W2 = vsub_s(W2, Pm2_4); + P_4 = W1; + vfmaeq_s(P_4, W2, R); + vfmaeq_s(y4, P_4, b); + + Pm2_5 = Pm1_5; Pm1_5 = P_5; + W1 = vmul_s(x5, Pm1_5); + W2 = W1; + W2 = vsub_s(W2, Pm2_5); + P_5 = W1; + vfmaeq_s(P_5, W2, R); + vfmaeq_s(y5, P_5, b); + + + } + + vstoreu_s(out + 0 * VLEN_s, y0); + + vstoreu_s(out + 1 * VLEN_s, y1); + + vstoreu_s(out + 2 * VLEN_s, y2); + + vstoreu_s(out + 3 * VLEN_s, y3); + + vstoreu_s(out + 4 * VLEN_s, y4); + + vstoreu_s(out + 5 * VLEN_s, y5); + +} + + + + + +void 
sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (double)(l - 1) / (double)l; + } +} + +void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = (float)(l - 1) / (float)l; + } +} + + +/* + Compute sum_l b_l P_l(x_i) for all i. + */ + +#define LEN (SHARP_LEGENDRE_CS * VLEN) +#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s) + + +void sharp_legendre_transform(double *bl, + double *recfac, + ptrdiff_t lmax, + double *x, double *out, ptrdiff_t nx) { + double xchunk[MAX_CS * VLEN], outchunk[MAX_CS * LEN]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = malloc(sizeof(double) * (lmax + 1)); + sharp_legendre_transform_recfac(recfac, lmax); + } + + for (j = 0; j != LEN; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN) { + len = (i + (LEN) <= nx) ? (LEN) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN - 1) / VLEN) { + case 6: legendre_transform_vec6(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + +void sharp_legendre_transform_s(float *bl, + float *recfac, + ptrdiff_t lmax, + float *x, float *out, ptrdiff_t nx) { + float xchunk[MAX_CS * VLEN_s], outchunk[MAX_CS * LEN_s]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = malloc(sizeof(float) * (lmax + 1)); + sharp_legendre_transform_recfac_s(recfac, lmax); + } + + for (j = 0; j != LEN_s; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN_s) { + len = (i + (LEN_s) <= nx) ? (LEN_s) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN_s - 1) / VLEN_s) { + case 6: legendre_transform_vec6_s(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5_s(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4_s(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3_s(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2_s(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1_s(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} + + +#endif \ No newline at end of file diff --git a/libsharp/sharp_legendre.c.in b/libsharp/sharp_legendre.c.in new file mode 100644 index 0000000..63c84b7 --- /dev/null +++ b/libsharp/sharp_legendre.c.in @@ -0,0 +1,176 @@ +/* + + NOTE NOTE NOTE + + This file is edited in sharp_legendre.c.in which is then preprocessed. + Do not make manual modifications to sharp_legendre.c. + + NOTE NOTE NOTE + +*/ + + +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.c.in + * + * Copyright (C) 2015 University of Oslo + * \author Dag Sverre Seljebotn + */ + +#ifndef NO_LEGENDRE +#if (VLEN==8) +#error This code is not tested with MIC; please compile with -DNO_LEGENDRE +/* ...or test it (it probably works) and remove this check */ +#endif + +#ifndef SHARP_LEGENDRE_CS +#define SHARP_LEGENDRE_CS 4 +#endif + +#define MAX_CS 6 +#if (SHARP_LEGENDRE_CS > MAX_CS) +#error (SHARP_LEGENDRE_CS > MAX_CS) +#endif + +#include "sharp_legendre.h" +#include "sharp_vecsupport.h" + +#include + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +/*{ for cs in range(1, 7) }*/ +static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax, + {{scalar}} xarr[({{cs}}) * VLEN{{T}}], + {{scalar}} out[({{cs}}) * VLEN{{T}}]) { + /*{ for i in range(cs) }*/ + Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}}; + /*{ endfor }*/ + Tv{{T}} W1, W2, b, R; + ptrdiff_t l; + + /*{ for i in range(cs) }*/ + x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}}); + Pm1_{{i}} = vload{{T}}(1.0); + P_{{i}} = x{{i}}; + b = vload{{T}}(*bl); + y{{i}} = vmul{{T}}(Pm1_{{i}}, b); + /*{ endfor }*/ + + b = vload{{T}}(*(bl + 1)); + /*{ for i in range(cs) }*/ + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + for (l = 2; l <= lmax; ++l) { + b = vload{{T}}(*(bl + l)); + R = vload{{T}}(*(recfacs + l)); + + /* + P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2) + */ + /*{ for i in range(cs) }*/ + Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}}; + W1 = vmul{{T}}(x{{i}}, Pm1_{{i}}); + W2 = W1; + W2 = vsub{{T}}(W2, Pm2_{{i}}); + P_{{i}} = W1; + vfmaeq{{T}}(P_{{i}}, W2, R); + vfmaeq{{T}}(y{{i}}, P_{{i}}, b); + /*{ endfor }*/ + + } + /*{ for i in range(cs) }*/ + vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}}); + /*{ endfor }*/ +} +/*{ endfor }*/ +/*{ endfor }*/ + + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) { + /* (l - 1) / l, for l >= 2 */ + ptrdiff_t l; + r[0] = 0; + r[1] = 1; + for (l = 2; l <= lmax; ++l) { + r[l] = ({{scalar}})(l - 1) / ({{scalar}})l; + } +} +/*{ endfor }*/ + +/* + Compute sum_l b_l P_l(x_i) for all i. 
+ */ + +#define LEN (SHARP_LEGENDRE_CS * VLEN) +#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s) + +/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/ +void sharp_legendre_transform{{T}}({{scalar}} *bl, + {{scalar}} *recfac, + ptrdiff_t lmax, + {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) { + {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}]; + int compute_recfac; + ptrdiff_t i, j, len; + + compute_recfac = (recfac == NULL); + if (compute_recfac) { + recfac = malloc(sizeof({{scalar}}) * (lmax + 1)); + sharp_legendre_transform_recfac{{T}}(recfac, lmax); + } + + for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0; + + for (i = 0; i < nx; i += LEN{{T}}) { + len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i); + for (j = 0; j != len; ++j) xchunk[j] = x[i + j]; + switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) { + case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + case 1: + case 0: + legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break; + } + for (j = 0; j != len; ++j) out[i + j] = outchunk[j]; + } + if (compute_recfac) { + free(recfac); + } +} +/*{ endfor }*/ + +#endif diff --git a/libsharp/sharp_legendre.h b/libsharp/sharp_legendre.h new file mode 100644 index 0000000..cfd8aee --- /dev/null +++ b/libsharp/sharp_legendre.h @@ -0,0 +1,62 @@ +/* + * This file is part of libsharp. + * + * Redistribution and use in source and binary forms, with or without + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file sharp_legendre.h + * Interface for the Legendre transform parts of the spherical transform library. 
+ *
+ * Copyright (C) 2015 University of Oslo
+ * \author Dag Sverre Seljebotn
+ */
+
+#ifndef SHARP_LEGENDRE_H
+#define SHARP_LEGENDRE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NO_LEGENDRE
+
+void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                              double *out, ptrdiff_t nx);
+void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                float *out, ptrdiff_t nx);
+void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
+void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_legendre_roots.c b/libsharp/sharp_legendre_roots.c
new file mode 100644
index 0000000..44d8507
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.c
@@ -0,0 +1,67 @@
+/* Function adapted from GNU GSL file glfixed.c
+   Original author: Pavel Holoborodko (http://www.holoborodko.com)
+
+   Adjustments by M. Reinecke
+    - adjusted interface (keep epsilon internal, return full number of points)
+    - removed precomputed tables
+    - tweaked Newton iteration to obtain higher accuracy */
+
+#include <math.h>
+#include "sharp_legendre_roots.h"
+#include "c_utils.h"
+
+static inline double one_minus_x2 (double x)
+  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
+
+void sharp_legendre_roots(int n, double *x, double *w)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double eps = 3e-14;
+  int m = (n+1)>>1;
+
+  double t0 = 1 - (1-1./n) / (8.*n*n);
+  double t1 = 1./(4.*n+2.);
+
+#pragma omp parallel
+{
+  int i;
+#pragma omp for schedule(dynamic,100)
+  for (i=1; i<=m; ++i)
+    {
+    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
+
+    int dobreak=0;
+    int j=0;
+    double dpdx;
+    while(1)
+      {
+      double P_1 = 1.0;
+      double P0 = x0;
+      double dx, x1;
+
+      for (int k=2; k<=n; k++)
+        {
+        double P_2 = P_1;
+        P_1 = P0;
+//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
+        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
+        }
+
+      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
+
+      /* Newton step */
+      x1 = x0 - P0/dpdx;
+      dx = x0-x1;
+      x0 = x1;
+      if (dobreak) break;
+
+      if (fabs(dx)<=eps) dobreak=1;
+      UTIL_ASSERT(++j<100,"convergence problem");
+      }
+
+    x[i-1] = -x0;
+    x[n-i] = x0;
+    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
+    }
+} // end of parallel region
+  }
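For reference, the Newton iteration in sharp_legendre_roots maps naturally onto NumPy. The following is an illustrative sketch only (legendre_roots_ref is not part of the library, and the initial guess is a simplified version of the one used above), checked against numpy.polynomial.legendre.leggauss:

import numpy as np

def legendre_roots_ref(n):
    # Loosely mirrors sharp_legendre_roots: Newton iteration on P_n(x),
    # starting from the asymptotic guess cos(pi*(4i-1)/(4n+2)); n >= 1.
    m = (n + 1) // 2
    i = np.arange(1, m + 1)
    x = np.cos(np.pi * (4 * i - 1) / (4.0 * n + 2.0))
    for _ in range(100):
        pm1, p = np.ones_like(x), x.copy()      # P_0, P_1
        for k in range(2, n + 1):               # three-term recurrence up to P_n
            pm1, p = p, x * p + (k - 1.0) / k * (x * p - pm1)
        dpdx = n * (pm1 - x * p) / (1.0 - x * x)
        dx = p / dpdx                           # Newton step toward a root of P_n
        x = x - dx
        if np.all(np.abs(dx) <= 3e-14):
            break
    w = 2.0 / ((1.0 - x * x) * dpdx * dpdx)
    xs, ws = np.empty(n), np.empty(n)           # fill in the +/- symmetric halves
    xs[:m], ws[:m] = -x, w
    xs[n - m:], ws[n - m:] = x[::-1], w[::-1]
    return xs, ws

x_ref, w_ref = legendre_roots_ref(8)
x_np, w_np = np.polynomial.legendre.leggauss(8)
assert np.allclose(x_ref, x_np) and np.allclose(w_ref, w_np)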
diff --git a/libsharp/sharp_legendre_roots.h b/libsharp/sharp_legendre_roots.h
new file mode 100644
index 0000000..2a056b2
--- /dev/null
+++ b/libsharp/sharp_legendre_roots.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of libsharp.
+ *
+ * libsharp is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * libsharp is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libsharp; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ * (DLR).
+ */
+
+/*! \file sharp_legendre_roots.h
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef SHARP_LEGENDRE_ROOTS_H
+#define SHARP_LEGENDRE_ROOTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
+    of degree \a n.
+    \param n Order of Legendre polynomial
+    \param x Array of length \a n for output (root position)
+    \param w Array of length \a n for output (weight for Gaussian quadrature)
+  */
+void sharp_legendre_roots(int n, double *x, double *w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
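The single-precision (_s) and unaligned load/store macros added to sharp_vecsupport.h below are exactly the operations the generated kernels consume. Stripped of SIMD, every legendre_transform_vecN above computes the same recurrence; here is a NumPy sketch of it (legendre_transform_ref is illustrative, not library code):

import numpy as np

def legendre_transform_ref(bl, x):
    # sum_l bl[l] * P_l(x), accumulated with the kernels' recurrence
    # P_l = x*P_{l-1} + recfac[l]*(x*P_{l-1} - P_{l-2}), recfac[l] = (l-1)/l.
    # Assumes lmax >= 1, i.e. len(bl) >= 2, like the C kernels do.
    pm1, p = np.ones_like(x), x.copy()   # P_0, P_1
    y = bl[0] * pm1 + bl[1] * p
    for l in range(2, len(bl)):
        pm1, p = p, x * p + (l - 1.0) / l * (x * p - pm1)
        y += bl[l] * p
    return y

# quick check against an explicit polynomial: P_2(x) = (3*x**2 - 1)/2
x = np.linspace(-1.0, 1.0, 5)
assert np.allclose(legendre_transform_ref(np.array([0.0, 0.0, 1.0]), x),
                   1.5 * x * x - 0.5)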
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index b26a539..ee4c5e7 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -40,23 +40,31 @@ typedef double Ts;
#if (VLEN==1)
typedef double Tv;
+typedef float Tv_s;
typedef int Tm;
#define vadd(a,b) ((a)+(b))
+#define vadd_s(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b))
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsub(a,b) ((a)-(b))
+#define vsub_s(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b))
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmul(a,b) ((a)*(b))
+#define vmul_s(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b))
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
+#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a))
#define vload(a) (a)
+#define vload_s(a) (a)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
@@ -64,6 +72,8 @@ typedef int Tm;
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
+#define vstoreu(p, a) (*(p)=a)
+#define vstoreu_s(p, a) (*(p)=a)
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
@@ -87,6 +97,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#endif
typedef __m128d Tv;
+typedef __m128 Tv_s;
typedef __m128d Tm;
#if defined(__SSE4_1__)
@@ -99,15 +110,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vone _mm_set1_pd(1.)
#define vadd(a,b) _mm_add_pd(a,b)
+#define vadd_s(a,b) _mm_add_ps(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm_sub_pd(a,b)
+#define vsub_s(a,b) _mm_sub_ps(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm_mul_pd(a,b)
+#define vmul_s(a,b) _mm_mul_ps(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
@@ -115,6 +130,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
#define vload(a) _mm_set1_pd(a)
+#define vload_s(a) _mm_set1_ps(a)
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
@@ -126,17 +142,22 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
+#define vloadu(p) _mm_loadu_pd(p)
+#define vloadu_s(p) _mm_loadu_ps(p)
+#define vstoreu(p, v) _mm_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
#endif
#if (VLEN==4)
#include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
#include <x86intrin.h>
#endif
typedef __m256d Tv;
+typedef __m256 Tv_s;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
@@ -144,21 +165,26 @@ typedef __m256d Tm;
#define vone _mm256_set1_pd(1.)
#define vadd(a,b) _mm256_add_pd(a,b)
+#define vadd_s(a,b) _mm256_add_ps(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm256_sub_pd(a,b)
+#define vsub_s(a,b) _mm256_sub_ps(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm256_mul_pd(a,b)
+#define vmul_s(a,b) _mm256_mul_ps(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#ifdef __FMA4__
+#if (USE_FMA4)
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
+#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@@ -167,6 +193,7 @@ typedef __m256d Tm;
#endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a)
+#define vload_s(a) _mm256_set1_ps(a)
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@@ -179,6 +206,11 @@ typedef __m256d Tm;
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
+#define vloadu(p) _mm256_loadu_pd(p)
+#define vloadu_s(p) _mm256_loadu_ps(p)
+#define vstoreu(p, v) _mm256_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
+
#endif #if (VLEN==8) diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index 16bfa13..f6161ca 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -32,6 +32,8 @@ #ifndef SHARP_VECUTIL_H #define SHARP_VECUTIL_H +#ifndef VLEN + #if (defined (__MIC__)) #define VLEN 8 #elif (defined (__AVX__)) @@ -43,3 +45,19 @@ #endif #endif + +#if (VLEN==1) +#define VLEN_s 1 +#else +#define VLEN_s (2*VLEN) +#endif + +#ifndef USE_FMA4 +#ifdef __FMA4__ +#define USE_FMA4 1 +#else +#define USE_FMA4 0 +#endif +#endif + +#endif diff --git a/python/fake_pyrex/Pyrex/Distutils/__init__.py b/python/fake_pyrex/Pyrex/Distutils/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/Pyrex/Distutils/build_ext.py b/python/fake_pyrex/Pyrex/Distutils/build_ext.py new file mode 100644 index 0000000..4f846f6 --- /dev/null +++ b/python/fake_pyrex/Pyrex/Distutils/build_ext.py @@ -0,0 +1 @@ +build_ext = "yes, it's there!" diff --git a/python/fake_pyrex/Pyrex/__init__.py b/python/fake_pyrex/Pyrex/__init__.py new file mode 100644 index 0000000..51c8e16 --- /dev/null +++ b/python/fake_pyrex/Pyrex/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/python/fake_pyrex/README b/python/fake_pyrex/README new file mode 100644 index 0000000..cf3f3ff --- /dev/null +++ b/python/fake_pyrex/README @@ -0,0 +1,2 @@ +This directory is here to fool setuptools into building .pyx files +even if Pyrex is not installed. See ../setup.py. \ No newline at end of file diff --git a/python/libsharp/__init__.py b/python/libsharp/__init__.py new file mode 100644 index 0000000..dd0fa41 --- /dev/null +++ b/python/libsharp/__init__.py @@ -0,0 +1 @@ +from .libsharp import * diff --git a/python/libsharp/libsharp.pyx b/python/libsharp/libsharp.pyx new file mode 100644 index 0000000..9e83d3d --- /dev/null +++ b/python/libsharp/libsharp.pyx @@ -0,0 +1,56 @@ +import numpy as np + +__all__ = ['legendre_transform', 'legendre_roots'] + +cdef extern from "sharp.h": + ctypedef long ptrdiff_t + + void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x, + float *out, ptrdiff_t nx) + void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x, + double *out, ptrdiff_t nx) + void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax) + void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax) + void sharp_legendre_roots(int n, double *x, double *w) + + +def legendre_transform(x, bl, out=None): + if out is None: + out = np.empty_like(x) + if out.shape[0] == 0: + return out + elif x.dtype == np.float64: + if bl.dtype != np.float64: + bl = bl.astype(np.float64) + return _legendre_transform(x, bl, out=out) + elif x.dtype == np.float32: + if bl.dtype != np.float32: + bl = bl.astype(np.float32) + return _legendre_transform_s(x, bl, out=out) + else: + raise ValueError("unsupported dtype") + + +def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out): + if out.shape[0] != x.shape[0]: + raise ValueError('x and out must have same shape') + sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0]) + return np.asarray(out) + + +def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out): + if out.shape[0] != x.shape[0]: + raise ValueError('x and out must have same shape') + sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], 
x.shape[0])
+    return np.asarray(out)
+
+
+def legendre_roots(n):
+    x = np.empty(n, np.double)
+    w = np.empty(n, np.double)
+    cdef double[::1] x_buf = x, w_buf = w
+    if not (x_buf.shape[0] == w_buf.shape[0] == n):
+        raise AssertionError()
+    if n > 0:
+        sharp_legendre_roots(n, &x_buf[0], &w_buf[0])
+    return x, w
diff --git a/python/libsharp/tests/__init__.py b/python/libsharp/tests/__init__.py
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/python/libsharp/tests/__init__.py
@@ -0,0 +1 @@
+# empty
diff --git a/python/libsharp/tests/test_legendre.py b/python/libsharp/tests/test_legendre.py
new file mode 100644
index 0000000..7860d2a
--- /dev/null
+++ b/python/libsharp/tests/test_legendre.py
@@ -0,0 +1,59 @@
+import numpy as np
+from scipy.special import legendre
+from scipy.special import p_roots
+import libsharp
+
+from numpy.testing import assert_allclose
+
+
+def check_legendre_transform(lmax, ntheta):
+    l = np.arange(lmax + 1)
+    if lmax >= 1:
+        sigma = -np.log(1e-3) / lmax / (lmax + 1)
+        bl = np.exp(-sigma*l*(l+1))
+        bl *= (2 * l + 1)
+    else:
+        bl = np.asarray([1], dtype=np.double)
+
+    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
+    x = np.cos(theta)
+
+    # Compute truth using scipy.special.legendre
+    P = np.zeros((ntheta, lmax + 1))
+    for l in range(lmax + 1):
+        P[:, l] = legendre(l)(x)
+    y0 = np.dot(P, bl)
+
+
+    # double-precision
+    y = libsharp.legendre_transform(x, bl)
+
+    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)
+
+    # single-precision
+    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
+    assert_allclose(y32, y0, rtol=1e-5, atol=1e-5)
+
+
+def test_legendre_transform():
+    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
+    for ntheta in nthetas_to_try:
+        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
+            yield check_legendre_transform, lmax, ntheta
+
+def check_legendre_roots(n):
+    xs, ws = ([], []) if n == 0 else p_roots(n)  # from SciPy
+    xl, wl = libsharp.legendre_roots(n)
+    assert_allclose(xs, xl)
+    assert_allclose(ws, wl)
+
+def test_legendre_roots():
+    """
+    Test the Legendre root-finding algorithm from libsharp by comparing it with
+    the SciPy version.
+    """
+    yield check_legendre_roots, 0
+    yield check_legendre_roots, 1
+    yield check_legendre_roots, 32
+    yield check_legendre_roots, 33
+    yield check_legendre_roots, 128
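One note before setup.py: the fake_pyrex trick works because setuptools of this era decides whether to keep .pyx sources by probing for Pyrex.Distutils.build_ext; if that import fails, it silently swaps each .pyx source for the corresponding .c file name. A hypothetical sketch of the probe being satisfied (illustrative only, not part of the build):

import os, sys

# With fake_pyrex appended to sys.path, this import resolves to the stub
# package and succeeds, so setuptools keeps the .pyx sources; the actual
# compilation is done by Cython's build_ext, registered in cmdclass below.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fake_pyrex'))
from Pyrex.Distutils.build_ext import build_ext  # the "yes, it's there!" stub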
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..1500f05
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,76 @@
+#! /usr/bin/env python
+
+descr = """Spherical Harmonic transforms package
+
+Python API for the libsharp spherical harmonic transforms library
+"""
+
+import os
+import sys
+
+DISTNAME = 'libsharp'
+DESCRIPTION = 'libsharp library for fast Spherical Harmonic Transforms'
+LONG_DESCRIPTION = descr
+MAINTAINER = 'Dag Sverre Seljebotn'
+MAINTAINER_EMAIL = 'd.s.seljebotn@astro.uio.no'
+URL = 'http://sourceforge.net/projects/libsharp/'
+LICENSE = 'GPL'
+DOWNLOAD_URL = "http://sourceforge.net/projects/libsharp/"
+VERSION = '0.1'
+
+# Add our fake Pyrex at the end of the Python search path
+# in order to fool setuptools into allowing compilation of
+# pyx files to C files. Importing Cython.Distutils then
+# makes Cython the tool of choice for this rather than
+# (the possibly nonexistent) Pyrex.
+project_path = os.path.split(__file__)[0]
+sys.path.append(os.path.join(project_path, 'fake_pyrex'))
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+import numpy as np
+
+libsharp = os.environ.get('LIBSHARP', None)
+libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
+libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
+
+if libsharp_include is None or libsharp_lib is None:
+    sys.stderr.write('Please set the LIBSHARP environment variable to the install directory of '
+                     'libsharp; this script will refer to its lib and include sub-directories. '
+                     'Alternatively, set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
+    sys.exit(1)
+
+if __name__ == "__main__":
+    setup(install_requires = ['numpy'],
+          packages = find_packages(),
+          test_suite="nose.collector",
+          # Well, technically zipping the package will work, but since it's
+          # all compiled code it'll just get unzipped again at runtime, which
+          # is pointless:
+          zip_safe = False,
+          name = DISTNAME,
+          version = VERSION,
+          maintainer = MAINTAINER,
+          maintainer_email = MAINTAINER_EMAIL,
+          description = DESCRIPTION,
+          license = LICENSE,
+          url = URL,
+          download_url = DOWNLOAD_URL,
+          long_description = LONG_DESCRIPTION,
+          classifiers =
+            [ 'Development Status :: 3 - Alpha',
+              'Environment :: Console',
+              'Intended Audience :: Developers',
+              'Intended Audience :: Science/Research',
+              'License :: OSI Approved :: GNU General Public License (GPL)',
+              'Topic :: Scientific/Engineering'],
+          cmdclass = {"build_ext": build_ext},
+          ext_modules = [
+            Extension("libsharp.libsharp",
+                      ["libsharp/libsharp.pyx"],
+                      libraries=["sharp", "fftpack", "c_utils"],
+                      include_dirs=[libsharp_include],
+                      library_dirs=[libsharp_lib]
+                      ),
+            ],
+          )
diff --git a/runjinja.py b/runjinja.py
new file mode 100755
index 0000000..fd659fa
--- /dev/null
+++ b/runjinja.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+"""
+Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT.
+"""
+
+import sys
+import hashlib
+from jinja2 import Template, Environment
+
+env = Environment(block_start_string='/*{',
+                  block_end_string='}*/',
+                  variable_start_string='{{',
+                  variable_end_string='}}')
+
+extra_vars = dict(len=len)
+input = sys.stdin.read()
+sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input).hexdigest())
+sys.stdout.write(env.from_string(input).render(**extra_vars))
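With the extension built (for example through the new pytest target in the top-level Makefile), the bindings can be exercised end to end. A small usage sketch, assuming the libsharp package built by python/setup.py is importable:

import numpy as np
import libsharp

# Gauss-Legendre quadrature: an n-point rule integrates polynomials of
# degree <= 2n-1 exactly; here, the integral of x**4 over [-1, 1] is 2/5.
nodes, weights = libsharp.legendre_roots(3)
assert np.isclose(np.sum(weights * nodes**4), 2.0 / 5.0)

# Legendre transform at the quadrature nodes: with bl = [0, 0, 1] the result
# is just P_2(x) = (3*x**2 - 1)/2 evaluated at the roots of P_3.
bl = np.array([0.0, 0.0, 1.0])
y = libsharp.legendre_transform(nodes, bl)
assert np.allclose(y, 1.5 * nodes**2 - 0.5)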