Compare commits
6 commits
Author | SHA1 | Date | |
---|---|---|---|
|
b27ce30cde | ||
|
f8a9c96acf | ||
|
75559e6894 | ||
|
54cbae0534 | ||
|
0f8051fc28 | ||
|
0378ce155a |
19 changed files with 382 additions and 2817 deletions
2
COMPILE
2
COMPILE
|
@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
|
||||||
configure command line.
|
configure command line.
|
||||||
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
|
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
|
||||||
(not physical cores) of the system.
|
(not physical cores) of the system.
|
||||||
(Usually this is already the default setting when OMP_NUM_THREADS is not
|
(Usually this is already the default setting when OMP_NUM_THREADS is not
|
||||||
specified.)
|
specified.)
|
||||||
|
|
||||||
|
|
||||||
|
|
44
Makefile.am
44
Makefile.am
|
@ -3,16 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
|
||||||
lib_LTLIBRARIES = libsharp2.la
|
lib_LTLIBRARIES = libsharp2.la
|
||||||
|
|
||||||
libsharp2_la_SOURCES = \
|
libsharp2_la_SOURCES = \
|
||||||
libsharp2/pocketfft.c \
|
libsharp2/sharp_utils.cc \
|
||||||
libsharp2/pocketfft.h \
|
|
||||||
libsharp2/sharp_utils.c \
|
|
||||||
libsharp2/sharp_utils.h \
|
libsharp2/sharp_utils.h \
|
||||||
libsharp2/sharp.c \
|
libsharp2/sharp.cc \
|
||||||
libsharp2/sharp_almhelpers.c \
|
libsharp2/sharp_almhelpers.cc \
|
||||||
libsharp2/sharp_core.c \
|
libsharp2/sharp_core.cc \
|
||||||
libsharp2/sharp_geomhelpers.c \
|
libsharp2/sharp_geomhelpers.cc \
|
||||||
libsharp2/sharp_legendre_roots.c \
|
libsharp2/sharp_legendre_roots.cc \
|
||||||
libsharp2/sharp_ylmgen_c.c \
|
libsharp2/sharp_ylmgen_c.cc \
|
||||||
libsharp2/sharp_internal.h \
|
libsharp2/sharp_internal.h \
|
||||||
libsharp2/sharp_legendre_roots.h \
|
libsharp2/sharp_legendre_roots.h \
|
||||||
libsharp2/sharp_vecsupport.h \
|
libsharp2/sharp_vecsupport.h \
|
||||||
|
@ -24,25 +22,25 @@ libsharp2_la_SOURCES = \
|
||||||
# any backward-compatible change: increase age
|
# any backward-compatible change: increase age
|
||||||
# any backward-incompatible change: age=0
|
# any backward-incompatible change: age=0
|
||||||
# ==> age <= current
|
# ==> age <= current
|
||||||
libsharp2_la_LDFLAGS = -version-info 0:0:0
|
libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread
|
||||||
|
|
||||||
AM_CFLAGS = @AM_CFLAGS@
|
AM_CXXFLAGS = @AM_CXXFLAGS@
|
||||||
|
|
||||||
if HAVE_MULTIARCH
|
if HAVE_MULTIARCH
|
||||||
|
|
||||||
libavx_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libfma_la_SOURCES = libsharp2/sharp_core_inc.c
|
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
|
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
|
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
|
||||||
|
|
||||||
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
|
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
|
||||||
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
|
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
|
||||||
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
|
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
|
||||||
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
|
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
|
||||||
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
|
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
|
||||||
|
|
||||||
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
|
||||||
|
|
||||||
|
@ -56,10 +54,10 @@ nobase_include_HEADERS = \
|
||||||
libsharp2/sharp_cxx.h
|
libsharp2/sharp_cxx.h
|
||||||
|
|
||||||
EXTRA_DIST = \
|
EXTRA_DIST = \
|
||||||
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
|
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
|
||||||
|
|
||||||
check_PROGRAMS = sharp2_testsuite
|
check_PROGRAMS = sharp2_testsuite
|
||||||
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
|
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
|
||||||
sharp2_testsuite_LDADD = libsharp2.la
|
sharp2_testsuite_LDADD = libsharp2.la
|
||||||
|
|
||||||
TESTS = runtest.sh
|
TESTS = runtest.sh
|
||||||
|
|
15
configure.ac
15
configure.ac
|
@ -20,28 +20,21 @@ dnl
|
||||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
||||||
|
|
||||||
|
|
||||||
AC_PROG_CC_C99
|
AC_PROG_CXX
|
||||||
AC_OPENMP
|
AX_CXX_COMPILE_STDCXX([11])
|
||||||
|
|
||||||
# add math library
|
|
||||||
LIBS="-lm"
|
|
||||||
|
|
||||||
AC_PROG_LIBTOOL
|
AC_PROG_LIBTOOL
|
||||||
|
|
||||||
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
|
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
|
||||||
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
|
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
|
||||||
|
|
||||||
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
|
|
||||||
|
|
||||||
PACKAGE_LIBS="-lsharp2"
|
PACKAGE_LIBS="-lsharp2"
|
||||||
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
|
|
||||||
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
|
|
||||||
|
|
||||||
dnl
|
dnl
|
||||||
dnl Create pkgconfig .pc file.
|
dnl Create pkgconfig .pc file.
|
||||||
dnl
|
dnl
|
||||||
AX_CREATE_PKGCONFIG_INFO(,,,,[])
|
AX_CREATE_PKGCONFIG_INFO(,,,,[])
|
||||||
AC_SUBST([AM_CFLAGS])
|
AC_SUBST([AM_CXXFLAGS])
|
||||||
|
|
||||||
AC_CONFIG_FILES([Makefile])
|
AC_CONFIG_FILES([Makefile])
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,50 +0,0 @@
|
||||||
/*
|
|
||||||
* This file is part of pocketfft.
|
|
||||||
* Licensed under a 3-clause BSD style license - see LICENSE.md
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*! \file pocketfft.h
|
|
||||||
* Public interface of the pocketfft library
|
|
||||||
*
|
|
||||||
* Copyright (C) 2008-2019 Max-Planck-Society
|
|
||||||
* \author Martin Reinecke
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef POCKETFFT_H
|
|
||||||
#define POCKETFFT_H
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct pocketfft_plan_c_i;
|
|
||||||
typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
|
|
||||||
pocketfft_plan_c pocketfft_make_plan_c (size_t length);
|
|
||||||
void pocketfft_delete_plan_c (pocketfft_plan_c plan);
|
|
||||||
int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
|
|
||||||
int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
|
|
||||||
size_t pocketfft_length_c(pocketfft_plan_c plan);
|
|
||||||
|
|
||||||
struct pocketfft_plan_r_i;
|
|
||||||
typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
|
|
||||||
pocketfft_plan_r pocketfft_make_plan_r (size_t length);
|
|
||||||
void pocketfft_delete_plan_r (pocketfft_plan_r plan);
|
|
||||||
/*! Computes a real backward FFT on \a c, using \a plan
|
|
||||||
and assuming the FFTPACK storage scheme:
|
|
||||||
- on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
|
|
||||||
- on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
|
|
||||||
int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
|
|
||||||
/*! Computes a real forward FFT on \a c, using \a plan
|
|
||||||
and assuming the FFTPACK storage scheme:
|
|
||||||
- on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
|
|
||||||
- on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
|
|
||||||
int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
|
|
||||||
size_t pocketfft_length_r(pocketfft_plan_r plan);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -25,17 +25,20 @@
|
||||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <math.h>
|
#include <cmath>
|
||||||
#include <string.h>
|
#include <cstring>
|
||||||
#include "libsharp2/pocketfft.h"
|
#include <atomic>
|
||||||
|
#include "mr_util/fft.h"
|
||||||
#include "libsharp2/sharp_ylmgen_c.h"
|
#include "libsharp2/sharp_ylmgen_c.h"
|
||||||
#include "libsharp2/sharp_internal.h"
|
#include "libsharp2/sharp_internal.h"
|
||||||
#include "libsharp2/sharp_utils.h"
|
#include "libsharp2/sharp_utils.h"
|
||||||
#include "libsharp2/sharp_almhelpers.h"
|
#include "libsharp2/sharp_almhelpers.h"
|
||||||
#include "libsharp2/sharp_geomhelpers.h"
|
#include "libsharp2/sharp_geomhelpers.h"
|
||||||
|
#include "mr_util/threading.h"
|
||||||
|
#include "mr_util/useful_macros.h"
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
typedef complex float fcmplx;
|
typedef complex<float> fcmplx;
|
||||||
|
|
||||||
static const double sqrt_one_half = 0.707106781186547572737310929369;
|
static const double sqrt_one_half = 0.707106781186547572737310929369;
|
||||||
static const double sqrt_two = 1.414213562373095145474621858739;
|
static const double sqrt_two = 1.414213562373095145474621858739;
|
||||||
|
@ -57,7 +60,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
|
||||||
*nchunks = (ndata+(*chunksize)-1)/(*chunksize);
|
*nchunks = (ndata+(*chunksize)-1)/(*chunksize);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
|
MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
|
||||||
{
|
{
|
||||||
double ofs=lmax*0.01;
|
double ofs=lmax*0.01;
|
||||||
if (ofs<100.) ofs=100.;
|
if (ofs<100.) ofs=100.;
|
||||||
|
@ -76,7 +79,7 @@ typedef struct
|
||||||
double phi0_;
|
double phi0_;
|
||||||
dcmplx *shiftarr;
|
dcmplx *shiftarr;
|
||||||
int s_shift;
|
int s_shift;
|
||||||
pocketfft_plan_r plan;
|
mr::detail_fft::rfftp<double> *plan;
|
||||||
int length;
|
int length;
|
||||||
int norot;
|
int norot;
|
||||||
} ringhelper;
|
} ringhelper;
|
||||||
|
@ -89,12 +92,12 @@ static void ringhelper_init (ringhelper *self)
|
||||||
|
|
||||||
static void ringhelper_destroy (ringhelper *self)
|
static void ringhelper_destroy (ringhelper *self)
|
||||||
{
|
{
|
||||||
if (self->plan) pocketfft_delete_plan_r(self->plan);
|
delete self->plan;
|
||||||
DEALLOC(self->shiftarr);
|
DEALLOC(self->shiftarr);
|
||||||
ringhelper_init(self);
|
ringhelper_init(self);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
|
MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
|
||||||
{
|
{
|
||||||
self->norot = (fabs(phi0)<1e-14);
|
self->norot = (fabs(phi0)<1e-14);
|
||||||
if (!(self->norot))
|
if (!(self->norot))
|
||||||
|
@ -105,27 +108,27 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
|
||||||
self->phi0_ = phi0;
|
self->phi0_ = phi0;
|
||||||
// FIXME: improve this by using sincos2pibyn(nph) etc.
|
// FIXME: improve this by using sincos2pibyn(nph) etc.
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
|
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
|
||||||
// double *tmp=(double *) self->shiftarr;
|
// double *tmp=(double *) self->shiftarr;
|
||||||
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
|
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
|
||||||
}
|
}
|
||||||
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
|
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
|
||||||
if (nph!=(int)self->length)
|
if (nph!=(int)self->length)
|
||||||
{
|
{
|
||||||
if (self->plan) pocketfft_delete_plan_r(self->plan);
|
if (self->plan) delete self->plan;
|
||||||
self->plan=pocketfft_make_plan_r(nph);
|
self->plan=new mr::detail_fft::rfftp<double>(nph);
|
||||||
self->length=nph;
|
self->length=nph;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ringinfo_compare (const void *xa, const void *xb)
|
static int ringinfo_compare (const void *xa, const void *xb)
|
||||||
{
|
{
|
||||||
const sharp_ringinfo *a=xa, *b=xb;
|
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
|
||||||
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
|
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
|
||||||
}
|
}
|
||||||
static int ringpair_compare (const void *xa, const void *xb)
|
static int ringpair_compare (const void *xa, const void *xb)
|
||||||
{
|
{
|
||||||
const sharp_ringpair *a=xa, *b=xb;
|
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
|
||||||
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
|
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
|
||||||
if (a->r1.nph==b->r1.nph)
|
if (a->r1.nph==b->r1.nph)
|
||||||
return (a->r1.phi0 < b->r1.phi0) ? -1 :
|
return (a->r1.phi0 < b->r1.phi0) ? -1 :
|
||||||
|
@ -275,7 +278,7 @@ static int sharp_get_mmax (int *mval, int nm)
|
||||||
return nm-1;
|
return nm-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
||||||
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
|
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
|
||||||
int pstride, int flags)
|
int pstride, int flags)
|
||||||
{
|
{
|
||||||
|
@ -292,22 +295,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
||||||
if (self->norot)
|
if (self->norot)
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
{
|
{
|
||||||
data[2*m]=creal(phase[m*pstride])*wgt;
|
data[2*m]=phase[m*pstride].real()*wgt;
|
||||||
data[2*m+1]=cimag(phase[m*pstride])*wgt;
|
data[2*m+1]=phase[m*pstride].imag()*wgt;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
{
|
{
|
||||||
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
|
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
|
||||||
data[2*m]=creal(tmp)*wgt;
|
data[2*m]=tmp.real()*wgt;
|
||||||
data[2*m+1]=cimag(tmp)*wgt;
|
data[2*m+1]=tmp.imag()*wgt;
|
||||||
}
|
}
|
||||||
for (int m=2*(mmax+1); m<nph+2; ++m)
|
for (int m=2*(mmax+1); m<nph+2; ++m)
|
||||||
data[m]=0.;
|
data[m]=0.;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
data[0]=creal(phase[0])*wgt;
|
data[0]=phase[0].real()*wgt;
|
||||||
SET_ARRAY(data,1,nph+2,0.);
|
SET_ARRAY(data,1,nph+2,0.);
|
||||||
|
|
||||||
int idx1=1, idx2=nph-1;
|
int idx1=1, idx2=nph-1;
|
||||||
|
@ -317,23 +320,23 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
|
||||||
if(!self->norot) tmp*=self->shiftarr[m];
|
if(!self->norot) tmp*=self->shiftarr[m];
|
||||||
if (idx1<(nph+2)/2)
|
if (idx1<(nph+2)/2)
|
||||||
{
|
{
|
||||||
data[2*idx1]+=creal(tmp);
|
data[2*idx1]+=tmp.real();
|
||||||
data[2*idx1+1]+=cimag(tmp);
|
data[2*idx1+1]+=tmp.imag();
|
||||||
}
|
}
|
||||||
if (idx2<(nph+2)/2)
|
if (idx2<(nph+2)/2)
|
||||||
{
|
{
|
||||||
data[2*idx2]+=creal(tmp);
|
data[2*idx2]+=tmp.real();
|
||||||
data[2*idx2+1]-=cimag(tmp);
|
data[2*idx2+1]-=tmp.imag();
|
||||||
}
|
}
|
||||||
if (++idx1>=nph) idx1=0;
|
if (++idx1>=nph) idx1=0;
|
||||||
if (--idx2<0) idx2=nph-1;
|
if (--idx2<0) idx2=nph-1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
data[1]=data[0];
|
data[1]=data[0];
|
||||||
pocketfft_backward_r (self->plan, &(data[1]), 1.);
|
self->plan->exec(&(data[1]), 1., false);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
|
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
|
||||||
int pstride, int flags)
|
int pstride, int flags)
|
||||||
{
|
{
|
||||||
|
@ -349,7 +352,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
if (flags&SHARP_REAL_HARMONICS)
|
if (flags&SHARP_REAL_HARMONICS)
|
||||||
wgt *= sqrt_two;
|
wgt *= sqrt_two;
|
||||||
|
|
||||||
pocketfft_forward_r (self->plan, &(data[1]), 1.);
|
self->plan->exec (&(data[1]), 1., true);
|
||||||
data[0]=data[1];
|
data[0]=data[1];
|
||||||
data[1]=data[nph+1]=0.;
|
data[1]=data[nph+1]=0.;
|
||||||
|
|
||||||
|
@ -357,11 +360,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
{
|
{
|
||||||
if (self->norot)
|
if (self->norot)
|
||||||
for (int m=0; m<=maxidx; ++m)
|
for (int m=0; m<=maxidx; ++m)
|
||||||
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
|
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
|
||||||
else
|
else
|
||||||
for (int m=0; m<=maxidx; ++m)
|
for (int m=0; m<=maxidx; ++m)
|
||||||
phase[m*pstride] =
|
phase[m*pstride] =
|
||||||
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
|
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -370,9 +373,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
int idx=m%nph;
|
int idx=m%nph;
|
||||||
dcmplx val;
|
dcmplx val;
|
||||||
if (idx<(nph-idx))
|
if (idx<(nph-idx))
|
||||||
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
|
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
|
||||||
else
|
else
|
||||||
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
|
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
|
||||||
if (!self->norot)
|
if (!self->norot)
|
||||||
val *= self->shiftarr[m];
|
val *= self->shiftarr[m];
|
||||||
phase[m*pstride]=val;
|
phase[m*pstride]=val;
|
||||||
|
@ -383,7 +386,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
|
||||||
phase[m*pstride]=0.;
|
phase[m*pstride]=0.;
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
|
MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
|
||||||
int flags)
|
int flags)
|
||||||
{
|
{
|
||||||
if (flags & SHARP_NO_FFT)
|
if (flags & SHARP_NO_FFT)
|
||||||
|
@ -440,7 +443,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
|
MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
|
||||||
int flags)
|
int flags)
|
||||||
{
|
{
|
||||||
#define CLEARLOOP(real_t,body) \
|
#define CLEARLOOP(real_t,body) \
|
||||||
|
@ -477,7 +480,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void init_output (sharp_job *job)
|
MRUTIL_NOINLINE static void init_output (sharp_job *job)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_ADD) return;
|
if (job->flags&SHARP_ADD) return;
|
||||||
if (job->type == SHARP_MAP2ALM)
|
if (job->type == SHARP_MAP2ALM)
|
||||||
|
@ -488,7 +491,7 @@ NOINLINE static void init_output (sharp_job *job)
|
||||||
clear_map (job->ginfo,job->map[i],job->flags);
|
clear_map (job->ginfo,job->map[i],job->flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
|
MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
|
||||||
{
|
{
|
||||||
if (job->type==SHARP_MAP2ALM)
|
if (job->type==SHARP_MAP2ALM)
|
||||||
{
|
{
|
||||||
|
@ -514,7 +517,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
|
||||||
static void dealloc_almtmp (sharp_job *job)
|
static void dealloc_almtmp (sharp_job *job)
|
||||||
{ DEALLOC(job->almtmp); }
|
{ DEALLOC(job->almtmp); }
|
||||||
|
|
||||||
NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
||||||
{
|
{
|
||||||
|
|
||||||
#define COPY_LOOP(real_t, source_t, expr_of_x) \
|
#define COPY_LOOP(real_t, source_t, expr_of_x) \
|
||||||
|
@ -577,7 +580,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
|
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, fcmplx, x*job->norm_l[l])
|
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -588,7 +591,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
||||||
#undef COPY_LOOP
|
#undef COPY_LOOP
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
{
|
{
|
||||||
|
|
||||||
#define COPY_LOOP(real_t, target_t, expr_of_x) \
|
#define COPY_LOOP(real_t, target_t, expr_of_x) \
|
||||||
|
@ -617,9 +620,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
if (m==0)
|
if (m==0)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, double, creal(x)*norm_m0)
|
COPY_LOOP(double, double, x.real()*norm_m0)
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, float, crealf(x)*norm_m0)
|
COPY_LOOP(float, float, x.real()*norm_m0)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -634,9 +637,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
if (m==0)
|
if (m==0)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_DP)
|
if (job->flags&SHARP_DP)
|
||||||
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
|
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
|
||||||
else
|
else
|
||||||
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
|
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -650,7 +653,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
|
||||||
#undef COPY_LOOP
|
#undef COPY_LOOP
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
|
MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
|
||||||
const double *ringtmp, int rstride)
|
const double *ringtmp, int rstride)
|
||||||
{
|
{
|
||||||
if (job->flags & SHARP_DP)
|
if (job->flags & SHARP_DP)
|
||||||
|
@ -658,8 +661,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
|
||||||
double **dmap = (double **)job->map;
|
double **dmap = (double **)job->map;
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
{
|
{
|
||||||
double *restrict p1=&dmap[i][ri->ofs];
|
double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
|
||||||
const double *restrict p2=&ringtmp[i*rstride+1];
|
const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
|
||||||
if (ri->stride==1)
|
if (ri->stride==1)
|
||||||
{
|
{
|
||||||
if (job->flags&SHARP_ADD)
|
if (job->flags&SHARP_ADD)
|
||||||
|
@ -682,14 +685,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
|
MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
|
||||||
double *ringtmp, int rstride)
|
double *ringtmp, int rstride)
|
||||||
{
|
{
|
||||||
if (job->flags & SHARP_DP)
|
if (job->flags & SHARP_DP)
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
{
|
{
|
||||||
double *restrict p1=&ringtmp[i*rstride+1],
|
double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
|
||||||
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
|
*MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
|
||||||
if (ri->stride==1)
|
if (ri->stride==1)
|
||||||
memcpy(p1,p2,ri->nph*sizeof(double));
|
memcpy(p1,p2,ri->nph*sizeof(double));
|
||||||
else
|
else
|
||||||
|
@ -721,7 +724,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
||||||
for (int m=0; m<=mmax; ++m)
|
for (int m=0; m<=mmax; ++m)
|
||||||
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
|
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
|
||||||
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
|
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
|
||||||
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
|
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
||||||
|
@ -743,7 +746,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
|
||||||
}
|
}
|
||||||
|
|
||||||
//FIXME: set phase to zero if not SHARP_MAP2ALM?
|
//FIXME: set phase to zero if not SHARP_MAP2ALM?
|
||||||
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
|
MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
|
||||||
{
|
{
|
||||||
if (job->type != SHARP_MAP2ALM) return;
|
if (job->type != SHARP_MAP2ALM) return;
|
||||||
int pstride = job->s_m;
|
int pstride = job->s_m;
|
||||||
|
@ -760,35 +763,35 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
#pragma omp parallel
|
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
|
||||||
{
|
|
||||||
ringhelper helper;
|
|
||||||
ringhelper_init(&helper);
|
|
||||||
int rstride=job->ginfo->nphmax+2;
|
|
||||||
double *ringtmp=RALLOC(double,job->nmaps*rstride);
|
|
||||||
#pragma omp for schedule(dynamic,1)
|
|
||||||
for (int ith=llim; ith<ulim; ++ith)
|
|
||||||
{
|
{
|
||||||
int dim2 = job->s_th*(ith-llim);
|
ringhelper helper;
|
||||||
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
|
ringhelper_init(&helper);
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
int rstride=job->ginfo->nphmax+2;
|
||||||
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
|
double *ringtmp=RALLOC(double,job->nmaps*rstride);
|
||||||
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
|
|
||||||
if (job->ginfo->pair[ith].r2.nph>0)
|
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
|
||||||
{
|
{
|
||||||
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
|
int dim2 = job->s_th*(ith-llim);
|
||||||
|
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
|
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
|
||||||
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
|
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
|
||||||
|
if (job->ginfo->pair[ith].r2.nph>0)
|
||||||
|
{
|
||||||
|
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
|
||||||
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
|
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
|
||||||
|
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
DEALLOC(ringtmp);
|
||||||
DEALLOC(ringtmp);
|
ringhelper_destroy(&helper);
|
||||||
ringhelper_destroy(&helper);
|
}); /* end of parallel region */
|
||||||
} /* end of parallel region */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
|
MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
|
||||||
{
|
{
|
||||||
if (job->type == SHARP_MAP2ALM) return;
|
if (job->type == SHARP_MAP2ALM) return;
|
||||||
int pstride = job->s_m;
|
int pstride = job->s_m;
|
||||||
|
@ -805,35 +808,35 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
#pragma omp parallel
|
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
|
||||||
{
|
|
||||||
ringhelper helper;
|
|
||||||
ringhelper_init(&helper);
|
|
||||||
int rstride=job->ginfo->nphmax+2;
|
|
||||||
double *ringtmp=RALLOC(double,job->nmaps*rstride);
|
|
||||||
#pragma omp for schedule(dynamic,1)
|
|
||||||
for (int ith=llim; ith<ulim; ++ith)
|
|
||||||
{
|
{
|
||||||
int dim2 = job->s_th*(ith-llim);
|
ringhelper helper;
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
ringhelper_init(&helper);
|
||||||
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
|
int rstride=job->ginfo->nphmax+2;
|
||||||
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
|
double *ringtmp=RALLOC(double,job->nmaps*rstride);
|
||||||
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
|
|
||||||
if (job->ginfo->pair[ith].r2.nph>0)
|
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
|
||||||
{
|
{
|
||||||
|
int dim2 = job->s_th*(ith-llim);
|
||||||
for (int i=0; i<job->nmaps; ++i)
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
|
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
|
||||||
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
|
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
|
||||||
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
|
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
|
||||||
|
if (job->ginfo->pair[ith].r2.nph>0)
|
||||||
|
{
|
||||||
|
for (int i=0; i<job->nmaps; ++i)
|
||||||
|
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
|
||||||
|
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
|
||||||
|
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
DEALLOC(ringtmp);
|
||||||
DEALLOC(ringtmp);
|
ringhelper_destroy(&helper);
|
||||||
ringhelper_destroy(&helper);
|
}); /* end of parallel region */
|
||||||
} /* end of parallel region */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void sharp_execute_job (sharp_job *job)
|
MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
|
||||||
{
|
{
|
||||||
double timer=sharp_wallTime();
|
double timer=sharp_wallTime();
|
||||||
job->opcnt=0;
|
job->opcnt=0;
|
||||||
|
@ -852,6 +855,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
|
||||||
&nchunks,&chunksize);
|
&nchunks,&chunksize);
|
||||||
//FIXME: needs to be changed to "nm"
|
//FIXME: needs to be changed to "nm"
|
||||||
alloc_phase (job,mmax+1,chunksize);
|
alloc_phase (job,mmax+1,chunksize);
|
||||||
|
std::atomic<size_t> opcnt = 0;
|
||||||
|
|
||||||
/* chunk loop */
|
/* chunk loop */
|
||||||
for (int chunk=0; chunk<nchunks; ++chunk)
|
for (int chunk=0; chunk<nchunks; ++chunk)
|
||||||
|
@ -871,32 +875,30 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
|
||||||
/* map->phase where necessary */
|
/* map->phase where necessary */
|
||||||
map2phase (job, mmax, llim, ulim);
|
map2phase (job, mmax, llim, ulim);
|
||||||
|
|
||||||
#pragma omp parallel
|
mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched)
|
||||||
{
|
|
||||||
sharp_job ljob = *job;
|
|
||||||
ljob.opcnt=0;
|
|
||||||
sharp_Ylmgen_C generator;
|
|
||||||
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
|
|
||||||
alloc_almtmp(&ljob,lmax);
|
|
||||||
|
|
||||||
#pragma omp for schedule(dynamic,1)
|
|
||||||
for (int mi=0; mi<job->ainfo->nm; ++mi)
|
|
||||||
{
|
{
|
||||||
/* alm->alm_tmp where necessary */
|
sharp_job ljob = *job;
|
||||||
alm2almtmp (&ljob, lmax, mi);
|
ljob.opcnt=0;
|
||||||
|
sharp_Ylmgen_C generator;
|
||||||
|
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
|
||||||
|
alloc_almtmp(&ljob,lmax);
|
||||||
|
|
||||||
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
|
while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
|
||||||
|
{
|
||||||
|
/* alm->alm_tmp where necessary */
|
||||||
|
alm2almtmp (&ljob, lmax, mi);
|
||||||
|
|
||||||
|
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
|
||||||
|
|
||||||
/* alm_tmp->alm where necessary */
|
/* alm_tmp->alm where necessary */
|
||||||
almtmp2alm (&ljob, lmax, mi);
|
almtmp2alm (&ljob, lmax, mi);
|
||||||
}
|
}
|
||||||
|
|
||||||
sharp_Ylmgen_destroy(&generator);
|
sharp_Ylmgen_destroy(&generator);
|
||||||
dealloc_almtmp(&ljob);
|
dealloc_almtmp(&ljob);
|
||||||
|
|
||||||
#pragma omp critical
|
opcnt+=ljob.opcnt;
|
||||||
job->opcnt+=ljob.opcnt;
|
}); /* end of parallel region */
|
||||||
} /* end of parallel region */
|
|
||||||
|
|
||||||
/* phase->map where necessary */
|
/* phase->map where necessary */
|
||||||
phase2map (job, mmax, llim, ulim);
|
phase2map (job, mmax, llim, ulim);
|
||||||
|
@ -909,6 +911,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
|
||||||
|
|
||||||
DEALLOC(job->norm_l);
|
DEALLOC(job->norm_l);
|
||||||
dealloc_phase (job);
|
dealloc_phase (job);
|
||||||
|
job->opcnt = opcnt;
|
||||||
job->time=sharp_wallTime()-timer;
|
job->time=sharp_wallTime()-timer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -934,8 +937,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
|
||||||
job->flags|=SHARP_REAL_HARMONICS;
|
job->flags|=SHARP_REAL_HARMONICS;
|
||||||
job->time = 0.;
|
job->time = 0.;
|
||||||
job->opcnt = 0;
|
job->opcnt = 0;
|
||||||
job->alm=alm;
|
job->alm=(void **)alm;
|
||||||
job->map=map;
|
job->map=(void **)map;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
|
@ -27,7 +27,7 @@
|
||||||
|
|
||||||
#define ARCH default
|
#define ARCH default
|
||||||
#define GENERIC_ARCH
|
#define GENERIC_ARCH
|
||||||
#include "libsharp2/sharp_core_inc.c"
|
#include "libsharp2/sharp_core_inc.cc"
|
||||||
#undef GENERIC_ARCH
|
#undef GENERIC_ARCH
|
||||||
#undef ARCH
|
#undef ARCH
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
#define XCONCATX2(a,b) XCONCATX(a,b)
|
#define XCONCATX2(a,b) XCONCATX(a,b)
|
||||||
#define XARCH(a) XCONCATX2(a,ARCH)
|
#define XARCH(a) XCONCATX2(a,ARCH)
|
||||||
|
|
||||||
#include <complex.h>
|
#include <complex>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "libsharp2/sharp_vecsupport.h"
|
#include "libsharp2/sharp_vecsupport.h"
|
||||||
|
@ -49,7 +49,7 @@
|
||||||
// Unfortunately, most compilers don't act on this pragma yet.
|
// Unfortunately, most compilers don't act on this pragma yet.
|
||||||
#pragma STDC FP_CONTRACT ON
|
#pragma STDC FP_CONTRACT ON
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
|
|
||||||
#define nv0 (128/VLEN)
|
#define nv0 (128/VLEN)
|
||||||
#define nvx (64/VLEN)
|
#define nvx (64/VLEN)
|
||||||
|
@ -94,35 +94,35 @@ typedef union
|
||||||
sxdata_s s;
|
sxdata_s s;
|
||||||
} sxdata_u;
|
} sxdata_u;
|
||||||
|
|
||||||
static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
|
static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
|
||||||
double maxval)
|
double maxval)
|
||||||
{
|
{
|
||||||
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
|
const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
|
||||||
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
|
const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
|
||||||
Tm mask = vgt(vabs(*val),vfmax);
|
auto mask = abs(*val)>vfmax;
|
||||||
while (vanyTrue(mask))
|
while (any_of(mask))
|
||||||
{
|
{
|
||||||
vmuleq_mask(mask,*val,vfsmall);
|
where(mask,*val)*=vfsmall;
|
||||||
vaddeq_mask(mask,*scale,vone);
|
where(mask,*scale)+=1;
|
||||||
mask = vgt(vabs(*val),vfmax);
|
mask = abs(*val)>vfmax;
|
||||||
}
|
}
|
||||||
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
|
mask = (abs(*val)<vfmin) & (*val!=0);
|
||||||
while (vanyTrue(mask))
|
while (any_of(mask))
|
||||||
{
|
{
|
||||||
vmuleq_mask(mask,*val,vfbig);
|
where(mask,*val)*=vfbig;
|
||||||
vsubeq_mask(mask,*scale,vone);
|
where(mask,*scale)-=1;
|
||||||
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
|
mask = (abs(*val)<vfmin) & (*val!=0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mypow(Tv val, int npow, const double * restrict powlimit,
|
static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
|
||||||
Tv * restrict resd, Tv * restrict ress)
|
Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
|
||||||
{
|
{
|
||||||
Tv vminv=vload(powlimit[npow]);
|
Tv vminv=powlimit[npow];
|
||||||
Tm mask = vlt(vabs(val),vminv);
|
auto mask = abs(val)<vminv;
|
||||||
if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
|
if (none_of(mask)) // no underflows possible, use quick algoritm
|
||||||
{
|
{
|
||||||
Tv res=vone;
|
Tv res=1;
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (npow&1)
|
if (npow&1)
|
||||||
|
@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
|
||||||
}
|
}
|
||||||
while(npow>>=1);
|
while(npow>>=1);
|
||||||
*resd=res;
|
*resd=res;
|
||||||
*ress=vzero;
|
*ress=0;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Tv scale=vzero, scaleint=vzero, res=vone;
|
Tv scale=0, scaleint=0, res=1;
|
||||||
Tvnormalize(&val,&scaleint,sharp_fbighalf);
|
Tvnormalize(&val,&scaleint,sharp_fbighalf);
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void getCorfac(Tv scale, Tv * restrict corfac,
|
static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
|
||||||
const double * restrict cf)
|
const double * MRUTIL_RESTRICT cf)
|
||||||
{
|
{
|
||||||
typedef union
|
typedef union
|
||||||
{ Tv v; double s[VLEN]; } Tvu;
|
{ Tv v; double s[VLEN]; } Tvu;
|
||||||
|
@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
|
||||||
*corfac=corf.v;
|
*corfac=corf.v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
|
static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
|
||||||
{
|
{
|
||||||
Tm mask = vgt(vabs(*v2),eps);
|
auto mask = abs(*v2)>eps;
|
||||||
if (vanyTrue(mask))
|
if (any_of(mask))
|
||||||
{
|
{
|
||||||
vmuleq_mask(mask,*v1,vload(sharp_fsmall));
|
where(mask,*v1)*=sharp_fsmall;
|
||||||
vmuleq_mask(mask,*v2,vload(sharp_fsmall));
|
where(mask,*v2)*=sharp_fsmall;
|
||||||
vaddeq_mask(mask,*s,vone);
|
where(mask,*s)+=1;
|
||||||
return 1;
|
return true;
|
||||||
}
|
}
|
||||||
return 0;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
|
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
|
||||||
s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
|
s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
|
||||||
{
|
{
|
||||||
int l=gen->m, il=0;
|
int l=gen->m, il=0;
|
||||||
Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
|
Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
|
||||||
Tv limscale=vload(sharp_limscale);
|
Tv limscale=sharp_limscale;
|
||||||
int below_limit = 1;
|
int below_limit = 1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->lam1[i]=vzero;
|
d->lam1[i]=0;
|
||||||
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
|
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
|
||||||
d->lam2[i] *= mfac;
|
d->lam2[i] *= mfac;
|
||||||
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
|
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
|
||||||
below_limit &= vallTrue(vlt(d->scale[i],limscale));
|
below_limit &= all_of(d->scale[i]<limscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
while (below_limit)
|
while (below_limit)
|
||||||
{
|
{
|
||||||
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
|
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||||
below_limit=1;
|
below_limit=1;
|
||||||
Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b);
|
Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b;
|
||||||
Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
|
Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
|
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
|
||||||
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
|
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
|
||||||
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
|
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
|
||||||
below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
|
below_limit &= all_of(d->scale[i]<sharp_limscale);
|
||||||
}
|
}
|
||||||
l+=4; il+=2;
|
l+=4; il+=2;
|
||||||
}
|
}
|
||||||
*l_=l; *il_=il;
|
*l_=l; *il_=il;
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
|
||||||
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
|
||||||
int l, int il, int lmax, int nv2)
|
int l, int il, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
if (nv2==nv0)
|
if (nv2==nv0)
|
||||||
{
|
{
|
||||||
for (; l<=lmax-2; il+=2, l+=4)
|
for (; l<=lmax-2; il+=2, l+=4)
|
||||||
{
|
{
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
|
||||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
|
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
|
||||||
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
|
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
|
||||||
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
|
Tv a1=coef[il ].a, b1=coef[il ].b;
|
||||||
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
|
Tv a2=coef[il+1].a, b2=coef[il+1].b;
|
||||||
for (int i=0; i<nv0; ++i)
|
for (int i=0; i<nv0; ++i)
|
||||||
{
|
{
|
||||||
d->p1r[i] += d->lam2[i]*ar1;
|
d->p1r[i] += d->lam2[i]*ar1;
|
||||||
|
@ -249,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
||||||
{
|
{
|
||||||
for (; l<=lmax-2; il+=2, l+=4)
|
for (; l<=lmax-2; il+=2, l+=4)
|
||||||
{
|
{
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
|
||||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
|
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
|
||||||
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
|
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
|
||||||
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
|
Tv a1=coef[il ].a, b1=coef[il ].b;
|
||||||
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
|
Tv a2=coef[il+1].a, b2=coef[il+1].b;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->p1r[i] += d->lam2[i]*ar1;
|
d->p1r[i] += d->lam2[i]*ar1;
|
||||||
|
@ -272,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
||||||
}
|
}
|
||||||
for (; l<=lmax; ++il, l+=2)
|
for (; l<=lmax; ++il, l+=2)
|
||||||
{
|
{
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
|
||||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
Tv a=vload(coef[il].a), b=vload(coef[il].b);
|
Tv a=coef[il].a, b=coef[il].b;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->p1r[i] += d->lam2[i]*ar1;
|
d->p1r[i] += d->lam2[i]*ar1;
|
||||||
|
@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
|
||||||
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
|
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
|
||||||
{
|
{
|
||||||
int l,il,lmax=gen->lmax;
|
int l,il,lmax=gen->lmax;
|
||||||
int nv2 = (nth+VLEN-1)/VLEN;
|
int nv2 = (nth+VLEN-1)/VLEN;
|
||||||
|
@ -298,20 +298,20 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 6*nth;
|
job->opcnt += (lmax+1-l) * 6*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
|
||||||
const dcmplx * restrict alm=job->almtmp;
|
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
|
||||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
Tv a=vload(coef[il].a), b=vload(coef[il].b);
|
Tv a=coef[il].a, b=coef[il].b;
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -322,9 +322,9 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
||||||
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
|
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
|
||||||
d->lam1[i] = d->lam2[i];
|
d->lam1[i] = d->lam2[i];
|
||||||
d->lam2[i] = tmp;
|
d->lam2[i] = tmp;
|
||||||
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
|
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
|
||||||
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
l+=2; ++il;
|
l+=2; ++il;
|
||||||
}
|
}
|
||||||
|
@ -338,16 +338,16 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
||||||
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
|
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void map2alm_kernel(s0data_v * restrict d,
|
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
|
||||||
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
|
||||||
int il, int lmax, int nv2)
|
int il, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
for (; l<=lmax-2; il+=2, l+=4)
|
for (; l<=lmax-2; il+=2, l+=4)
|
||||||
{
|
{
|
||||||
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
|
Tv a1=coef[il ].a, b1=coef[il ].b;
|
||||||
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
|
Tv a2=coef[il+1].a, b2=coef[il+1].b;
|
||||||
Tv atmp1[4] = {vzero, vzero, vzero, vzero};
|
Tv atmp1[4] = {0,0,0,0};
|
||||||
Tv atmp2[4] = {vzero, vzero, vzero, vzero};
|
Tv atmp2[4] = {0,0,0,0};
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
atmp1[0] += d->lam2[i]*d->p1r[i];
|
atmp1[0] += d->lam2[i]*d->p1r[i];
|
||||||
|
@ -366,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
|
||||||
}
|
}
|
||||||
for (; l<=lmax; ++il, l+=2)
|
for (; l<=lmax; ++il, l+=2)
|
||||||
{
|
{
|
||||||
Tv a=vload(coef[il].a), b=vload(coef[il].b);
|
Tv a=coef[il].a, b=coef[il].b;
|
||||||
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
Tv atmp[4] = {0,0,0,0};
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
atmp[0] += d->lam2[i]*d->p1r[i];
|
atmp[0] += d->lam2[i]*d->p1r[i];
|
||||||
|
@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
|
||||||
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
|
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
|
||||||
{
|
{
|
||||||
int l,il,lmax=gen->lmax;
|
int l,il,lmax=gen->lmax;
|
||||||
int nv2 = (nth+VLEN-1)/VLEN;
|
int nv2 = (nth+VLEN-1)/VLEN;
|
||||||
|
@ -392,19 +392,19 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 6*nth;
|
job->opcnt += (lmax+1-l) * 6*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
|
||||||
dcmplx * restrict alm=job->almtmp;
|
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv a=vload(coef[il].a), b=vload(coef[il].b);
|
Tv a=coef[il].a, b=coef[il].b;
|
||||||
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
Tv atmp[4] = {0,0,0,0};
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -415,9 +415,9 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
||||||
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
|
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
|
||||||
d->lam1[i] = d->lam2[i];
|
d->lam1[i] = d->lam2[i];
|
||||||
d->lam2[i] = tmp;
|
d->lam2[i] = tmp;
|
||||||
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
|
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
|
||||||
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
|
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
|
||||||
l+=2; ++il;
|
l+=2; ++il;
|
||||||
|
@ -432,21 +432,21 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
||||||
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
|
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
|
||||||
sxdata_v * restrict d, int * restrict l_, int nv2)
|
sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
|
||||||
{
|
{
|
||||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
|
||||||
Tv prefac=vload(gen->prefac[gen->m]),
|
Tv prefac=gen->prefac[gen->m],
|
||||||
prescale=vload(gen->fscale[gen->m]);
|
prescale=gen->fscale[gen->m];
|
||||||
Tv limscale=vload(sharp_limscale);
|
Tv limscale=sharp_limscale;
|
||||||
int below_limit=1;
|
int below_limit=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
|
Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5));
|
||||||
Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
|
Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5));
|
||||||
Tm mask=vlt(d->sth[i],vzero);
|
auto mask=d->sth[i]<0;
|
||||||
vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
|
where(mask&(d->cth[i]<0),cth2)*=-1.;
|
||||||
vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
|
where(mask&(d->cth[i]<0),sth2)*=-1.;
|
||||||
|
|
||||||
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
|
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
|
||||||
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
|
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
|
||||||
|
@ -454,8 +454,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
|
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
|
||||||
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
|
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
|
||||||
|
|
||||||
d->l1p[i] = vzero;
|
d->l1p[i] = 0;
|
||||||
d->l1m[i] = vzero;
|
d->l1m[i] = 0;
|
||||||
d->l2p[i] = prefac*ccp;
|
d->l2p[i] = prefac*ccp;
|
||||||
d->scp[i] = prescale+ccps;
|
d->scp[i] = prescale+ccps;
|
||||||
d->l2m[i] = prefac*csp;
|
d->l2m[i] = prefac*csp;
|
||||||
|
@ -467,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
d->l2m[i] *= scp;
|
d->l2m[i] *= scp;
|
||||||
d->scm[i] += scps;
|
d->scm[i] += scps;
|
||||||
if (gen->preMinus_p)
|
if (gen->preMinus_p)
|
||||||
d->l2p[i] = vneg(d->l2p[i]);
|
d->l2p[i] = -d->l2p[i];
|
||||||
if (gen->preMinus_m)
|
if (gen->preMinus_m)
|
||||||
d->l2m[i] = vneg(d->l2m[i]);
|
d->l2m[i] = -d->l2m[i];
|
||||||
if (gen->s&1)
|
if (gen->s&1)
|
||||||
d->l2p[i] = vneg(d->l2p[i]);
|
d->l2p[i] = -d->l2p[i];
|
||||||
|
|
||||||
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
|
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
|
||||||
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
|
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
|
||||||
|
|
||||||
below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
|
below_limit &= all_of(d->scm[i]<limscale) &&
|
||||||
vallTrue(vlt(d->scp[i],limscale));
|
all_of(d->scp[i]<limscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
int l=gen->mhi;
|
int l=gen->mhi;
|
||||||
|
@ -486,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
{
|
{
|
||||||
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||||
below_limit=1;
|
below_limit=1;
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
|
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) ||
|
||||||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
|
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol))
|
||||||
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
|
below_limit &= all_of(d->scp[i]<limscale) &&
|
||||||
vallTrue(vlt(d->scm[i],limscale));
|
all_of(d->scm[i]<limscale);
|
||||||
}
|
}
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
|
@ -505,19 +505,19 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
*l_=l;
|
*l_=l;
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
|
||||||
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
|
||||||
int l, int lmax, int nv2)
|
int l, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
int lsave = l;
|
int lsave = l;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
|
||||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
|
||||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
|
||||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
|
@ -537,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
||||||
l=lsave;
|
l=lsave;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
|
||||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
|
||||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
|
||||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
|
@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
|
||||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
|
||||||
{
|
{
|
||||||
int l,lmax=gen->lmax;
|
int l,lmax=gen->lmax;
|
||||||
int nv2 = (nth+VLEN-1)/VLEN;
|
int nv2 = (nth+VLEN-1)/VLEN;
|
||||||
|
@ -571,25 +571,25 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 23*nth;
|
job->opcnt += (lmax+1-l) * 23*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
|
||||||
const dcmplx * restrict alm=job->almtmp;
|
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
|
||||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
|
||||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
|
||||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
|
||||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -611,12 +611,12 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
|
|
||||||
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
|
||||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
|
@ -641,17 +641,17 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
|
||||||
const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
|
||||||
int l, int lmax, int nv2)
|
int l, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
int lsave=l;
|
int lsave=l;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
Tv agr1=0, agi1=0, acr1=0, aci1=0;
|
||||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
Tv agr2=0, agi2=0, acr2=0, aci2=0;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
|
@ -672,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
||||||
l=lsave;
|
l=lsave;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
Tv agr1=0, agi1=0, acr1=0, aci1=0;
|
||||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
Tv agr2=0, agi2=0, acr2=0, aci2=0;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
|
@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
|
||||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
|
||||||
{
|
{
|
||||||
int l,lmax=gen->lmax;
|
int l,lmax=gen->lmax;
|
||||||
int nv2 = (nth+VLEN-1)/VLEN;
|
int nv2 = (nth+VLEN-1)/VLEN;
|
||||||
|
@ -705,15 +705,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 23*nth;
|
job->opcnt += (lmax+1-l) * 23*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
|
||||||
dcmplx * restrict alm=job->almtmp;
|
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
|
||||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -726,10 +726,10 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
Tv agr1=0, agi1=0, acr1=0, aci1=0;
|
||||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
Tv agr2=0, agi2=0, acr2=0, aci2=0;
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -748,12 +748,12 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
||||||
|
|
||||||
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
|
||||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
|
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
|
||||||
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
|
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
|
||||||
|
@ -772,17 +772,17 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
|
||||||
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
|
||||||
int l, int lmax, int nv2)
|
int l, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
int lsave=l;
|
int lsave=l;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
|
||||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
|
@ -798,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
||||||
l=lsave;
|
l=lsave;
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
|
||||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
|
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
|
||||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
|
||||||
{
|
{
|
||||||
int l,lmax=gen->lmax;
|
int l,lmax=gen->lmax;
|
||||||
int nv2 = (nth+VLEN-1)/VLEN;
|
int nv2 = (nth+VLEN-1)/VLEN;
|
||||||
|
@ -826,23 +826,23 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 15*nth;
|
job->opcnt += (lmax+1-l) * 15*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
|
||||||
const dcmplx * restrict alm=job->almtmp;
|
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
|
||||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
|
||||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
|
@ -864,12 +864,12 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
||||||
|
|
||||||
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
|
||||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
|
||||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
|
||||||
}
|
}
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
|
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
||||||
|
|
||||||
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
|
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
|
||||||
|
|
||||||
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
const double *cth_, const double *sth_, int llim, int ulim,
|
const double *cth_, const double *sth_, int llim, int ulim,
|
||||||
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
||||||
{
|
{
|
||||||
|
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
if (job->spin==0)
|
if (job->spin==0)
|
||||||
{
|
{
|
||||||
//adjust the a_lm for the new algorithm
|
//adjust the a_lm for the new algorithm
|
||||||
dcmplx * restrict alm=job->almtmp;
|
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
||||||
{
|
{
|
||||||
dcmplx al = alm[l];
|
dcmplx al = alm[l];
|
||||||
|
@ -963,8 +963,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
d.s.p2r[i]*=cth_[tgt];
|
d.s.p2r[i]*=cth_[tgt];
|
||||||
d.s.p2i[i]*=cth_[tgt];
|
d.s.p2i[i]*=cth_[tgt];
|
||||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||||
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
|
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
|
||||||
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
|
r2(d.s.p2r[i], d.s.p2i[i]);
|
||||||
job->phase[phas_idx] = r1+r2;
|
job->phase[phas_idx] = r1+r2;
|
||||||
if (ispair[tgt])
|
if (ispair[tgt])
|
||||||
job->phase[phas_idx+1] = r1-r2;
|
job->phase[phas_idx+1] = r1-r2;
|
||||||
|
@ -1027,10 +1027,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
{
|
{
|
||||||
int tgt=itgt[i];
|
int tgt=itgt[i];
|
||||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||||
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
|
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
|
||||||
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
|
q2(d.s.p2pr[i], d.s.p2pi[i]),
|
||||||
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
|
u1(d.s.p1mr[i], d.s.p1mi[i]),
|
||||||
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
|
u2(d.s.p2mr[i], d.s.p2mi[i]);
|
||||||
job->phase[phas_idx ] = q1+q2;
|
job->phase[phas_idx ] = q1+q2;
|
||||||
job->phase[phas_idx+2] = u1+u2;
|
job->phase[phas_idx+2] = u1+u2;
|
||||||
if (ispair[tgt])
|
if (ispair[tgt])
|
||||||
|
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
const double *cth_, const double *sth_, int llim, int ulim,
|
const double *cth_, const double *sth_, int llim, int ulim,
|
||||||
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
||||||
{
|
{
|
||||||
|
@ -1083,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
int phas_idx = ith*job->s_th + mi*job->s_m;
|
int phas_idx = ith*job->s_th + mi*job->s_m;
|
||||||
dcmplx ph1=job->phase[phas_idx];
|
dcmplx ph1=job->phase[phas_idx];
|
||||||
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
|
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
|
||||||
d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
|
d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag();
|
||||||
d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
|
d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag();
|
||||||
//adjust for new algorithm
|
//adjust for new algorithm
|
||||||
d.s.p2r[nth]*=cth_[ith];
|
d.s.p2r[nth]*=cth_[ith];
|
||||||
d.s.p2i[nth]*=cth_[ith];
|
d.s.p2i[nth]*=cth_[ith];
|
||||||
|
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//adjust the a_lm for the new algorithm
|
//adjust the a_lm for the new algorithm
|
||||||
dcmplx * restrict alm=job->almtmp;
|
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||||
dcmplx alm2 = 0.;
|
dcmplx alm2 = 0.;
|
||||||
double alold=0;
|
double alold=0;
|
||||||
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
||||||
|
@ -1138,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
|
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
|
||||||
if ((gen->mhi-gen->m+gen->s)&1)
|
if ((gen->mhi-gen->m+gen->s)&1)
|
||||||
{ p2Q=-p2Q; p2U=-p2U; }
|
{ p2Q=-p2Q; p2U=-p2U; }
|
||||||
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
|
d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag();
|
||||||
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
|
d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag();
|
||||||
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
|
d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag();
|
||||||
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
|
d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag();
|
||||||
++nth;
|
++nth;
|
||||||
}
|
}
|
||||||
++ith;
|
++ith;
|
|
@ -29,7 +29,7 @@
|
||||||
#include "libsharp2/sharp_geomhelpers.h"
|
#include "libsharp2/sharp_geomhelpers.h"
|
||||||
#include "libsharp2/sharp_legendre_roots.h"
|
#include "libsharp2/sharp_legendre_roots.h"
|
||||||
#include "libsharp2/sharp_utils.h"
|
#include "libsharp2/sharp_utils.h"
|
||||||
#include "libsharp2/pocketfft.h"
|
#include "mr_util/fft.h"
|
||||||
|
|
||||||
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
|
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
|
||||||
const int *rings, const double *weight, sharp_geom_info **geom_info)
|
const int *rings, const double *weight, sharp_geom_info **geom_info)
|
||||||
|
@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
|
||||||
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
|
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
|
||||||
}
|
}
|
||||||
if ((nrings&1)==0) weight[nrings-1]=0.;
|
if ((nrings&1)==0) weight[nrings-1]=0.;
|
||||||
pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
|
mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
|
||||||
pocketfft_backward_r(plan,weight,1.);
|
|
||||||
pocketfft_delete_plan_r(plan);
|
|
||||||
|
|
||||||
for (int m=0; m<(nrings+1)/2; ++m)
|
for (int m=0; m<(nrings+1)/2; ++m)
|
||||||
{
|
{
|
||||||
|
@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
|
||||||
for (int k=1; k<=(n/2-1); ++k)
|
for (int k=1; k<=(n/2-1); ++k)
|
||||||
weight[2*k-1]=2./(1.-4.*k*k) + dw;
|
weight[2*k-1]=2./(1.-4.*k*k) + dw;
|
||||||
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
|
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
|
||||||
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
|
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
|
||||||
pocketfft_backward_r(plan,weight,1.);
|
|
||||||
pocketfft_delete_plan_r(plan);
|
|
||||||
weight[n]=weight[0];
|
weight[n]=weight[0];
|
||||||
|
|
||||||
for (int m=0; m<(nrings+1)/2; ++m)
|
for (int m=0; m<(nrings+1)/2; ++m)
|
||||||
|
@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
|
||||||
for (int k=1; k<=(n/2-1); ++k)
|
for (int k=1; k<=(n/2-1); ++k)
|
||||||
weight[2*k-1]=2./(1.-4.*k*k);
|
weight[2*k-1]=2./(1.-4.*k*k);
|
||||||
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
|
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
|
||||||
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
|
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
|
||||||
pocketfft_backward_r(plan,weight,1.);
|
|
||||||
pocketfft_delete_plan_r(plan);
|
|
||||||
for (int m=0; m<nrings; ++m)
|
for (int m=0; m<nrings; ++m)
|
||||||
weight[m]=weight[m+1];
|
weight[m]=weight[m+1];
|
||||||
|
|
|
@ -28,14 +28,12 @@
|
||||||
#ifndef SHARP2_INTERNAL_H
|
#ifndef SHARP2_INTERNAL_H
|
||||||
#define SHARP2_INTERNAL_H
|
#define SHARP2_INTERNAL_H
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#include <complex>
|
||||||
#error This header file cannot be included from C++, only from C
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <complex.h>
|
|
||||||
#include "libsharp2/sharp.h"
|
#include "libsharp2/sharp.h"
|
||||||
#include "libsharp2/sharp_ylmgen_c.h"
|
#include "libsharp2/sharp_ylmgen_c.h"
|
||||||
|
|
||||||
|
using std::complex;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
sharp_jobtype type;
|
sharp_jobtype type;
|
||||||
|
@ -45,9 +43,9 @@ typedef struct
|
||||||
void **map;
|
void **map;
|
||||||
void **alm;
|
void **alm;
|
||||||
int s_m, s_th; // strides in m and theta direction
|
int s_m, s_th; // strides in m and theta direction
|
||||||
complex double *phase;
|
complex<double> *phase;
|
||||||
double *norm_l;
|
double *norm_l;
|
||||||
complex double *almtmp;
|
complex<double> *almtmp;
|
||||||
const sharp_geom_info *ginfo;
|
const sharp_geom_info *ginfo;
|
||||||
const sharp_alm_info *ainfo;
|
const sharp_alm_info *ainfo;
|
||||||
double time;
|
double time;
|
||||||
|
|
|
@ -122,10 +122,4 @@ double sharp_wallTime(void);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __GNUC__
|
|
||||||
#define NOINLINE __attribute__((noinline))
|
|
||||||
#else
|
|
||||||
#define NOINLINE
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -28,193 +28,27 @@
|
||||||
#ifndef SHARP2_VECSUPPORT_H
|
#ifndef SHARP2_VECSUPPORT_H
|
||||||
#define SHARP2_VECSUPPORT_H
|
#define SHARP2_VECSUPPORT_H
|
||||||
|
|
||||||
#include <math.h>
|
#include <cmath>
|
||||||
|
#include <complex>
|
||||||
|
using std::complex;
|
||||||
|
#include <experimental/simd>
|
||||||
|
using std::experimental::native_simd;
|
||||||
|
using std::experimental::reduce;
|
||||||
|
|
||||||
#ifndef VLEN
|
#include "mr_util/useful_macros.h"
|
||||||
|
|
||||||
#if (defined(__AVX512F__))
|
using Tv=native_simd<double>;
|
||||||
#define VLEN 8
|
using Tm=Tv::mask_type;
|
||||||
#elif (defined (__AVX__))
|
using Ts=Tv::value_type;
|
||||||
#define VLEN 4
|
static constexpr size_t VLEN=Tv::size();
|
||||||
#elif (defined (__SSE2__))
|
|
||||||
#define VLEN 2
|
|
||||||
#else
|
|
||||||
#define VLEN 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef double Ts;
|
|
||||||
|
|
||||||
#if (VLEN==1)
|
|
||||||
|
|
||||||
typedef double Tv;
|
|
||||||
typedef int Tm;
|
|
||||||
|
|
||||||
#define vload(a) (a)
|
#define vload(a) (a)
|
||||||
#define vzero 0.
|
|
||||||
#define vone 1.
|
|
||||||
|
|
||||||
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
|
|
||||||
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
|
|
||||||
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
|
|
||||||
#define vneg(a) (-(a))
|
|
||||||
#define vabs(a) fabs(a)
|
|
||||||
#define vsqrt(a) sqrt(a)
|
|
||||||
#define vlt(a,b) ((a)<(b))
|
|
||||||
#define vgt(a,b) ((a)>(b))
|
|
||||||
#define vge(a,b) ((a)>=(b))
|
|
||||||
#define vne(a,b) ((a)!=(b))
|
|
||||||
#define vand_mask(a,b) ((a)&&(b))
|
|
||||||
#define vor_mask(a,b) ((a)||(b))
|
|
||||||
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
|
|
||||||
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
|
||||||
#define vanyTrue(a) (a)
|
|
||||||
#define vallTrue(a) (a)
|
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
_Complex double * restrict cc)
|
complex<double> * MRUTIL_RESTRICT cc)
|
||||||
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if (VLEN==2)
|
|
||||||
|
|
||||||
#include <emmintrin.h>
|
|
||||||
|
|
||||||
#if defined (__SSE3__)
|
|
||||||
#include <pmmintrin.h>
|
|
||||||
#endif
|
|
||||||
#if defined (__SSE4_1__)
|
|
||||||
#include <smmintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef __m128d Tv;
|
|
||||||
typedef __m128d Tm;
|
|
||||||
|
|
||||||
#if defined(__SSE4_1__)
|
|
||||||
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
|
|
||||||
#else
|
|
||||||
static inline Tv vblend__(Tv m, Tv a, Tv b)
|
|
||||||
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
|
|
||||||
#endif
|
|
||||||
#define vload(a) _mm_set1_pd(a)
|
|
||||||
#define vzero _mm_setzero_pd()
|
|
||||||
#define vone vload(1.)
|
|
||||||
|
|
||||||
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
|
|
||||||
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
|
|
||||||
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
|
|
||||||
#define vneg(a) _mm_xor_pd(vload(-0.),a)
|
|
||||||
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
|
|
||||||
#define vsqrt(a) _mm_sqrt_pd(a)
|
|
||||||
#define vlt(a,b) _mm_cmplt_pd(a,b)
|
|
||||||
#define vgt(a,b) _mm_cmpgt_pd(a,b)
|
|
||||||
#define vge(a,b) _mm_cmpge_pd(a,b)
|
|
||||||
#define vne(a,b) _mm_cmpneq_pd(a,b)
|
|
||||||
#define vand_mask(a,b) _mm_and_pd(a,b)
|
|
||||||
#define vor_mask(a,b) _mm_or_pd(a,b)
|
|
||||||
#define vmin(a,b) _mm_min_pd(a,b)
|
|
||||||
#define vmax(a,b) _mm_max_pd(a,b);
|
|
||||||
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
|
|
||||||
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
|
|
||||||
Tv d, _Complex double * restrict cc)
|
|
||||||
{
|
{
|
||||||
union {Tv v; _Complex double c; } u1, u2;
|
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
|
||||||
#if defined(__SSE3__)
|
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));
|
||||||
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
|
||||||
#else
|
|
||||||
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
|
||||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
|
||||||
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
|
|
||||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
|
|
||||||
#endif
|
|
||||||
cc[0]+=u1.c; cc[1]+=u2.c;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (VLEN==4)
|
|
||||||
|
|
||||||
#include <immintrin.h>
|
|
||||||
|
|
||||||
typedef __m256d Tv;
|
|
||||||
typedef __m256d Tm;
|
|
||||||
|
|
||||||
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
|
|
||||||
#define vload(a) _mm256_set1_pd(a)
|
|
||||||
#define vzero _mm256_setzero_pd()
|
|
||||||
#define vone vload(1.)
|
|
||||||
|
|
||||||
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
|
|
||||||
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
|
|
||||||
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
|
|
||||||
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
|
|
||||||
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
|
|
||||||
#define vsqrt(a) _mm256_sqrt_pd(a)
|
|
||||||
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
|
|
||||||
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
|
|
||||||
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
|
|
||||||
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
|
|
||||||
#define vand_mask(a,b) _mm256_and_pd(a,b)
|
|
||||||
#define vor_mask(a,b) _mm256_or_pd(a,b)
|
|
||||||
#define vmin(a,b) _mm256_min_pd(a,b)
|
|
||||||
#define vmax(a,b) _mm256_max_pd(a,b)
|
|
||||||
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
|
|
||||||
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
|
||||||
_Complex double * restrict cc)
|
|
||||||
{
|
|
||||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
|
||||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
|
||||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
|
||||||
tmp1=tmp3+tmp4;
|
|
||||||
union {Tv v; _Complex double c[2]; } u;
|
|
||||||
u.v=tmp1;
|
|
||||||
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if (VLEN==8)
|
|
||||||
|
|
||||||
#include <immintrin.h>
|
|
||||||
|
|
||||||
typedef __m512d Tv;
|
|
||||||
typedef __mmask8 Tm;
|
|
||||||
|
|
||||||
#define vload(a) _mm512_set1_pd(a)
|
|
||||||
#define vzero _mm512_setzero_pd()
|
|
||||||
#define vone vload(1.)
|
|
||||||
|
|
||||||
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
|
|
||||||
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
|
|
||||||
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
|
|
||||||
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
|
|
||||||
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
|
|
||||||
#define vsqrt(a) _mm512_sqrt_pd(a)
|
|
||||||
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
|
|
||||||
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
|
|
||||||
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
|
|
||||||
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
|
|
||||||
#define vand_mask(a,b) ((a)&(b))
|
|
||||||
#define vor_mask(a,b) ((a)|(b))
|
|
||||||
#define vmin(a,b) _mm512_min_pd(a,b)
|
|
||||||
#define vmax(a,b) _mm512_max_pd(a,b)
|
|
||||||
#define vanyTrue(a) (a!=0)
|
|
||||||
#define vallTrue(a) (a==255)
|
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
|
||||||
_Complex double * restrict cc)
|
|
||||||
{
|
|
||||||
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
|
||||||
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
|
@ -26,7 +26,8 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <complex.h>
|
#include <complex>
|
||||||
|
using std::complex;
|
||||||
#ifdef USE_MPI
|
#ifdef USE_MPI
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
#include "libsharp2/sharp_mpi.h"
|
#include "libsharp2/sharp_mpi.h"
|
||||||
|
@ -38,7 +39,7 @@
|
||||||
#include "libsharp2/sharp_geomhelpers.h"
|
#include "libsharp2/sharp_geomhelpers.h"
|
||||||
#include "libsharp2/sharp_almhelpers.h"
|
#include "libsharp2/sharp_almhelpers.h"
|
||||||
#include "libsharp2/sharp_utils.h"
|
#include "libsharp2/sharp_utils.h"
|
||||||
#include "libsharp2/sharp_utils.c"
|
#include "libsharp2/sharp_utils.cc"
|
||||||
#include "test/memusage.h"
|
#include "test/memusage.h"
|
||||||
|
|
||||||
static void OpenMP_status(void)
|
static void OpenMP_status(void)
|
||||||
|
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex<double> dcmplx;
|
||||||
|
|
||||||
int ntasks, mytask;
|
int ntasks, mytask;
|
||||||
|
|
||||||
|
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
|
||||||
{
|
{
|
||||||
double rv = drand(-1,1,&state);
|
double rv = drand(-1,1,&state);
|
||||||
double iv = (m==0) ? 0 : drand(-1,1,&state);
|
double iv = (m==0) ? 0 : drand(-1,1,&state);
|
||||||
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
|
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
|
||||||
sqsum[i]=0;
|
sqsum[i]=0;
|
||||||
for (ptrdiff_t j=0; j<nalms; ++j)
|
for (ptrdiff_t j=0; j<nalms; ++j)
|
||||||
{
|
{
|
||||||
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
|
sqsum[i]+=norm(alm[i][j]);
|
||||||
+cimag(alm[i][j])*cimag(alm[i][j]);
|
|
||||||
alm[i][j]=-alm[i][j];
|
alm[i][j]=-alm[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
|
||||||
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
|
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
|
||||||
for (ptrdiff_t j=0; j<nalms; ++j)
|
for (ptrdiff_t j=0; j<nalms; ++j)
|
||||||
{
|
{
|
||||||
double sqr=creal(alm[i][j])*creal(alm[i][j])
|
double sqr=norm(alm[i][j]);
|
||||||
+cimag(alm[i][j])*cimag(alm[i][j]);
|
|
||||||
sum+=sqr;
|
sum+=sqr;
|
||||||
if (sqr>maxdiff) maxdiff=sqr;
|
if (sqr>maxdiff) maxdiff=sqr;
|
||||||
}
|
}
|
||||||
|
@ -414,7 +413,7 @@ static void check_sign_scale(void)
|
||||||
ALLOC2D(alm,dcmplx,2,nalms);
|
ALLOC2D(alm,dcmplx,2,nalms);
|
||||||
for (int i=0; i<2; ++i)
|
for (int i=0; i<2; ++i)
|
||||||
for (int j=0; j<nalms; ++j)
|
for (int j=0; j<nalms; ++j)
|
||||||
alm[i][j]=1.+_Complex_I;
|
alm[i][j]=dcmplx(1.,1.);
|
||||||
|
|
||||||
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
||||||
NULL,NULL);
|
NULL,NULL);
|
Loading…
Add table
Add a link
Reference in a new issue