Compare commits

6 commits

Author | SHA1 | Date
---|---|---
 | b27ce30cde |
 | f8a9c96acf |
 | 75559e6894 |
 | 54cbae0534 |
 | 0f8051fc28 |
 | 0378ce155a |

19 changed files with 382 additions and 2817 deletions

COMPILE (2 changes)
@@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
 configure command line.
 At runtime OMP_NUM_THREADS should be set to the number of hardware threads
 (not physical cores) of the system.
 (Usually this is already the default setting when OMP_NUM_THREADS is not
 specified.)
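The hardware-thread count that OMP_NUM_THREADS refers to can be queried portably from C++; a minimal sketch, independent of this repository:

    #include <cstdio>
    #include <thread>

    int main()
      {
      // hardware_concurrency() counts hardware threads (hyperthreads included),
      // i.e. the quantity OMP_NUM_THREADS should usually be set to.
      unsigned nt = std::thread::hardware_concurrency();
      std::printf("suggested OMP_NUM_THREADS: %u\n", nt ? nt : 1u);
      }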
Makefile.am (44 changes)
@@ -3,16 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
 lib_LTLIBRARIES = libsharp2.la
 
 libsharp2_la_SOURCES = \
-  libsharp2/pocketfft.c \
-  libsharp2/pocketfft.h \
-  libsharp2/sharp_utils.c \
+  libsharp2/sharp_utils.cc \
   libsharp2/sharp_utils.h \
-  libsharp2/sharp.c \
-  libsharp2/sharp_almhelpers.c \
-  libsharp2/sharp_core.c \
-  libsharp2/sharp_geomhelpers.c \
-  libsharp2/sharp_legendre_roots.c \
-  libsharp2/sharp_ylmgen_c.c \
+  libsharp2/sharp.cc \
+  libsharp2/sharp_almhelpers.cc \
+  libsharp2/sharp_core.cc \
+  libsharp2/sharp_geomhelpers.cc \
+  libsharp2/sharp_legendre_roots.cc \
+  libsharp2/sharp_ylmgen_c.cc \
   libsharp2/sharp_internal.h \
   libsharp2/sharp_legendre_roots.h \
   libsharp2/sharp_vecsupport.h \
@@ -24,25 +22,25 @@ libsharp2_la_SOURCES = \
 # any backward-compatible change: increase age
 # any backward-incompatible change: age=0
 # ==> age <= current
-libsharp2_la_LDFLAGS = -version-info 0:0:0
+libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread
 
-AM_CFLAGS = @AM_CFLAGS@
+AM_CXXFLAGS = @AM_CXXFLAGS@
 
 if HAVE_MULTIARCH
 
-libavx_la_SOURCES = libsharp2/sharp_core_inc.c
-libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
-libfma_la_SOURCES = libsharp2/sharp_core_inc.c
-libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
-libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
+libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
+libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
+libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
+libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
+libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
 
 noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
 
-libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
-libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
-libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
-libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
-libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
+libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
+libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
+libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
+libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
+libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
 
 libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
 
@@ -56,10 +54,10 @@ nobase_include_HEADERS = \
   libsharp2/sharp_cxx.h
 
 EXTRA_DIST = \
-  runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
+  runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
 
 check_PROGRAMS = sharp2_testsuite
-sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
+sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
 sharp2_testsuite_LDADD = libsharp2.la
 
 TESTS = runtest.sh
configure.ac (15 changes)
@@ -20,28 +20,21 @@ dnl
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
-AC_PROG_CC_C99
-AC_OPENMP
-
-# add math library
-LIBS="-lm"
+AC_PROG_CXX
+AX_CXX_COMPILE_STDCXX([11])
 
 AC_PROG_LIBTOOL
 
-tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
+tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
 AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
 
-AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
-
 PACKAGE_LIBS="-lsharp2"
-PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
-PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
 
 dnl
 dnl Create pkgconfig .pc file.
 dnl
 AX_CREATE_PKGCONFIG_INFO(,,,,[])
-AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_CXXFLAGS])
 
 AC_CONFIG_FILES([Makefile])
 AC_OUTPUT
(File diff suppressed because it is too large)
libsharp2/pocketfft.h (deleted)

@@ -1,50 +0,0 @@
-/*
- * This file is part of pocketfft.
- * Licensed under a 3-clause BSD style license - see LICENSE.md
- */
-
-/*! \file pocketfft.h
- *  Public interface of the pocketfft library
- *
- *  Copyright (C) 2008-2019 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef POCKETFFT_H
-#define POCKETFFT_H
-
-#include <stdlib.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct pocketfft_plan_c_i;
-typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
-pocketfft_plan_c pocketfft_make_plan_c (size_t length);
-void pocketfft_delete_plan_c (pocketfft_plan_c plan);
-int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
-int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
-size_t pocketfft_length_c(pocketfft_plan_c plan);
-
-struct pocketfft_plan_r_i;
-typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
-pocketfft_plan_r pocketfft_make_plan_r (size_t length);
-void pocketfft_delete_plan_r (pocketfft_plan_r plan);
-/*! Computes a real backward FFT on \a c, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
-    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
-int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
-/*! Computes a real forward FFT on \a c, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
-    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
-int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
-size_t pocketfft_length_r(pocketfft_plan_r plan);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
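The removed C plan interface corresponds, in the rest of this changeset, to mr_util's C++ FFT classes. A rough sketch of the mapping, with the class name and exec() signature taken from the hunks below and all other details assumed:

    #include "mr_util/fft.h"  // replacement FFT interface used by this changeset

    void real_fft_roundtrip(double *data, size_t n)
      {
      // Old C API: plan = pocketfft_make_plan_r(n);
      //            pocketfft_forward_r(plan, data, 1.);
      //            pocketfft_delete_plan_r(plan);
      // New C++ API, as used in sharp.cc below (ctor takes the length,
      // exec() takes data, a scale factor, and the direction):
      mr::detail_fft::rfftp<double> plan(n);
      plan.exec(data, 1., true);     // forward transform
      plan.exec(data, 1./n, false);  // backward transform, rescaled (assumed)
      }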
@@ -25,17 +25,20 @@
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#include <math.h>
-#include <string.h>
-#include "libsharp2/pocketfft.h"
+#include <cmath>
+#include <cstring>
+#include <atomic>
+#include "mr_util/fft.h"
 #include "libsharp2/sharp_ylmgen_c.h"
 #include "libsharp2/sharp_internal.h"
 #include "libsharp2/sharp_utils.h"
 #include "libsharp2/sharp_almhelpers.h"
 #include "libsharp2/sharp_geomhelpers.h"
+#include "mr_util/threading.h"
+#include "mr_util/useful_macros.h"
 
-typedef complex double dcmplx;
-typedef complex float fcmplx;
+typedef complex<double> dcmplx;
+typedef complex<float> fcmplx;
 
 static const double sqrt_one_half = 0.707106781186547572737310929369;
 static const double sqrt_two = 1.414213562373095145474621858739;
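The typedef change above sets the pattern for the rest of the file: C99 _Complex arithmetic becomes std::complex. A minimal illustration of the correspondence (not part of the diff):

    #include <complex>
    using dcmplx = std::complex<double>;

    // C99:  double complex z = creal(w) - _Complex_I*cimag(w);
    // C++:  the same value, spelled with std::complex members:
    dcmplx conj_like(dcmplx w)
      { return dcmplx(w.real(), -w.imag()); }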
@@ -57,7 +60,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
   *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
   }
 
-NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
+MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
   {
   double ofs=lmax*0.01;
   if (ofs<100.) ofs=100.;

@@ -76,7 +79,7 @@ typedef struct
   double phi0_;
   dcmplx *shiftarr;
   int s_shift;
-  pocketfft_plan_r plan;
+  mr::detail_fft::rfftp<double> *plan;
   int length;
   int norot;
   } ringhelper;
@@ -89,12 +92,12 @@ static void ringhelper_init (ringhelper *self)
 
 static void ringhelper_destroy (ringhelper *self)
   {
-  if (self->plan) pocketfft_delete_plan_r(self->plan);
+  delete self->plan;
   DEALLOC(self->shiftarr);
   ringhelper_init(self);
   }
 
-NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
   {
   self->norot = (fabs(phi0)<1e-14);
   if (!(self->norot))
@@ -105,27 +108,27 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
     self->phi0_ = phi0;
 // FIXME: improve this by using sincos2pibyn(nph) etc.
     for (int m=0; m<=mmax; ++m)
-      self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
+      self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
 //    double *tmp=(double *) self->shiftarr;
 //    sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
     }
 //  if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
   if (nph!=(int)self->length)
     {
-    if (self->plan) pocketfft_delete_plan_r(self->plan);
-    self->plan=pocketfft_make_plan_r(nph);
+    if (self->plan) delete self->plan;
+    self->plan=new mr::detail_fft::rfftp<double>(nph);
     self->length=nph;
     }
   }
 
 static int ringinfo_compare (const void *xa, const void *xb)
   {
-  const sharp_ringinfo *a=xa, *b=xb;
+  const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
   return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
   }
 static int ringpair_compare (const void *xa, const void *xb)
   {
-  const sharp_ringpair *a=xa, *b=xb;
+  const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
 //  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
   if (a->r1.nph==b->r1.nph)
     return (a->r1.phi0 < b->r1.phi0) ? -1 :
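The added casts are required because C++, unlike C, does not convert void * to a typed pointer implicitly. A small self-contained example of the same pattern:

    #include <cstdlib>

    static int cmp_int(const void *xa, const void *xb)
      {
      // In C, "const int *a = xa;" compiles; C++ demands the explicit cast:
      const int *a = (const int *)xa, *b = (const int *)xb;
      return (*a < *b) ? -1 : (*a > *b) ? 1 : 0;
      }

    void sort_ints(int *v, size_t n)
      { std::qsort(v, n, sizeof(int), cmp_int); }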
@@ -275,7 +278,7 @@ static int sharp_get_mmax (int *mval, int nm)
   return nm-1;
   }
 
-NOINLINE static void ringhelper_phase2ring (ringhelper *self,
+MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
   int pstride, int flags)
   {
@@ -292,22 +295,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
     if (self->norot)
       for (int m=0; m<=mmax; ++m)
         {
-        data[2*m]=creal(phase[m*pstride])*wgt;
-        data[2*m+1]=cimag(phase[m*pstride])*wgt;
+        data[2*m]=phase[m*pstride].real()*wgt;
+        data[2*m+1]=phase[m*pstride].imag()*wgt;
         }
     else
       for (int m=0; m<=mmax; ++m)
         {
         dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
-        data[2*m]=creal(tmp)*wgt;
-        data[2*m+1]=cimag(tmp)*wgt;
+        data[2*m]=tmp.real()*wgt;
+        data[2*m+1]=tmp.imag()*wgt;
         }
     for (int m=2*(mmax+1); m<nph+2; ++m)
       data[m]=0.;
     }
   else
     {
-    data[0]=creal(phase[0])*wgt;
+    data[0]=phase[0].real()*wgt;
     SET_ARRAY(data,1,nph+2,0.);
 
     int idx1=1, idx2=nph-1;
@@ -317,23 +320,23 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
       if(!self->norot) tmp*=self->shiftarr[m];
       if (idx1<(nph+2)/2)
         {
-        data[2*idx1]+=creal(tmp);
-        data[2*idx1+1]+=cimag(tmp);
+        data[2*idx1]+=tmp.real();
+        data[2*idx1+1]+=tmp.imag();
         }
       if (idx2<(nph+2)/2)
         {
-        data[2*idx2]+=creal(tmp);
-        data[2*idx2+1]-=cimag(tmp);
+        data[2*idx2]+=tmp.real();
+        data[2*idx2+1]-=tmp.imag();
         }
       if (++idx1>=nph) idx1=0;
       if (--idx2<0) idx2=nph-1;
       }
     }
   data[1]=data[0];
-  pocketfft_backward_r (self->plan, &(data[1]), 1.);
+  self->plan->exec(&(data[1]), 1., false);
   }
 
-NOINLINE static void ringhelper_ring2phase (ringhelper *self,
+MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
   int pstride, int flags)
   {
@@ -349,7 +352,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
   if (flags&SHARP_REAL_HARMONICS)
     wgt *= sqrt_two;
 
-  pocketfft_forward_r (self->plan, &(data[1]), 1.);
+  self->plan->exec (&(data[1]), 1., true);
   data[0]=data[1];
   data[1]=data[nph+1]=0.;
@@ -357,11 +360,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
     {
     if (self->norot)
       for (int m=0; m<=maxidx; ++m)
-        phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
+        phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
     else
       for (int m=0; m<=maxidx; ++m)
         phase[m*pstride] =
-          (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
+          dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
     }
   else
     {
@@ -370,9 +373,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
       int idx=m%nph;
       dcmplx val;
       if (idx<(nph-idx))
-        val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
+        val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
       else
-        val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
+        val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
       if (!self->norot)
         val *= self->shiftarr[m];
       phase[m*pstride]=val;
@@ -383,7 +386,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
       phase[m*pstride]=0.;
   }
 
-NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
+MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
   int flags)
   {
   if (flags & SHARP_NO_FFT)
@@ -440,7 +443,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
     }
   }
 
-NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
+MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
   int flags)
   {
 #define CLEARLOOP(real_t,body) \
@@ -477,7 +480,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
     }
   }
 
-NOINLINE static void init_output (sharp_job *job)
+MRUTIL_NOINLINE static void init_output (sharp_job *job)
   {
   if (job->flags&SHARP_ADD) return;
   if (job->type == SHARP_MAP2ALM)
@@ -488,7 +491,7 @@ NOINLINE static void init_output (sharp_job *job)
     clear_map (job->ginfo,job->map[i],job->flags);
   }
 
-NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
+MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
   {
   if (job->type==SHARP_MAP2ALM)
     {
@@ -514,7 +517,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
 static void dealloc_almtmp (sharp_job *job)
   { DEALLOC(job->almtmp); }
 
-NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
+MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
   {
 
 #define COPY_LOOP(real_t, source_t, expr_of_x) \
@@ -577,7 +580,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
         if (job->flags&SHARP_DP)
           COPY_LOOP(double, dcmplx, x*job->norm_l[l])
         else
-          COPY_LOOP(float, fcmplx, x*job->norm_l[l])
+          COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
         }
       }
   }
@@ -588,7 +591,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
   }
 
-NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
+MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
   {
 
 #define COPY_LOOP(real_t, target_t, expr_of_x) \
@@ -617,9 +620,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
     if (m==0)
       {
       if (job->flags&SHARP_DP)
-        COPY_LOOP(double, double, creal(x)*norm_m0)
+        COPY_LOOP(double, double, x.real()*norm_m0)
       else
-        COPY_LOOP(float, float, crealf(x)*norm_m0)
+        COPY_LOOP(float, float, x.real()*norm_m0)
       }
     else
       {
@@ -634,9 +637,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
     if (m==0)
       {
      if (job->flags&SHARP_DP)
-        COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
+        COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
      else
-        COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
+        COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
       }
     else
       {
@@ -650,7 +653,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
   }
 
-NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
+MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
   const double *ringtmp, int rstride)
   {
   if (job->flags & SHARP_DP)
@@ -658,8 +661,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
     double **dmap = (double **)job->map;
     for (int i=0; i<job->nmaps; ++i)
       {
-      double *restrict p1=&dmap[i][ri->ofs];
-      const double *restrict p2=&ringtmp[i*rstride+1];
+      double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
+      const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
       if (ri->stride==1)
         {
         if (job->flags&SHARP_ADD)
@@ -682,14 +685,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
     }
   }
 
-NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
+MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
   double *ringtmp, int rstride)
   {
   if (job->flags & SHARP_DP)
     for (int i=0; i<job->nmaps; ++i)
       {
-      double *restrict p1=&ringtmp[i*rstride+1],
-        *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
+      double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
+        *MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
       if (ri->stride==1)
         memcpy(p1,p2,ri->nph*sizeof(double));
       else
@@ -721,7 +724,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
     for (int m=0; m<=mmax; ++m)
       phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
         ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
-        ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
+        ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
     }
   }
 static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@@ -743,7 +746,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
   }
 
 //FIXME: set phase to zero if not SHARP_MAP2ALM?
-NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type != SHARP_MAP2ALM) return;
   int pstride = job->s_m;
@@ -760,35 +763,35 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
     }
   else
     {
-#pragma omp parallel
-{
-    ringhelper helper;
-    ringhelper_init(&helper);
-    int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->nmaps*rstride);
-#pragma omp for schedule(dynamic,1)
-    for (int ith=llim; ith<ulim; ++ith)
-      {
-      int dim2 = job->s_th*(ith-llim);
-      ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
-      for (int i=0; i<job->nmaps; ++i)
-        ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
-          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
-      if (job->ginfo->pair[ith].r2.nph>0)
-        {
-        ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
-        for (int i=0; i<job->nmaps; ++i)
-          ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
-            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
-        }
-      }
-    DEALLOC(ringtmp);
-    ringhelper_destroy(&helper);
-} /* end of parallel region */
+    mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
+      {
+      ringhelper helper;
+      ringhelper_init(&helper);
+      int rstride=job->ginfo->nphmax+2;
+      double *ringtmp=RALLOC(double,job->nmaps*rstride);
+
+      while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
+        {
+        int dim2 = job->s_th*(ith-llim);
+        ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
+        for (int i=0; i<job->nmaps; ++i)
+          ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
+            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
+        if (job->ginfo->pair[ith].r2.nph>0)
+          {
+          ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
+          for (int i=0; i<job->nmaps; ++i)
+            ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
+              &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
+          }
+        }
+      DEALLOC(ringtmp);
+      ringhelper_destroy(&helper);
+      }); /* end of parallel region */
     }
   }
 
-NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type == SHARP_MAP2ALM) return;
   int pstride = job->s_m;
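This hunk and the next replace OpenMP parallel regions with mr_util's dynamic scheduler. A minimal sketch of the idiom, with the execDynamic signature inferred from its use here (work count, thread count, chunk size, functor; the meaning of the 0 thread count is assumed to be "use all threads"):

    #include "mr_util/threading.h"

    void process_items(int nwork)
      {
      // Assumed semantics, inferred from the diff: execDynamic runs the
      // functor on a thread pool; each thread repeatedly asks its Scheduler
      // for the next chunk of indices until none remain.
      mr::execDynamic(nwork, 0, 1, [&](mr::Scheduler &sched)
        {
        // per-thread scratch state goes here (what the old
        // "#pragma omp parallel" blocks used to declare)
        while (auto rng = sched.getNext())      // next dynamic range [lo,hi)
          for (auto i = rng.lo; i < rng.hi; ++i)
            { /* work on item i */ }
        });
      }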
@@ -805,35 +808,35 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
     }
   else
     {
-#pragma omp parallel
-{
-    ringhelper helper;
-    ringhelper_init(&helper);
-    int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->nmaps*rstride);
-#pragma omp for schedule(dynamic,1)
-    for (int ith=llim; ith<ulim; ++ith)
-      {
-      int dim2 = job->s_th*(ith-llim);
-      for (int i=0; i<job->nmaps; ++i)
-        ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
-          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
-      ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
-      if (job->ginfo->pair[ith].r2.nph>0)
-        {
-        for (int i=0; i<job->nmaps; ++i)
-          ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
-            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
-        ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
-        }
-      }
-    DEALLOC(ringtmp);
-    ringhelper_destroy(&helper);
-} /* end of parallel region */
+    mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
+      {
+      ringhelper helper;
+      ringhelper_init(&helper);
+      int rstride=job->ginfo->nphmax+2;
+      double *ringtmp=RALLOC(double,job->nmaps*rstride);
+
+      while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
+        {
+        int dim2 = job->s_th*(ith-llim);
+        for (int i=0; i<job->nmaps; ++i)
+          ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
+            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
+        ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
+        if (job->ginfo->pair[ith].r2.nph>0)
+          {
+          for (int i=0; i<job->nmaps; ++i)
+            ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
+              &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
+          ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
+          }
+        }
+      DEALLOC(ringtmp);
+      ringhelper_destroy(&helper);
+      }); /* end of parallel region */
     }
   }
 
-NOINLINE static void sharp_execute_job (sharp_job *job)
+MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
   {
   double timer=sharp_wallTime();
   job->opcnt=0;
@@ -852,6 +855,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
     &nchunks,&chunksize);
 //FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
+  std::atomic<size_t> opcnt = 0;
 
 /* chunk loop */
   for (int chunk=0; chunk<nchunks; ++chunk)
@@ -871,32 +875,30 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
 /* map->phase where necessary */
     map2phase (job, mmax, llim, ulim);
 
-#pragma omp parallel
-{
-    sharp_job ljob = *job;
-    ljob.opcnt=0;
-    sharp_Ylmgen_C generator;
-    sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
-    alloc_almtmp(&ljob,lmax);
-
-#pragma omp for schedule(dynamic,1)
-    for (int mi=0; mi<job->ainfo->nm; ++mi)
-      {
-/* alm->alm_tmp where necessary */
-      alm2almtmp (&ljob, lmax, mi);
-
-      inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
-
-/* alm_tmp->alm where necessary */
-      almtmp2alm (&ljob, lmax, mi);
-      }
-
-    sharp_Ylmgen_destroy(&generator);
-    dealloc_almtmp(&ljob);
-
-#pragma omp critical
-    job->opcnt+=ljob.opcnt;
-} /* end of parallel region */
+    mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched)
+      {
+      sharp_job ljob = *job;
+      ljob.opcnt=0;
+      sharp_Ylmgen_C generator;
+      sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
+      alloc_almtmp(&ljob,lmax);
+
+      while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
+        {
+/* alm->alm_tmp where necessary */
+        alm2almtmp (&ljob, lmax, mi);
+
+        inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
+
+/* alm_tmp->alm where necessary */
+        almtmp2alm (&ljob, lmax, mi);
+        }
+
+      sharp_Ylmgen_destroy(&generator);
+      dealloc_almtmp(&ljob);
+
+      opcnt+=ljob.opcnt;
+      }); /* end of parallel region */
 
 /* phase->map where necessary */
     phase2map (job, mmax, llim, ulim);

@@ -909,6 +911,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
 
   DEALLOC(job->norm_l);
   dealloc_phase (job);
+  job->opcnt = opcnt;
   job->time=sharp_wallTime()-timer;
   }
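With the OpenMP critical section gone, per-thread operation counts are merged through the std::atomic accumulator declared two hunks above; in miniature:

    #include <atomic>
    #include <cstddef>

    std::atomic<size_t> total_ops{0};

    void worker_done(size_t local_ops)
      {
      // Atomic += replaces "#pragma omp critical" around a plain size_t:
      // safe to call concurrently from many scheduler threads.
      total_ops += local_ops;
      }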
@@ -934,8 +937,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
     job->flags|=SHARP_REAL_HARMONICS;
   job->time = 0.;
   job->opcnt = 0;
-  job->alm=alm;
-  job->map=map;
+  job->alm=(void **)alm;
+  job->map=(void **)map;
   }
 
 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
@@ -27,7 +27,7 @@
 
 #define ARCH default
 #define GENERIC_ARCH
-#include "libsharp2/sharp_core_inc.c"
+#include "libsharp2/sharp_core_inc.cc"
 #undef GENERIC_ARCH
 #undef ARCH
@@ -34,7 +34,7 @@
 #define XCONCATX2(a,b) XCONCATX(a,b)
 #define XARCH(a) XCONCATX2(a,ARCH)
 
-#include <complex.h>
+#include <complex>
 #include <math.h>
 #include <string.h>
 #include "libsharp2/sharp_vecsupport.h"
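The XARCH macro is what lets sharp_core_inc.cc be compiled once per instruction set (see the Makefile.am hunk above): each compilation pass gets a different -DARCH, and token pasting gives every kernel an arch-specific name. Schematically (the XCONCATX definition is assumed, since it is not shown in this hunk):

    #define XCONCATX(a,b) a##b            // assumed helper: direct token paste
    #define XCONCATX2(a,b) XCONCATX(a,b)  // extra level so ARCH gets expanded
    #define XARCH(a) XCONCATX2(a,ARCH)

    // Compiled with -DARCH=avx2, the declaration
    //   void XARCH(inner_loop)(void);
    // becomes
    //   void inner_loopavx2(void);
    // so five differently optimized copies can coexist in one binary.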
@@ -49,7 +49,7 @@
 // Unfortunately, most compilers don't act on this pragma yet.
 #pragma STDC FP_CONTRACT ON
 
-typedef complex double dcmplx;
+typedef complex<double> dcmplx;
 
 #define nv0 (128/VLEN)
 #define nvx (64/VLEN)
@@ -94,35 +94,35 @@ typedef union
   sxdata_s s;
   } sxdata_u;
 
-static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
+static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
   double maxval)
   {
-  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
-  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
-  Tm mask = vgt(vabs(*val),vfmax);
-  while (vanyTrue(mask))
+  const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
+  const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
+  auto mask = abs(*val)>vfmax;
+  while (any_of(mask))
     {
-    vmuleq_mask(mask,*val,vfsmall);
-    vaddeq_mask(mask,*scale,vone);
-    mask = vgt(vabs(*val),vfmax);
+    where(mask,*val)*=vfsmall;
+    where(mask,*scale)+=1;
+    mask = abs(*val)>vfmax;
     }
-  mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
-  while (vanyTrue(mask))
+  mask = (abs(*val)<vfmin) & (*val!=0);
+  while (any_of(mask))
     {
-    vmuleq_mask(mask,*val,vfbig);
-    vsubeq_mask(mask,*scale,vone);
-    mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
+    where(mask,*val)*=vfbig;
+    where(mask,*scale)-=1;
+    mask = (abs(*val)<vfmin) & (*val!=0);
    }
  }

-static void mypow(Tv val, int npow, const double * restrict powlimit,
-  Tv * restrict resd, Tv * restrict ress)
+static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
+  Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
   {
-  Tv vminv=vload(powlimit[npow]);
-  Tm mask = vlt(vabs(val),vminv);
-  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
+  Tv vminv=powlimit[npow];
+  auto mask = abs(val)<vminv;
+  if (none_of(mask)) // no underflows possible, use quick algoritm
     {
-    Tv res=vone;
+    Tv res=1;
     do
       {
       if (npow&1)
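The hunks in this file all follow the same pattern: the old vgt/vmuleq_mask/vanyTrue wrapper calls become operator-based expressions in the std::experimental::simd style (abs(v)>x yields a mask, where(mask, v) applies a masked update, and any_of/all_of/none_of reduce a mask). A toy 4-lane model of the idiom, with stand-in types rather than the library's real Tv/Tm:

    #include <array>
    #include <cmath>

    struct Tm { std::array<bool,4> m; };   // stand-in mask type
    struct Tv {                            // stand-in 4-lane vector
      std::array<double,4> v;
      struct MaskedRef {
        Tv &x; Tm m;
        void operator*=(double s) const
          { for (int i=0;i<4;++i) if (m.m[i]) x.v[i]*=s; }
        void operator+=(double s) const
          { for (int i=0;i<4;++i) if (m.m[i]) x.v[i]+=s; }
      };
    };
    inline Tv abs(const Tv &a)
      { Tv r{}; for (int i=0;i<4;++i) r.v[i]=std::fabs(a.v[i]); return r; }
    inline Tm operator>(const Tv &a, double t)
      { Tm r{}; for (int i=0;i<4;++i) r.m[i]=a.v[i]>t; return r; }
    inline Tv::MaskedRef where(Tm m, Tv &x) { return {x, m}; }
    inline bool any_of(const Tm &m)
      { for (bool b : m.m) if (b) return true; return false; }

    inline void demo(Tv &val, Tv &scale)   // mirrors the rescaling loops above
      {
      auto mask = abs(val) > 1e300;        // lanes where |val| is too large
      while (any_of(mask))
        {
        where(mask, val)   *= 1e-300;      // masked multiply, as in the new code
        where(mask, scale) += 1;
        mask = abs(val) > 1e300;
        }
      }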
@@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
       }
     while(npow>>=1);
     *resd=res;
-    *ress=vzero;
+    *ress=0;
     }
   else
     {
-    Tv scale=vzero, scaleint=vzero, res=vone;
+    Tv scale=0, scaleint=0, res=1;
     Tvnormalize(&val,&scaleint,sharp_fbighalf);
     do
       {
@@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
     }
   }
 
-static inline void getCorfac(Tv scale, Tv * restrict corfac,
-  const double * restrict cf)
+static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
+  const double * MRUTIL_RESTRICT cf)
   {
   typedef union
     { Tv v; double s[VLEN]; } Tvu;
@@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
   *corfac=corf.v;
   }
 
-static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
+static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
   {
-  Tm mask = vgt(vabs(*v2),eps);
-  if (vanyTrue(mask))
+  auto mask = abs(*v2)>eps;
+  if (any_of(mask))
     {
-    vmuleq_mask(mask,*v1,vload(sharp_fsmall));
-    vmuleq_mask(mask,*v2,vload(sharp_fsmall));
-    vaddeq_mask(mask,*s,vone);
-    return 1;
+    where(mask,*v1)*=sharp_fsmall;
+    where(mask,*v2)*=sharp_fsmall;
+    where(mask,*s)+=1;
+    return true;
     }
-  return 0;
+  return false;
   }
 
-NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
-  s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
+MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
+  s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
   {
   int l=gen->m, il=0;
-  Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tv limscale=vload(sharp_limscale);
+  Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
+  Tv limscale=sharp_limscale;
   int below_limit = 1;
   for (int i=0; i<nv2; ++i)
     {
-    d->lam1[i]=vzero;
+    d->lam1[i]=0;
     mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
     d->lam2[i] *= mfac;
     Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(d->scale[i],limscale));
+    below_limit &= all_of(d->scale[i]<limscale);
     }
 
   while (below_limit)
     {
     if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv a1=vload(gen->coef[il  ].a), b1=vload(gen->coef[il  ].b);
-    Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
+    Tv a1=gen->coef[il  ].a, b1=gen->coef[il  ].b;
+    Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b;
     for (int i=0; i<nv2; ++i)
       {
       d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
       d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
+        below_limit &= all_of(d->scale[i]<sharp_limscale);
       }
     l+=4; il+=2;
     }
   *l_=l; *il_=il;
   }
 
-NOINLINE static void alm2map_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
+MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
   int l, int il, int lmax, int nv2)
   {
   if (nv2==nv0)
     {
     for (; l<=lmax-2; il+=2, l+=4)
       {
-      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
-      Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
+      Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
+      Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
+      Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
+      Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
+      Tv a1=coef[il  ].a, b1=coef[il  ].b;
+      Tv a2=coef[il+1].a, b2=coef[il+1].b;
       for (int i=0; i<nv0; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -249,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     {
     for (; l<=lmax-2; il+=2, l+=4)
       {
-      Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-      Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-      Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
-      Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
-      Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
-      Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
+      Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
+      Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
+      Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
+      Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
+      Tv a1=coef[il  ].a, b1=coef[il  ].b;
+      Tv a2=coef[il+1].a, b2=coef[il+1].b;
       for (int i=0; i<nv2; ++i)
         {
         d->p1r[i] += d->lam2[i]*ar1;
@@ -272,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     }
   for (; l<=lmax; ++il, l+=2)
     {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il].a), b=vload(coef[il].b);
+    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
+    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
+    Tv a=coef[il].a, b=coef[il].b;
     for (int i=0; i<nv2; ++i)
       {
       d->p1r[i] += d->lam2[i]*ar1;
@@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
     }
   }
 
-NOINLINE static void calc_alm2map (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
+  const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
   {
   int l,il,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
@@ -298,20 +298,20 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
 
-  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
-  const dcmplx * restrict alm=job->almtmp;
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
+  const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
     {
     getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    full_ieee &= all_of(d->scale[i]>=sharp_minscale);
     }
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
-    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
-    Tv a=vload(coef[il].a), b=vload(coef[il].b);
+    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
+    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
+    Tv a=coef[il].a, b=coef[il].b;
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -322,9 +322,9 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
       Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
       d->lam1[i] = d->lam2[i];
       d->lam2[i] = tmp;
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+      full_ieee &= all_of(d->scale[i]>=sharp_minscale);
       }
     l+=2; ++il;
     }
@@ -338,16 +338,16 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
   alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
   }
 
-NOINLINE static void map2alm_kernel(s0data_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
+MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
   int il, int lmax, int nv2)
   {
   for (; l<=lmax-2; il+=2, l+=4)
     {
-    Tv a1=vload(coef[il  ].a), b1=vload(coef[il  ].b);
-    Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
-    Tv atmp1[4] = {vzero, vzero, vzero, vzero};
-    Tv atmp2[4] = {vzero, vzero, vzero, vzero};
+    Tv a1=coef[il  ].a, b1=coef[il  ].b;
+    Tv a2=coef[il+1].a, b2=coef[il+1].b;
+    Tv atmp1[4] = {0,0,0,0};
+    Tv atmp2[4] = {0,0,0,0};
     for (int i=0; i<nv2; ++i)
       {
       atmp1[0] += d->lam2[i]*d->p1r[i];
@@ -366,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
     }
   for (; l<=lmax; ++il, l+=2)
     {
-    Tv a=vload(coef[il].a), b=vload(coef[il].b);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    Tv a=coef[il].a, b=coef[il].b;
+    Tv atmp[4] = {0,0,0,0};
     for (int i=0; i<nv2; ++i)
       {
       atmp[0] += d->lam2[i]*d->p1r[i];
@@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
     }
   }
 
-NOINLINE static void calc_map2alm (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
+MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
+  const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
   {
   int l,il,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
@@ -392,19 +392,19 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 6*nth;
 
-  const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
-  dcmplx * restrict alm=job->almtmp;
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
+  dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
     {
     getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+    full_ieee &= all_of(d->scale[i]>=sharp_minscale);
     }
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv a=vload(coef[il].a), b=vload(coef[il].b);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    Tv a=coef[il].a, b=coef[il].b;
+    Tv atmp[4] = {0,0,0,0};
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -415,9 +415,9 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
       Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
       d->lam1[i] = d->lam2[i];
       d->lam2[i] = tmp;
-      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
         getCorfac(d->scale[i], &d->corfac[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
+      full_ieee &= all_of(d->scale[i]>=sharp_minscale);
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2; ++il;
@@ -432,21 +432,21 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
   map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
   }
 
-NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
-  sxdata_v * restrict d, int * restrict l_, int nv2)
+MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
+  sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
   {
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  Tv prefac=vload(gen->prefac[gen->m]),
-     prescale=vload(gen->fscale[gen->m]);
-  Tv limscale=vload(sharp_limscale);
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
+  Tv prefac=gen->prefac[gen->m],
+     prescale=gen->fscale[gen->m];
+  Tv limscale=sharp_limscale;
   int below_limit=1;
   for (int i=0; i<nv2; ++i)
     {
-    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
-    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
-    Tm mask=vlt(d->sth[i],vzero);
-    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
-    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
+    Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5));
+    Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5));
+    auto mask=d->sth[i]<0;
+    where(mask&(d->cth[i]<0),cth2)*=-1.;
+    where(mask&(d->cth[i]<0),sth2)*=-1.;
 
     Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
     mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
|
|||
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
|
||||
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
|
||||
|
||||
d->l1p[i] = vzero;
|
||||
d->l1m[i] = vzero;
|
||||
d->l1p[i] = 0;
|
||||
d->l1m[i] = 0;
|
||||
d->l2p[i] = prefac*ccp;
|
||||
d->scp[i] = prescale+ccps;
|
||||
d->l2m[i] = prefac*csp;
|
||||
|
@@ -467,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     d->l2m[i] *= scp;
     d->scm[i] += scps;
     if (gen->preMinus_p)
-      d->l2p[i] = vneg(d->l2p[i]);
+      d->l2p[i] = -d->l2p[i];
     if (gen->preMinus_m)
-      d->l2m[i] = vneg(d->l2m[i]);
+      d->l2m[i] = -d->l2m[i];
     if (gen->s&1)
-      d->l2p[i] = vneg(d->l2p[i]);
+      d->l2p[i] = -d->l2p[i];
 
     Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
     Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
 
-    below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
-                   vallTrue(vlt(d->scp[i],limscale));
+    below_limit &= all_of(d->scm[i]<limscale) &&
+                   all_of(d->scp[i]<limscale);
     }
 
   int l=gen->mhi;
@@ -486,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
     below_limit=1;
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
-          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
-        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
-                       vallTrue(vlt(d->scm[i],limscale));
+      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) ||
+          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol))
+        below_limit &= all_of(d->scp[i]<limscale) &&
+                       all_of(d->scm[i]<limscale);
       }
     l+=2;
     }
@@ -505,19 +505,19 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   *l_=l;
   }
 
-NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
+MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
   int l, int lmax, int nv2)
   {
   int lsave = l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
+       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
+    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
+       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@@ -537,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
+       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
+    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
+       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
     for (int i=0; i<nv2; ++i)
       {
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
     }
   }
 
-NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
+  const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
   {
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
@@ -571,25 +571,25 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 23*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  const dcmplx * restrict alm=job->almtmp;
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
+  const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
     {
     getCorfac(d->scp[i], &d->cfp[i], gen->cf);
     getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
-                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
+                 all_of(d->scm[i]>=sharp_minscale);
     }
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
-       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
-    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
-       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
+       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
+    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
+       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -611,12 +611,12 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
 
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
       d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+      full_ieee &= all_of(d->scp[i]>=sharp_minscale);
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      full_ieee &= all_of(d->scm[i]>=sharp_minscale);
       }
     l+=2;
     }
@@ -641,17 +641,17 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
     }
   }
 
-NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
+MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
   int l, int lmax, int nv2)
   {
   int lsave=l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=0, agi1=0, acr1=0, aci1=0;
+    Tv agr2=0, agi2=0, acr2=0, aci2=0;
     for (int i=0; i<nv2; ++i)
       {
       d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@@ -672,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
   l=lsave;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=0, agi1=0, acr1=0, aci1=0;
+    Tv agr2=0, agi2=0, acr2=0, aci2=0;
     for (int i=0; i<nv2; ++i)
       {
       d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
     }
   }
 
-NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
-  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
+  const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
   {
   int l,lmax=gen->lmax;
   int nv2 = (nth+VLEN-1)/VLEN;
@@ -705,15 +705,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 23*nth;
 
-  const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
-  dcmplx * restrict alm=job->almtmp;
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
+  dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
   int full_ieee=1;
   for (int i=0; i<nv2; ++i)
     {
     getCorfac(d->scp[i], &d->cfp[i], gen->cf);
     getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
-                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
+                 all_of(d->scm[i]>=sharp_minscale);
     }
   for (int i=0; i<nv2; ++i)
     {
@@ -726,10 +726,10 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
   while((!full_ieee) && (l<=lmax))
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
-    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv agr1=0, agi1=0, acr1=0, aci1=0;
+    Tv agr2=0, agi2=0, acr2=0, aci2=0;
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
@@ -748,12 +748,12 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
 
       d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
      d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
-      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
         getCorfac(d->scp[i], &d->cfp[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
-      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+      full_ieee &= all_of(d->scp[i]>=sharp_minscale);
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
         getCorfac(d->scm[i], &d->cfm[i], gen->cf);
-      full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+      full_ieee &= all_of(d->scm[i]>=sharp_minscale);
       }
     vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
     vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@@ -772,17 +772,17 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
   }
 
 
-NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
-  const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
+MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
+  const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
   int l, int lmax, int nv2)
   {
   int lsave=l;
   while (l<=lmax)
     {
-    Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
-    Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
-    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
-       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
+    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
+    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag(),
+       ar2=alm[l+1].real(), ai2=alm[l+1].imag();
     for (int i=0; i<nv2; ++i)
      {
      d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -798,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
|||
l=lsave;
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
|
||||
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||
|
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
|||
}
|
||||
}
|
||||
|
||||
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
||||
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
|
||||
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
|
||||
{
|
||||
int l,lmax=gen->lmax;
|
||||
int nv2 = (nth+VLEN-1)/VLEN;
|
||||
|
@ -826,23 +826,23 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 15*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
|
||||
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
|
||||
all_of(d->scm[i]>=sharp_minscale);
|
||||
}
|
||||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
|
||||
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
|
||||
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
|
||||
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
|
||||
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
|
||||
full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
|
@ -864,12 +864,12 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
|||
|
||||
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
||||
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
|
||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
|
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
|||
|
||||
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
|
||||
|
||||
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
||||
const double *cth_, const double *sth_, int llim, int ulim,
|
||||
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
||||
{
|
||||
|
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
if (job->spin==0)
|
||||
{
|
||||
//adjust the a_lm for the new algorithm
|
||||
dcmplx * restrict alm=job->almtmp;
|
||||
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
||||
{
|
||||
dcmplx al = alm[l];
|
||||
|
@ -963,8 +963,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
d.s.p2r[i]*=cth_[tgt];
|
||||
d.s.p2i[i]*=cth_[tgt];
|
||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
|
||||
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
|
||||
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
|
||||
r2(d.s.p2r[i], d.s.p2i[i]);
|
||||
job->phase[phas_idx] = r1+r2;
|
||||
if (ispair[tgt])
|
||||
job->phase[phas_idx+1] = r1-r2;
|
||||
|
@ -1027,10 +1027,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
{
|
||||
int tgt=itgt[i];
|
||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
|
||||
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
|
||||
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
|
||||
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
|
||||
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
|
||||
q2(d.s.p2pr[i], d.s.p2pi[i]),
|
||||
u1(d.s.p1mr[i], d.s.p1mi[i]),
|
||||
u2(d.s.p2mr[i], d.s.p2mi[i]);
|
||||
job->phase[phas_idx ] = q1+q2;
|
||||
job->phase[phas_idx+2] = u1+u2;
|
||||
if (ispair[tgt])
|
||||
|
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
}
|
||||
}
|
||||
|
||||
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||
const double *cth_, const double *sth_, int llim, int ulim,
|
||||
sharp_Ylmgen_C *gen, int mi, const int *mlim)
|
||||
{
|
||||
|
@ -1083,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
|||
int phas_idx = ith*job->s_th + mi*job->s_m;
|
||||
dcmplx ph1=job->phase[phas_idx];
|
||||
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
|
||||
d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
|
||||
d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
|
||||
d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag();
|
||||
d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag();
|
||||
//adjust for new algorithm
|
||||
d.s.p2r[nth]*=cth_[ith];
|
||||
d.s.p2i[nth]*=cth_[ith];
|
||||
|
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
|||
}
|
||||
}
|
||||
//adjust the a_lm for the new algorithm
|
||||
dcmplx * restrict alm=job->almtmp;
|
||||
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
|
||||
dcmplx alm2 = 0.;
|
||||
double alold=0;
|
||||
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
|
||||
|
@ -1138,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
|||
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
|
||||
if ((gen->mhi-gen->m+gen->s)&1)
|
||||
{ p2Q=-p2Q; p2U=-p2U; }
|
||||
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
|
||||
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
|
||||
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
|
||||
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
|
||||
d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag();
|
||||
d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag();
|
||||
d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag();
|
||||
d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag();
|
||||
++nth;
|
||||
}
|
||||
++ith;
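Throughout these inner loops the C pattern `re + im*_Complex_I` becomes direct `std::complex<double>` construction, and `creal`/`cimag` become the `.real()`/`.imag()` member accessors. A minimal, self-contained illustration of the idiom (the helper name and variables are ours, not from the diff):

#include <complex>
using dcmplx = std::complex<double>;

// Combine two ring contributions the way inner_loop_a2m stores its phases:
// C:   complex double r1 = p1r + p1i*_Complex_I, r2 = p2r + p2i*_Complex_I;
// C++: construct from the two parts, then use the same arithmetic.
void store_phase(dcmplx *phase, double p1r, double p1i, double p2r, double p2i)
  {
  dcmplx r1(p1r, p1i), r2(p2r, p2i);
  phase[0] = r1 + r2;   // symmetric combination
  phase[1] = r1 - r2;   // antisymmetric combination (used when the ring has a partner)
  }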
@@ -29,7 +29,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_legendre_roots.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/pocketfft.h"
#include "mr_util/fft.h"

void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info)

@@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
}
if ((nrings&1)==0) weight[nrings-1]=0.;
pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);

for (int m=0; m<(nrings+1)/2; ++m)
{

@@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
weight[n]=weight[0];

for (int m=0; m<(nrings+1)/2; ++m)

@@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1];
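All three quadrature-weight routines above drop pocketfft's C plan API (make plan, execute, delete) in favour of a single call into mr_util's FFT interface. `mr::r2r_fftpack` itself is not shown in this diff; its argument order matches the header-only C++ pocketfft, so a sketch against that library (an assumption on our part) looks like:

// Sketch assuming the header-only C++ pocketfft, whose r2r_fftpack takes the
// same (shape, stride_in, stride_out, axes, real2hermitian, forward, in, out,
// scale) arguments that mr::r2r_fftpack is called with above.
#include <cstddef>
#include "pocketfft_hdronly.h"

void backward_r2r(double *weight, int n)
  {
  pocketfft::shape_t shape{size_t(n)}, axes{0};
  pocketfft::stride_t stride{sizeof(double)};
  // real2hermitian=false, forward=false: an FFTPACK-style backward real
  // transform, in place, replacing make_plan_r/backward_r/delete_plan_r.
  pocketfft::r2r_fftpack(shape, stride, stride, axes, false, false,
                         weight, weight, 1.);
  }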
@@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H

#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif

#include <complex.h>
#include <complex>
#include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h"

using std::complex;

typedef struct
{
sharp_jobtype type;

@@ -45,9 +43,9 @@ typedef struct
void **map;
void **alm;
int s_m, s_th; // strides in m and theta direction
complex double *phase;
complex<double> *phase;
double *norm_l;
complex double *almtmp;
complex<double> *almtmp;
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
double time;
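The job struct's `phase` and `almtmp` buffers change type from C `complex double` to `std::complex<double>`. This is binary-compatible: C++ guarantees array-oriented access for `std::complex<double>` (two consecutive doubles, real then imaginary), which matches the C representation, so the buffers can still cross any remaining C-facing boundaries. A compile-time check of the kind one might add (our sketch, not part of the diff):

#include <complex>
#include <type_traits>

// For a std::complex<double> z, reinterpret_cast<double(&)[2]>(z)[0] is the
// real part and [1] the imaginary part, matching C's `double complex` layout.
static_assert(sizeof(std::complex<double>) == 2*sizeof(double),
              "unexpected padding in std::complex<double>");
static_assert(std::is_standard_layout<std::complex<double>>::value,
              "layout must stay C-compatible");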
@@ -122,10 +122,4 @@ double sharp_wallTime(void);
}
#endif

#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif

#endif
@@ -28,193 +28,27 @@
#ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H

#include <math.h>
#include <cmath>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;

#ifndef VLEN
#include "mr_util/useful_macros.h"

#if (defined(__AVX512F__))
#define VLEN 8
#elif (defined (__AVX__))
#define VLEN 4
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif

#endif

typedef double Ts;

#if (VLEN==1)

typedef double Tv;
typedef int Tm;
using Tv=native_simd<double>;
using Tm=Tv::mask_type;
using Ts=Tv::value_type;
static constexpr size_t VLEN=Tv::size();

#define vload(a) (a)
#define vzero 0.
#define vone 1.

#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vneg(a) (-(a))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) (a)
#define vallTrue(a) (a)

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }

#endif

#if (VLEN==2)

#include <emmintrin.h>

#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128d Tv;
typedef __m128d Tm;

#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vload(a) _mm_set1_pd(a)
#define vzero _mm_setzero_pd()
#define vone vload(1.)

#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm_xor_pd(vload(-0.),a)
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
complex<double> * MRUTIL_RESTRICT cc)
{
union {Tv v; _Complex double c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
#endif
cc[0]+=u1.c; cc[1]+=u2.c;
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));
}

#endif

#if (VLEN==4)

#include <immintrin.h>

typedef __m256d Tv;
typedef __m256d Tm;

#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vload(a) _mm256_set1_pd(a)
#define vzero _mm256_setzero_pd()
#define vone vload(1.)

#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}

#endif

#if (VLEN==8)

#include <immintrin.h>

typedef __m512d Tv;
typedef __mmask8 Tm;

#define vload(a) _mm512_set1_pd(a)
#define vzero _mm512_setzero_pd()
#define vone vload(1.)

#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
}

#endif

#endif
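The net effect of this header rewrite: roughly 190 lines of per-ISA intrinsics and macros collapse into the `std::experimental::simd` types from the Parallelism TS v2 (shipped in GCC's libstdc++). Broadcasting replaces `vload`, ordinary operators replace `vge`, `vneg` and friends, `all_of`/`any_of` replace `vallTrue`/`vanyTrue`, and `reduce` replaces the hand-rolled horizontal sums. A self-contained sketch of the idiom (helper names are ours):

#include <complex>
#include <experimental/simd>
namespace stdx = std::experimental;

using Tv = stdx::native_simd<double>;   // widest double vector the target offers
using Tm = Tv::mask_type;

// vallTrue(vge(v, vload(s)))  becomes  all_of(v >= s)
bool above_threshold(Tv v, double s)
  { return stdx::all_of(v >= s); }      // the scalar s broadcasts implicitly

// The reduce-based replacement for the SSE/AVX vhsum_cmplx_special kernels:
// horizontally sum two vectors and accumulate them as one complex number.
void hsum_cmplx(Tv re, Tv im, std::complex<double> &acc)
  { acc += std::complex<double>(stdx::reduce(re), stdx::reduce(im)); }

One ISA-specific code path, selected at compile time by hand, becomes a single generic path whose width follows the compilation flags.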
@@ -26,7 +26,8 @@
#include <stdio.h>
#include <string.h>
#include <complex.h>
#include <complex>
using std::complex;
#ifdef USE_MPI
#include "mpi.h"
#include "libsharp2/sharp_mpi.h"

@@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c"
#include "libsharp2/sharp_utils.cc"
#include "test/memusage.h"

static void OpenMP_status(void)

@@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1);
}

typedef complex double dcmplx;
typedef complex<double> dcmplx;

int ntasks, mytask;

@@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{
double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
}
}
}

@@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j)
{
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
sqsum[i]+=norm(alm[i][j]);
alm[i][j]=-alm[i][j];
}
}

@@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j)
{
double sqr=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
double sqr=norm(alm[i][j]);
sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr;
}

@@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
alm[i][j]=dcmplx(1.,1.);

sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);
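In get_sqsum_and_invert and get_errors the explicit `creal(x)*creal(x)+cimag(x)*cimag(x)` sums are replaced by `std::norm`, which returns exactly that squared magnitude (not the Euclidean norm, which is `std::abs`). For instance:

#include <cassert>
#include <complex>

int main()
  {
  std::complex<double> z(3., 4.);
  // std::norm is |z|^2, i.e. re*re + im*im (25 here, not 5).
  assert(std::norm(z) == z.real()*z.real() + z.imag()*z.imag());
  return 0;
  }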