Compare commits

...
Sign in to create a new pull request.

6 commits
master ... cpp

Author SHA1 Message Date
Martin Reinecke
b27ce30cde switch to new FFT 2020-01-04 12:59:59 +01:00
Martin Reinecke
f8a9c96acf use MRUTIL macros 2020-01-03 22:55:52 +01:00
Martin Reinecke
75559e6894 vectorization cleanup 2020-01-03 22:47:49 +01:00
Martin Reinecke
54cbae0534 multithreading demonstrated to work, but not safe yet 2020-01-03 18:26:53 +01:00
Martin Reinecke
0f8051fc28 vectors work with experimental::simd 2020-01-03 18:01:34 +01:00
Martin Reinecke
0378ce155a runs as C++, no vector support yet 2020-01-03 17:22:31 +01:00
19 changed files with 382 additions and 2817 deletions

View file

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line. configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system. (not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not (Usually this is already the default setting when OMP_NUM_THREADS is not
specified.) specified.)

View file

@ -3,16 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \ libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \ libsharp2/sharp_utils.cc \
libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \
libsharp2/sharp_utils.h \ libsharp2/sharp_utils.h \
libsharp2/sharp.c \ libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.c \ libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.c \ libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.c \ libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.c \ libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.c \ libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \ libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \ libsharp2/sharp_vecsupport.h \
@ -24,25 +22,25 @@ libsharp2_la_SOURCES = \
# any backward-compatible change: increase age # any backward-compatible change: increase age
# any backward-incompatible change: age=0 # any backward-incompatible change: age=0
# ==> age <= current # ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0 libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread
AM_CFLAGS = @AM_CFLAGS@ AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.c libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +54,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h libsharp2/sharp_cxx.h
EXTRA_DIST = \ EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh TESTS = runtest.sh

View file

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99 AC_PROG_CXX
AC_OPENMP AX_CXX_COMPILE_STDCXX([11])
# add math library
LIBS="-lm"
AC_PROG_LIBTOOL AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2" PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl dnl
dnl Create pkgconfig .pc file. dnl Create pkgconfig .pc file.
dnl dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[]) AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS]) AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile]) AC_CONFIG_FILES([Makefile])
AC_OUTPUT AC_OUTPUT

File diff suppressed because it is too large Load diff

View file

@ -1,50 +0,0 @@
/*
* This file is part of pocketfft.
* Licensed under a 3-clause BSD style license - see LICENSE.md
*/
/*! \file pocketfft.h
* Public interface of the pocketfft library
*
* Copyright (C) 2008-2019 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef POCKETFFT_H
#define POCKETFFT_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
struct pocketfft_plan_c_i;
typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
pocketfft_plan_c pocketfft_make_plan_c (size_t length);
void pocketfft_delete_plan_c (pocketfft_plan_c plan);
int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
size_t pocketfft_length_c(pocketfft_plan_c plan);
struct pocketfft_plan_r_i;
typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
pocketfft_plan_r pocketfft_make_plan_r (size_t length);
void pocketfft_delete_plan_r (pocketfft_plan_r plan);
/*! Computes a real backward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
- on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
/*! Computes a real forward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
- on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
size_t pocketfft_length_r(pocketfft_plan_r plan);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -25,17 +25,20 @@
* \author Martin Reinecke \author Dag Sverre Seljebotn * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#include <math.h> #include <cmath>
#include <string.h> #include <cstring>
#include "libsharp2/pocketfft.h" #include <atomic>
#include "mr_util/fft.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
#include "libsharp2/sharp_internal.h" #include "libsharp2/sharp_internal.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "mr_util/threading.h"
#include "mr_util/useful_macros.h"
typedef complex double dcmplx; typedef complex<double> dcmplx;
typedef complex float fcmplx; typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739; static const double sqrt_two = 1.414213562373095145474621858739;
@ -57,7 +60,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize); *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
} }
NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{ {
double ofs=lmax*0.01; double ofs=lmax*0.01;
if (ofs<100.) ofs=100.; if (ofs<100.) ofs=100.;
@ -76,7 +79,7 @@ typedef struct
double phi0_; double phi0_;
dcmplx *shiftarr; dcmplx *shiftarr;
int s_shift; int s_shift;
pocketfft_plan_r plan; mr::detail_fft::rfftp<double> *plan;
int length; int length;
int norot; int norot;
} ringhelper; } ringhelper;
@ -89,12 +92,12 @@ static void ringhelper_init (ringhelper *self)
static void ringhelper_destroy (ringhelper *self) static void ringhelper_destroy (ringhelper *self)
{ {
if (self->plan) pocketfft_delete_plan_r(self->plan); delete self->plan;
DEALLOC(self->shiftarr); DEALLOC(self->shiftarr);
ringhelper_init(self); ringhelper_init(self);
} }
NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{ {
self->norot = (fabs(phi0)<1e-14); self->norot = (fabs(phi0)<1e-14);
if (!(self->norot)) if (!(self->norot))
@ -105,27 +108,27 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0; self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc. // FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr; // double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
} }
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph); // if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
if (nph!=(int)self->length) if (nph!=(int)self->length)
{ {
if (self->plan) pocketfft_delete_plan_r(self->plan); if (self->plan) delete self->plan;
self->plan=pocketfft_make_plan_r(nph); self->plan=new mr::detail_fft::rfftp<double>(nph);
self->length=nph; self->length=nph;
} }
} }
static int ringinfo_compare (const void *xa, const void *xb) static int ringinfo_compare (const void *xa, const void *xb)
{ {
const sharp_ringinfo *a=xa, *b=xb; const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
} }
static int ringpair_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb)
{ {
const sharp_ringpair *a=xa, *b=xb; const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph) if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -275,7 +278,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1; return nm-1;
} }
NOINLINE static void ringhelper_phase2ring (ringhelper *self, MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -292,22 +295,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot) if (self->norot)
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
data[2*m]=creal(phase[m*pstride])*wgt; data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt; data[2*m+1]=phase[m*pstride].imag()*wgt;
} }
else else
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt; data[2*m]=tmp.real()*wgt;
data[2*m+1]=cimag(tmp)*wgt; data[2*m+1]=tmp.imag()*wgt;
} }
for (int m=2*(mmax+1); m<nph+2; ++m) for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.; data[m]=0.;
} }
else else
{ {
data[0]=creal(phase[0])*wgt; data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.); SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1; int idx1=1, idx2=nph-1;
@ -317,23 +320,23 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m]; if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2) if (idx1<(nph+2)/2)
{ {
data[2*idx1]+=creal(tmp); data[2*idx1]+=tmp.real();
data[2*idx1+1]+=cimag(tmp); data[2*idx1+1]+=tmp.imag();
} }
if (idx2<(nph+2)/2) if (idx2<(nph+2)/2)
{ {
data[2*idx2]+=creal(tmp); data[2*idx2]+=tmp.real();
data[2*idx2+1]-=cimag(tmp); data[2*idx2+1]-=tmp.imag();
} }
if (++idx1>=nph) idx1=0; if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1; if (--idx2<0) idx2=nph-1;
} }
} }
data[1]=data[0]; data[1]=data[0];
pocketfft_backward_r (self->plan, &(data[1]), 1.); self->plan->exec(&(data[1]), 1., false);
} }
NOINLINE static void ringhelper_ring2phase (ringhelper *self, MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -349,7 +352,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
if (flags&SHARP_REAL_HARMONICS) if (flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_two; wgt *= sqrt_two;
pocketfft_forward_r (self->plan, &(data[1]), 1.); self->plan->exec (&(data[1]), 1., true);
data[0]=data[1]; data[0]=data[1];
data[1]=data[nph+1]=0.; data[1]=data[nph+1]=0.;
@ -357,11 +360,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{ {
if (self->norot) if (self->norot)
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else else
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
} }
else else
{ {
@ -370,9 +373,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph; int idx=m%nph;
dcmplx val; dcmplx val;
if (idx<(nph-idx)) if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot) if (!self->norot)
val *= self->shiftarr[m]; val *= self->shiftarr[m];
phase[m*pstride]=val; phase[m*pstride]=val;
@ -383,7 +386,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.; phase[m*pstride]=0.;
} }
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags) int flags)
{ {
if (flags & SHARP_NO_FFT) if (flags & SHARP_NO_FFT)
@ -440,7 +443,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
} }
} }
NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags) int flags)
{ {
#define CLEARLOOP(real_t,body) \ #define CLEARLOOP(real_t,body) \
@ -477,7 +480,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
} }
} }
NOINLINE static void init_output (sharp_job *job) MRUTIL_NOINLINE static void init_output (sharp_job *job)
{ {
if (job->flags&SHARP_ADD) return; if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM) if (job->type == SHARP_MAP2ALM)
@ -488,7 +491,7 @@ NOINLINE static void init_output (sharp_job *job)
clear_map (job->ginfo,job->map[i],job->flags); clear_map (job->ginfo,job->map[i],job->flags);
} }
NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{ {
if (job->type==SHARP_MAP2ALM) if (job->type==SHARP_MAP2ALM)
{ {
@ -514,7 +517,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job) static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); } { DEALLOC(job->almtmp); }
NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, source_t, expr_of_x) \ #define COPY_LOOP(real_t, source_t, expr_of_x) \
@ -577,7 +580,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l]) COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else else
COPY_LOOP(float, fcmplx, x*job->norm_l[l]) COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
} }
} }
} }
@ -588,7 +591,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP #undef COPY_LOOP
} }
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, target_t, expr_of_x) \ #define COPY_LOOP(real_t, target_t, expr_of_x) \
@ -617,9 +620,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0) COPY_LOOP(double, double, x.real()*norm_m0)
else else
COPY_LOOP(float, float, crealf(x)*norm_m0) COPY_LOOP(float, float, x.real()*norm_m0)
} }
else else
{ {
@ -634,9 +637,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
} }
else else
{ {
@ -650,7 +653,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP #undef COPY_LOOP
} }
NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
const double *ringtmp, int rstride) const double *ringtmp, int rstride)
{ {
if (job->flags & SHARP_DP) if (job->flags & SHARP_DP)
@ -658,8 +661,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
double **dmap = (double **)job->map; double **dmap = (double **)job->map;
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
{ {
double *restrict p1=&dmap[i][ri->ofs]; double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1]; const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
if (ri->stride==1) if (ri->stride==1)
{ {
if (job->flags&SHARP_ADD) if (job->flags&SHARP_ADD)
@ -682,14 +685,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
} }
} }
NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride) double *ringtmp, int rstride)
{ {
if (job->flags & SHARP_DP) if (job->flags & SHARP_DP)
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
{ {
double *restrict p1=&ringtmp[i*rstride+1], double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]); *MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1) if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double)); memcpy(p1,p2,ri->nph*sizeof(double));
else else
@ -721,7 +724,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
} }
} }
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -743,7 +746,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
} }
//FIXME: set phase to zero if not SHARP_MAP2ALM? //FIXME: set phase to zero if not SHARP_MAP2ALM?
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type != SHARP_MAP2ALM) return; if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -760,35 +763,35 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
} }
else else
{ {
#pragma omp parallel mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{ {
int dim2 = job->s_th*(ith-llim); ringhelper helper;
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); ringhelper_init(&helper);
for (int i=0; i<job->nmaps; ++i) int rstride=job->ginfo->nphmax+2;
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), double *ringtmp=RALLOC(double,job->nmaps*rstride);
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0) while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{ {
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
}
} }
} DEALLOC(ringtmp);
DEALLOC(ringtmp); ringhelper_destroy(&helper);
ringhelper_destroy(&helper); }); /* end of parallel region */
} /* end of parallel region */
} }
} }
NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type == SHARP_MAP2ALM) return; if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -805,35 +808,35 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
} }
else else
{ {
#pragma omp parallel mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{ {
int dim2 = job->s_th*(ith-llim); ringhelper helper;
for (int i=0; i<job->nmaps; ++i) ringhelper_init(&helper);
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), int rstride=job->ginfo->nphmax+2;
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); double *ringtmp=RALLOC(double,job->nmaps*rstride);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0) while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{ {
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
{
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
}
} }
} DEALLOC(ringtmp);
DEALLOC(ringtmp); ringhelper_destroy(&helper);
ringhelper_destroy(&helper); }); /* end of parallel region */
} /* end of parallel region */
} }
} }
NOINLINE static void sharp_execute_job (sharp_job *job) MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
{ {
double timer=sharp_wallTime(); double timer=sharp_wallTime();
job->opcnt=0; job->opcnt=0;
@ -852,6 +855,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
&nchunks,&chunksize); &nchunks,&chunksize);
//FIXME: needs to be changed to "nm" //FIXME: needs to be changed to "nm"
alloc_phase (job,mmax+1,chunksize); alloc_phase (job,mmax+1,chunksize);
std::atomic<size_t> opcnt = 0;
/* chunk loop */ /* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk) for (int chunk=0; chunk<nchunks; ++chunk)
@ -871,32 +875,30 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
/* map->phase where necessary */ /* map->phase where necessary */
map2phase (job, mmax, llim, ulim); map2phase (job, mmax, llim, ulim);
#pragma omp parallel mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched)
{
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{ {
/* alm->alm_tmp where necessary */ sharp_job ljob = *job;
alm2almtmp (&ljob, lmax, mi); ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
/* alm_tmp->alm where necessary */ /* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi); almtmp2alm (&ljob, lmax, mi);
} }
sharp_Ylmgen_destroy(&generator); sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob); dealloc_almtmp(&ljob);
#pragma omp critical opcnt+=ljob.opcnt;
job->opcnt+=ljob.opcnt; }); /* end of parallel region */
} /* end of parallel region */
/* phase->map where necessary */ /* phase->map where necessary */
phase2map (job, mmax, llim, ulim); phase2map (job, mmax, llim, ulim);
@ -909,6 +911,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
DEALLOC(job->norm_l); DEALLOC(job->norm_l);
dealloc_phase (job); dealloc_phase (job);
job->opcnt = opcnt;
job->time=sharp_wallTime()-timer; job->time=sharp_wallTime()-timer;
} }
@ -934,8 +937,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS; job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.; job->time = 0.;
job->opcnt = 0; job->opcnt = 0;
job->alm=alm; job->alm=(void **)alm;
job->map=map; job->map=(void **)map;
} }
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

View file

@ -27,7 +27,7 @@
#define ARCH default #define ARCH default
#define GENERIC_ARCH #define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c" #include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH #undef GENERIC_ARCH
#undef ARCH #undef ARCH

View file

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b) #define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH) #define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h> #include <complex>
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include "libsharp2/sharp_vecsupport.h" #include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,7 @@
// Unfortunately, most compilers don't act on this pragma yet. // Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON #pragma STDC FP_CONTRACT ON
typedef complex double dcmplx; typedef complex<double> dcmplx;
#define nv0 (128/VLEN) #define nv0 (128/VLEN)
#define nvx (64/VLEN) #define nvx (64/VLEN)
@ -94,35 +94,35 @@ typedef union
sxdata_s s; sxdata_s s;
} sxdata_u; } sxdata_u;
static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
double maxval) double maxval)
{ {
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
Tm mask = vgt(vabs(*val),vfmax); auto mask = abs(*val)>vfmax;
while (vanyTrue(mask)) while (any_of(mask))
{ {
vmuleq_mask(mask,*val,vfsmall); where(mask,*val)*=vfsmall;
vaddeq_mask(mask,*scale,vone); where(mask,*scale)+=1;
mask = vgt(vabs(*val),vfmax); mask = abs(*val)>vfmax;
} }
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); mask = (abs(*val)<vfmin) & (*val!=0);
while (vanyTrue(mask)) while (any_of(mask))
{ {
vmuleq_mask(mask,*val,vfbig); where(mask,*val)*=vfbig;
vsubeq_mask(mask,*scale,vone); where(mask,*scale)-=1;
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); mask = (abs(*val)<vfmin) & (*val!=0);
} }
} }
static void mypow(Tv val, int npow, const double * restrict powlimit, static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
Tv * restrict resd, Tv * restrict ress) Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
{ {
Tv vminv=vload(powlimit[npow]); Tv vminv=powlimit[npow];
Tm mask = vlt(vabs(val),vminv); auto mask = abs(val)<vminv;
if (!vanyTrue(mask)) // no underflows possible, use quick algoritm if (none_of(mask)) // no underflows possible, use quick algoritm
{ {
Tv res=vone; Tv res=1;
do do
{ {
if (npow&1) if (npow&1)
@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
} }
while(npow>>=1); while(npow>>=1);
*resd=res; *resd=res;
*ress=vzero; *ress=0;
} }
else else
{ {
Tv scale=vzero, scaleint=vzero, res=vone; Tv scale=0, scaleint=0, res=1;
Tvnormalize(&val,&scaleint,sharp_fbighalf); Tvnormalize(&val,&scaleint,sharp_fbighalf);
do do
{ {
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
} }
} }
static inline void getCorfac(Tv scale, Tv * restrict corfac, static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
const double * restrict cf) const double * MRUTIL_RESTRICT cf)
{ {
typedef union typedef union
{ Tv v; double s[VLEN]; } Tvu; { Tv v; double s[VLEN]; } Tvu;
@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
*corfac=corf.v; *corfac=corf.v;
} }
static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps) static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
{ {
Tm mask = vgt(vabs(*v2),eps); auto mask = abs(*v2)>eps;
if (vanyTrue(mask)) if (any_of(mask))
{ {
vmuleq_mask(mask,*v1,vload(sharp_fsmall)); where(mask,*v1)*=sharp_fsmall;
vmuleq_mask(mask,*v2,vload(sharp_fsmall)); where(mask,*v2)*=sharp_fsmall;
vaddeq_mask(mask,*s,vone); where(mask,*s)+=1;
return 1; return true;
} }
return 0; return false;
} }
NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2) s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
{ {
int l=gen->m, il=0; int l=gen->m, il=0;
Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
Tv limscale=vload(sharp_limscale); Tv limscale=sharp_limscale;
int below_limit = 1; int below_limit = 1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->lam1[i]=vzero; d->lam1[i]=0;
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
d->lam2[i] *= mfac; d->lam2[i] *= mfac;
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scale[i],limscale)); below_limit &= all_of(d->scale[i]<limscale);
} }
while (below_limit) while (below_limit)
{ {
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1; below_limit=1;
Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b); Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b;
Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b); Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); below_limit &= all_of(d->scale[i]<sharp_limscale);
} }
l+=4; il+=2; l+=4; il+=2;
} }
*l_=l; *il_=il; *l_=l; *il_=il;
} }
NOINLINE static void alm2map_kernel(s0data_v * restrict d, MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
int l, int il, int lmax, int nv2) int l, int il, int lmax, int nv2)
{ {
if (nv2==nv0) if (nv2==nv0)
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv0; ++i) for (int i=0; i<nv0; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -249,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -272,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
} }
for (; l<=lmax; ++il, l+=2) for (; l<=lmax; ++il, l+=2)
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=vload(coef[il].a), b=vload(coef[il].b); Tv a=coef[il].a, b=coef[il].b;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
} }
} }
NOINLINE static void calc_alm2map (sharp_job * restrict job, MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{ {
int l,il,lmax=gen->lmax; int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -298,20 +298,20 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth; job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
const dcmplx * restrict alm=job->almtmp; const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); full_ieee &= all_of(d->scale[i]>=sharp_minscale);
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=vload(coef[il].a), b=vload(coef[il].b); Tv a=coef[il].a, b=coef[il].b;
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -322,9 +322,9 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i]; d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp; d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); full_ieee &= all_of(d->scale[i]>=sharp_minscale);
} }
l+=2; ++il; l+=2; ++il;
} }
@ -338,16 +338,16 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
alm2map_kernel(d, coef, alm, l, il, lmax, nv2); alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
} }
NOINLINE static void map2alm_kernel(s0data_v * restrict d, MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l, const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
int il, int lmax, int nv2) int il, int lmax, int nv2)
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); Tv a2=coef[il+1].a, b2=coef[il+1].b;
Tv atmp1[4] = {vzero, vzero, vzero, vzero}; Tv atmp1[4] = {0,0,0,0};
Tv atmp2[4] = {vzero, vzero, vzero, vzero}; Tv atmp2[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
atmp1[0] += d->lam2[i]*d->p1r[i]; atmp1[0] += d->lam2[i]*d->p1r[i];
@ -366,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
} }
for (; l<=lmax; ++il, l+=2) for (; l<=lmax; ++il, l+=2)
{ {
Tv a=vload(coef[il].a), b=vload(coef[il].b); Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {vzero, vzero, vzero, vzero}; Tv atmp[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
atmp[0] += d->lam2[i]*d->p1r[i]; atmp[0] += d->lam2[i]*d->p1r[i];
@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
} }
} }
NOINLINE static void calc_map2alm (sharp_job * restrict job, MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{ {
int l,il,lmax=gen->lmax; int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -392,19 +392,19 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth; job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
dcmplx * restrict alm=job->almtmp; dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); full_ieee &= all_of(d->scale[i]>=sharp_minscale);
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv a=vload(coef[il].a), b=vload(coef[il].b); Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {vzero, vzero, vzero, vzero}; Tv atmp[4] = {0,0,0,0};
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -415,9 +415,9 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i]; d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp; d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); full_ieee &= all_of(d->scale[i]>=sharp_minscale);
} }
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
l+=2; ++il; l+=2; ++il;
@ -432,21 +432,21 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
map2alm_kernel(d, coef, alm, l, il, lmax, nv2); map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
} }
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
sxdata_v * restrict d, int * restrict l_, int nv2) sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
{ {
const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
Tv prefac=vload(gen->prefac[gen->m]), Tv prefac=gen->prefac[gen->m],
prescale=vload(gen->fscale[gen->m]); prescale=gen->fscale[gen->m];
Tv limscale=vload(sharp_limscale); Tv limscale=sharp_limscale;
int below_limit=1; int below_limit=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5))); Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5));
Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5));
Tm mask=vlt(d->sth[i],vzero); auto mask=d->sth[i]<0;
vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); where(mask&(d->cth[i]<0),cth2)*=-1.;
vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); where(mask&(d->cth[i]<0),sth2)*=-1.;
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
@ -454,8 +454,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
d->l1p[i] = vzero; d->l1p[i] = 0;
d->l1m[i] = vzero; d->l1m[i] = 0;
d->l2p[i] = prefac*ccp; d->l2p[i] = prefac*ccp;
d->scp[i] = prescale+ccps; d->scp[i] = prescale+ccps;
d->l2m[i] = prefac*csp; d->l2m[i] = prefac*csp;
@ -467,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
d->l2m[i] *= scp; d->l2m[i] *= scp;
d->scm[i] += scps; d->scm[i] += scps;
if (gen->preMinus_p) if (gen->preMinus_p)
d->l2p[i] = vneg(d->l2p[i]); d->l2p[i] = -d->l2p[i];
if (gen->preMinus_m) if (gen->preMinus_m)
d->l2m[i] = vneg(d->l2m[i]); d->l2m[i] = -d->l2m[i];
if (gen->s&1) if (gen->s&1)
d->l2p[i] = vneg(d->l2p[i]); d->l2p[i] = -d->l2p[i];
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scm[i],limscale)) && below_limit &= all_of(d->scm[i]<limscale) &&
vallTrue(vlt(d->scp[i],limscale)); all_of(d->scp[i]<limscale);
} }
int l=gen->mhi; int l=gen->mhi;
@ -486,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
{ {
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1; below_limit=1;
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol))
below_limit &= vallTrue(vlt(d->scp[i],limscale)) && below_limit &= all_of(d->scp[i]<limscale) &&
vallTrue(vlt(d->scm[i],limscale)); all_of(d->scm[i]<limscale);
} }
l+=2; l+=2;
} }
@ -505,19 +505,19 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
*l_=l; *l_=l;
} }
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave = l; int lsave = l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -537,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
} }
} }
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -571,25 +571,25 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth; job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * restrict alm=job->almtmp; const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale))); all_of(d->scm[i]>=sharp_minscale);
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -611,12 +611,12 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); full_ieee &= all_of(d->scm[i]>=sharp_minscale);
} }
l+=2; l+=2;
} }
@ -641,17 +641,17 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
} }
} }
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm, const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave=l; int lsave=l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -672,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
} }
} }
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -705,15 +705,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth; job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
dcmplx * restrict alm=job->almtmp; dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale))); all_of(d->scm[i]>=sharp_minscale);
} }
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -726,10 +726,10 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; Tv agr2=0, agi2=0, acr2=0, aci2=0;
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -748,12 +748,12 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); full_ieee &= all_of(d->scm[i]>=sharp_minscale);
} }
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@ -772,17 +772,17 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
} }
NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave=l; int lsave=l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -798,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
} }
} }
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -826,23 +826,23 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 15*nth; job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * restrict alm=job->almtmp; const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale))); all_of(d->scm[i]>=sharp_minscale);
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); ar2=alm[l+1].real(), ai2=alm[l+1].imag();
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -864,12 +864,12 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); full_ieee &= all_of(d->scm[i]>=sharp_minscale);
} }
l+=2; l+=2;
} }
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim) sharp_Ylmgen_C *gen, int mi, const int *mlim)
{ {
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
if (job->spin==0) if (job->spin==0)
{ {
//adjust the a_lm for the new algorithm //adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp; dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
{ {
dcmplx al = alm[l]; dcmplx al = alm[l];
@ -963,8 +963,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt]; d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt]; d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2; job->phase[phas_idx] = r1+r2;
if (ispair[tgt]) if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2; job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1027,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{ {
int tgt=itgt[i]; int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, q2(d.s.p2pr[i], d.s.p2pi[i]),
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, u1(d.s.p1mr[i], d.s.p1mi[i]),
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2; job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2; job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt]) if (ispair[tgt])
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
} }
} }
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim) sharp_Ylmgen_C *gen, int mi, const int *mlim)
{ {
@ -1083,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
int phas_idx = ith*job->s_th + mi*job->s_m; int phas_idx = ith*job->s_th + mi*job->s_m;
dcmplx ph1=job->phase[phas_idx]; dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag();
d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag();
//adjust for new algorithm //adjust for new algorithm
d.s.p2r[nth]*=cth_[ith]; d.s.p2r[nth]*=cth_[ith];
d.s.p2i[nth]*=cth_[ith]; d.s.p2i[nth]*=cth_[ith];
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
} }
} }
//adjust the a_lm for the new algorithm //adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp; dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
dcmplx alm2 = 0.; dcmplx alm2 = 0.;
double alold=0; double alold=0;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
@ -1138,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1) if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; } { p2Q=-p2Q; p2U=-p2U; }
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag();
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag();
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag();
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag();
++nth; ++nth;
} }
++ith; ++ith;

View file

@ -29,7 +29,7 @@
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_legendre_roots.h" #include "libsharp2/sharp_legendre_roots.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/pocketfft.h" #include "mr_util/fft.h"
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info) const int *rings, const double *weight, sharp_geom_info **geom_info)
@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings); weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
} }
if ((nrings&1)==0) weight[nrings-1]=0.; if ((nrings&1)==0) weight[nrings-1]=0.;
pocketfft_plan_r plan = pocketfft_make_plan_r(nrings); mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
for (int m=0; m<(nrings+1)/2; ++m) for (int m=0; m<(nrings+1)/2; ++m)
{ {
@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k) for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw; weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1); weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
pocketfft_plan_r plan = pocketfft_make_plan_r(n); mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
weight[n]=weight[0]; weight[n]=weight[0];
for (int m=0; m<(nrings+1)/2; ++m) for (int m=0; m<(nrings+1)/2; ++m)
@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k) for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k); weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.; weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
pocketfft_plan_r plan = pocketfft_make_plan_r(n); mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
for (int m=0; m<nrings; ++m) for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1]; weight[m]=weight[m+1];

View file

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H #ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H #define SHARP2_INTERNAL_H
#ifdef __cplusplus #include <complex>
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include "libsharp2/sharp.h" #include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct typedef struct
{ {
sharp_jobtype type; sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map; void **map;
void **alm; void **alm;
int s_m, s_th; // strides in m and theta direction int s_m, s_th; // strides in m and theta direction
complex double *phase; complex<double> *phase;
double *norm_l; double *norm_l;
complex double *almtmp; complex<double> *almtmp;
const sharp_geom_info *ginfo; const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo; const sharp_alm_info *ainfo;
double time; double time;

View file

@ -122,10 +122,4 @@ double sharp_wallTime(void);
} }
#endif #endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif #endif

View file

@ -28,193 +28,27 @@
#ifndef SHARP2_VECSUPPORT_H #ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H
#include <math.h> #include <cmath>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;
#ifndef VLEN #include "mr_util/useful_macros.h"
#if (defined(__AVX512F__)) using Tv=native_simd<double>;
#define VLEN 8 using Tm=Tv::mask_type;
#elif (defined (__AVX__)) using Ts=Tv::value_type;
#define VLEN 4 static constexpr size_t VLEN=Tv::size();
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif
#endif
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
typedef int Tm;
#define vload(a) (a) #define vload(a) (a)
#define vzero 0.
#define vone 1.
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vneg(a) (-(a))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) (a)
#define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc) complex<double> * MRUTIL_RESTRICT cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#endif
#if (VLEN==2)
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
typedef __m128d Tm;
#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vload(a) _mm_set1_pd(a)
#define vzero _mm_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm_xor_pd(vload(-0.),a)
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
{ {
union {Tv v; _Complex double c; } u1, u2; cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
#if defined(__SSE3__) cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
#endif
cc[0]+=u1.c; cc[1]+=u2.c;
} }
#endif #endif
#if (VLEN==4)
#include <immintrin.h>
typedef __m256d Tv;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vload(a) _mm256_set1_pd(a)
#define vzero _mm256_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}
#endif
#if (VLEN==8)
#include <immintrin.h>
typedef __m512d Tv;
typedef __mmask8 Tm;
#define vload(a) _mm512_set1_pd(a)
#define vzero _mm512_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
}
#endif
#endif

View file

@ -26,7 +26,8 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <complex.h> #include <complex>
using std::complex;
#ifdef USE_MPI #ifdef USE_MPI
#include "mpi.h" #include "mpi.h"
#include "libsharp2/sharp_mpi.h" #include "libsharp2/sharp_mpi.h"
@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c" #include "libsharp2/sharp_utils.cc"
#include "test/memusage.h" #include "test/memusage.h"
static void OpenMP_status(void) static void OpenMP_status(void)
@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1); exit(1);
} }
typedef complex double dcmplx; typedef complex<double> dcmplx;
int ntasks, mytask; int ntasks, mytask;
@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{ {
double rv = drand(-1,1,&state); double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv; alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
} }
} }
} }
@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0; sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j]) sqsum[i]+=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
alm[i][j]=-alm[i][j]; alm[i][j]=-alm[i][j];
} }
} }
@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot; double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
double sqr=creal(alm[i][j])*creal(alm[i][j]) double sqr=norm(alm[i][j]);
+cimag(alm[i][j])*cimag(alm[i][j]);
sum+=sqr; sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr; if (sqr>maxdiff) maxdiff=sqr;
} }
@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms); ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i) for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j) for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I; alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP, sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL); NULL,NULL);