Compare commits

..

3 commits
cpp ... master

Author SHA1 Message Date
Martin Reinecke
004b3aea9a fix typo 2022-09-29 14:59:32 +02:00
Martin Reinecke
3d376d9e8c update configure.ac 2022-05-22 09:38:27 +02:00
Martin Reinecke
6374a3a1ff Add deprecation notice 2021-05-20 12:38:51 +00:00
20 changed files with 2829 additions and 384 deletions

View file

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line. configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system. (not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not (Usually this is already the default setting when OMP_NUM_THREADS is not
specified.) specified.)

View file

@ -3,14 +3,16 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \ libsharp2_la_SOURCES = \
libsharp2/sharp_utils.cc \ libsharp2/pocketfft.c \
libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \
libsharp2/sharp_utils.h \ libsharp2/sharp_utils.h \
libsharp2/sharp.cc \ libsharp2/sharp.c \
libsharp2/sharp_almhelpers.cc \ libsharp2/sharp_almhelpers.c \
libsharp2/sharp_core.cc \ libsharp2/sharp_core.c \
libsharp2/sharp_geomhelpers.cc \ libsharp2/sharp_geomhelpers.c \
libsharp2/sharp_legendre_roots.cc \ libsharp2/sharp_legendre_roots.c \
libsharp2/sharp_ylmgen_c.cc \ libsharp2/sharp_ylmgen_c.c \
libsharp2/sharp_internal.h \ libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \ libsharp2/sharp_vecsupport.h \
@ -22,25 +24,25 @@ libsharp2_la_SOURCES = \
# any backward-compatible change: increase age # any backward-compatible change: increase age
# any backward-incompatible change: age=0 # any backward-incompatible change: age=0
# ==> age <= current # ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread libsharp2_la_LDFLAGS = -version-info 0:0:0
AM_CXXFLAGS = @AM_CXXFLAGS@ AM_CFLAGS = @AM_CFLAGS@
if HAVE_MULTIARCH if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc libavx_la_SOURCES = libsharp2/sharp_core_inc.c
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc libfma_la_SOURCES = libsharp2/sharp_core_inc.c
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2 libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4 libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -54,10 +56,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h libsharp2/sharp_cxx.h
EXTRA_DIST = \ EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
check_PROGRAMS = sharp2_testsuite check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh TESTS = runtest.sh

View file

@ -1,3 +1,13 @@
NOTICE
======
Active development of this package has stopped. The package will receive bug fixes
if necessary, but otherwise the code has been integrated into the `ducc0`
package (https://gitlab.mpcdf.mpg.de/mtr/ducc), and further development is
taking place there.
Please prefer `ducc0` over `libsharp2` if you are starting a new project!
# Libsharp2 # Libsharp2
Library for efficient spherical harmonic transforms at arbitrary spins, Library for efficient spherical harmonic transforms at arbitrary spins,

View file

@ -1,4 +1,4 @@
AC_INIT([libsharp2], [1.0.0]) AC_INIT([libsharp2],[1.0.0])
AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror]) AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
AM_MAINTAINER_MODE([enable]) AM_MAINTAINER_MODE([enable])
@ -20,21 +20,28 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CXX m4_version_prereq(2.70, [AC_PROG_CC], [AC_PROG_CC_C99])
AX_CXX_COMPILE_STDCXX([11]) AC_OPENMP
AC_PROG_LIBTOOL # add math library
LIBS="-lm"
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'` LT_INIT
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2" PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl dnl
dnl Create pkgconfig .pc file. dnl Create pkgconfig .pc file.
dnl dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[]) AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CXXFLAGS]) AC_SUBST([AM_CFLAGS])
AC_CONFIG_FILES([Makefile]) AC_CONFIG_FILES([Makefile])
AC_OUTPUT AC_OUTPUT

2198
libsharp2/pocketfft.c Normal file

File diff suppressed because it is too large Load diff

50
libsharp2/pocketfft.h Normal file
View file

@ -0,0 +1,50 @@
/*
 * This file is part of pocketfft.
 * Licensed under a 3-clause BSD style license - see LICENSE.md
 */
/*! \file pocketfft.h
 * Public interface of the pocketfft library
 *
 * Copyright (C) 2008-2019 Max-Planck-Society
 * \author Martin Reinecke
 */
#ifndef POCKETFFT_H
#define POCKETFFT_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Opaque plan type for 1-D complex-to-complex transforms.
   The struct is defined only in the implementation file. */
struct pocketfft_plan_c_i;
typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
/*! Allocates and returns a plan for complex transforms of \a length
    elements. The caller owns the plan and must release it with
    pocketfft_delete_plan_c(). */
pocketfft_plan_c pocketfft_make_plan_c (size_t length);
/*! Releases all resources associated with \a plan. */
void pocketfft_delete_plan_c (pocketfft_plan_c plan);
/*! Computes a complex backward FFT in place on \a c, using \a plan.
    \a c holds interleaved real/imaginary pairs; \a fct is a scale
    factor applied to the result (presumably — confirm against the
    implementation). Returns a status code (0 on success, nonzero on
    failure — verify in pocketfft.c). */
int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
/*! Computes a complex forward FFT in place on \a c, using \a plan.
    Same conventions as pocketfft_backward_c(). */
int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
/*! Returns the transform length \a plan was created for. */
size_t pocketfft_length_c(pocketfft_plan_c plan);
/* Opaque plan type for 1-D real-valued transforms. */
struct pocketfft_plan_r_i;
typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
/*! Allocates and returns a plan for real transforms of \a length
    elements; release with pocketfft_delete_plan_r(). */
pocketfft_plan_r pocketfft_make_plan_r (size_t length);
/*! Releases all resources associated with \a plan. */
void pocketfft_delete_plan_r (pocketfft_plan_r plan);
/*! Computes a real backward FFT on \a c, using \a plan
    and assuming the FFTPACK storage scheme:
    - on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
/*! Computes a real forward FFT on \a c, using \a plan
    and assuming the FFTPACK storage scheme:
    - on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
/*! Returns the transform length \a plan was created for. */
size_t pocketfft_length_r(pocketfft_plan_r plan);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -25,20 +25,17 @@
* \author Martin Reinecke \author Dag Sverre Seljebotn * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#include <cmath> #include <math.h>
#include <cstring> #include <string.h>
#include <atomic> #include "libsharp2/pocketfft.h"
#include "mr_util/fft.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
#include "libsharp2/sharp_internal.h" #include "libsharp2/sharp_internal.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "mr_util/threading.h"
#include "mr_util/useful_macros.h"
typedef complex<double> dcmplx; typedef complex double dcmplx;
typedef complex<float> fcmplx; typedef complex float fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739; static const double sqrt_two = 1.414213562373095145474621858739;
@ -60,7 +57,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize); *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
} }
MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{ {
double ofs=lmax*0.01; double ofs=lmax*0.01;
if (ofs<100.) ofs=100.; if (ofs<100.) ofs=100.;
@ -79,7 +76,7 @@ typedef struct
double phi0_; double phi0_;
dcmplx *shiftarr; dcmplx *shiftarr;
int s_shift; int s_shift;
mr::detail_fft::rfftp<double> *plan; pocketfft_plan_r plan;
int length; int length;
int norot; int norot;
} ringhelper; } ringhelper;
@ -92,12 +89,12 @@ static void ringhelper_init (ringhelper *self)
static void ringhelper_destroy (ringhelper *self) static void ringhelper_destroy (ringhelper *self)
{ {
delete self->plan; if (self->plan) pocketfft_delete_plan_r(self->plan);
DEALLOC(self->shiftarr); DEALLOC(self->shiftarr);
ringhelper_init(self); ringhelper_init(self);
} }
MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{ {
self->norot = (fabs(phi0)<1e-14); self->norot = (fabs(phi0)<1e-14);
if (!(self->norot)) if (!(self->norot))
@ -108,27 +105,27 @@ MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mm
self->phi0_ = phi0; self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc. // FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0)); self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
// double *tmp=(double *) self->shiftarr; // double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
} }
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph); // if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
if (nph!=(int)self->length) if (nph!=(int)self->length)
{ {
if (self->plan) delete self->plan; if (self->plan) pocketfft_delete_plan_r(self->plan);
self->plan=new mr::detail_fft::rfftp<double>(nph); self->plan=pocketfft_make_plan_r(nph);
self->length=nph; self->length=nph;
} }
} }
static int ringinfo_compare (const void *xa, const void *xb) static int ringinfo_compare (const void *xa, const void *xb)
{ {
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb; const sharp_ringinfo *a=xa, *b=xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
} }
static int ringpair_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb)
{ {
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb; const sharp_ringpair *a=xa, *b=xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph) if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -278,7 +275,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1; return nm-1;
} }
MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -295,22 +292,22 @@ MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot) if (self->norot)
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
data[2*m]=phase[m*pstride].real()*wgt; data[2*m]=creal(phase[m*pstride])*wgt;
data[2*m+1]=phase[m*pstride].imag()*wgt; data[2*m+1]=cimag(phase[m*pstride])*wgt;
} }
else else
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
{ {
dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=tmp.real()*wgt; data[2*m]=creal(tmp)*wgt;
data[2*m+1]=tmp.imag()*wgt; data[2*m+1]=cimag(tmp)*wgt;
} }
for (int m=2*(mmax+1); m<nph+2; ++m) for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.; data[m]=0.;
} }
else else
{ {
data[0]=phase[0].real()*wgt; data[0]=creal(phase[0])*wgt;
SET_ARRAY(data,1,nph+2,0.); SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1; int idx1=1, idx2=nph-1;
@ -320,23 +317,23 @@ MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m]; if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2) if (idx1<(nph+2)/2)
{ {
data[2*idx1]+=tmp.real(); data[2*idx1]+=creal(tmp);
data[2*idx1+1]+=tmp.imag(); data[2*idx1+1]+=cimag(tmp);
} }
if (idx2<(nph+2)/2) if (idx2<(nph+2)/2)
{ {
data[2*idx2]+=tmp.real(); data[2*idx2]+=creal(tmp);
data[2*idx2+1]-=tmp.imag(); data[2*idx2+1]-=cimag(tmp);
} }
if (++idx1>=nph) idx1=0; if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1; if (--idx2<0) idx2=nph-1;
} }
} }
data[1]=data[0]; data[1]=data[0];
self->plan->exec(&(data[1]), 1., false); pocketfft_backward_r (self->plan, &(data[1]), 1.);
} }
MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -352,7 +349,7 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
if (flags&SHARP_REAL_HARMONICS) if (flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_two; wgt *= sqrt_two;
self->plan->exec (&(data[1]), 1., true); pocketfft_forward_r (self->plan, &(data[1]), 1.);
data[0]=data[1]; data[0]=data[1];
data[1]=data[nph+1]=0.; data[1]=data[nph+1]=0.;
@ -360,11 +357,11 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{ {
if (self->norot) if (self->norot)
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt; phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
else else
for (int m=0; m<=maxidx; ++m) for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = phase[m*pstride] =
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt; (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
} }
else else
{ {
@ -373,9 +370,9 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph; int idx=m%nph;
dcmplx val; dcmplx val;
if (idx<(nph-idx)) if (idx<(nph-idx))
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt; val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
else else
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt; val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
if (!self->norot) if (!self->norot)
val *= self->shiftarr[m]; val *= self->shiftarr[m];
phase[m*pstride]=val; phase[m*pstride]=val;
@ -386,7 +383,7 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.; phase[m*pstride]=0.;
} }
MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags) int flags)
{ {
if (flags & SHARP_NO_FFT) if (flags & SHARP_NO_FFT)
@ -443,7 +440,7 @@ MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
} }
} }
MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags) int flags)
{ {
#define CLEARLOOP(real_t,body) \ #define CLEARLOOP(real_t,body) \
@ -480,7 +477,7 @@ MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
} }
} }
MRUTIL_NOINLINE static void init_output (sharp_job *job) NOINLINE static void init_output (sharp_job *job)
{ {
if (job->flags&SHARP_ADD) return; if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM) if (job->type == SHARP_MAP2ALM)
@ -491,7 +488,7 @@ MRUTIL_NOINLINE static void init_output (sharp_job *job)
clear_map (job->ginfo,job->map[i],job->flags); clear_map (job->ginfo,job->map[i],job->flags);
} }
MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{ {
if (job->type==SHARP_MAP2ALM) if (job->type==SHARP_MAP2ALM)
{ {
@ -517,7 +514,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job) static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); } { DEALLOC(job->almtmp); }
MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, source_t, expr_of_x) \ #define COPY_LOOP(real_t, source_t, expr_of_x) \
@ -580,7 +577,7 @@ MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l]) COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else else
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l])) COPY_LOOP(float, fcmplx, x*job->norm_l[l])
} }
} }
} }
@ -591,7 +588,7 @@ MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP #undef COPY_LOOP
} }
MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, target_t, expr_of_x) \ #define COPY_LOOP(real_t, target_t, expr_of_x) \
@ -620,9 +617,9 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, x.real()*norm_m0) COPY_LOOP(double, double, creal(x)*norm_m0)
else else
COPY_LOOP(float, float, x.real()*norm_m0) COPY_LOOP(float, float, crealf(x)*norm_m0)
} }
else else
{ {
@ -637,9 +634,9 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0) if (m==0)
{ {
if (job->flags&SHARP_DP) if (job->flags&SHARP_DP)
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0) COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
else else
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0)) COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
} }
else else
{ {
@ -653,7 +650,7 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP #undef COPY_LOOP
} }
MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
const double *ringtmp, int rstride) const double *ringtmp, int rstride)
{ {
if (job->flags & SHARP_DP) if (job->flags & SHARP_DP)
@ -661,8 +658,8 @@ MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
double **dmap = (double **)job->map; double **dmap = (double **)job->map;
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
{ {
double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs]; double *restrict p1=&dmap[i][ri->ofs];
const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1]; const double *restrict p2=&ringtmp[i*rstride+1];
if (ri->stride==1) if (ri->stride==1)
{ {
if (job->flags&SHARP_ADD) if (job->flags&SHARP_ADD)
@ -685,14 +682,14 @@ MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
} }
} }
MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride) double *ringtmp, int rstride)
{ {
if (job->flags & SHARP_DP) if (job->flags & SHARP_DP)
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
{ {
double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1], double *restrict p1=&ringtmp[i*rstride+1],
*MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]); *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1) if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double)); memcpy(p1,p2,ri->nph*sizeof(double));
else else
@ -724,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt); ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
} }
} }
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -746,7 +743,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
} }
//FIXME: set phase to zero if not SHARP_MAP2ALM? //FIXME: set phase to zero if not SHARP_MAP2ALM?
MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type != SHARP_MAP2ALM) return; if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -763,35 +760,35 @@ MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int u
} }
else else
{ {
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched) #pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{ {
ringhelper helper; int dim2 = job->s_th*(ith-llim);
ringhelper_init(&helper); ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
int rstride=job->ginfo->nphmax+2; for (int i=0; i<job->nmaps; ++i)
double *ringtmp=RALLOC(double,job->nmaps*rstride); ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith) if (job->ginfo->pair[ith].r2.nph>0)
{ {
int dim2 = job->s_th*(ith-llim); ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
}
} }
DEALLOC(ringtmp); }
ringhelper_destroy(&helper); DEALLOC(ringtmp);
}); /* end of parallel region */ ringhelper_destroy(&helper);
} /* end of parallel region */
} }
} }
MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type == SHARP_MAP2ALM) return; if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -808,35 +805,35 @@ MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int u
} }
else else
{ {
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched) #pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{ {
ringhelper helper; int dim2 = job->s_th*(ith-llim);
ringhelper_init(&helper); for (int i=0; i<job->nmaps; ++i)
int rstride=job->ginfo->nphmax+2; ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
double *ringtmp=RALLOC(double,job->nmaps*rstride); &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith) if (job->ginfo->pair[ith].r2.nph>0)
{ {
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i) for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
{
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
}
} }
DEALLOC(ringtmp); }
ringhelper_destroy(&helper); DEALLOC(ringtmp);
}); /* end of parallel region */ ringhelper_destroy(&helper);
} /* end of parallel region */
} }
} }
MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) NOINLINE static void sharp_execute_job (sharp_job *job)
{ {
double timer=sharp_wallTime(); double timer=sharp_wallTime();
job->opcnt=0; job->opcnt=0;
@ -855,7 +852,6 @@ MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
&nchunks,&chunksize); &nchunks,&chunksize);
//FIXME: needs to be changed to "nm" //FIXME: needs to be changed to "nm"
alloc_phase (job,mmax+1,chunksize); alloc_phase (job,mmax+1,chunksize);
std::atomic<size_t> opcnt = 0;
/* chunk loop */ /* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk) for (int chunk=0; chunk<nchunks; ++chunk)
@ -875,30 +871,32 @@ MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
/* map->phase where necessary */ /* map->phase where necessary */
map2phase (job, mmax, llim, ulim); map2phase (job, mmax, llim, ulim);
mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched) #pragma omp parallel
{
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{ {
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
{
/* alm->alm_tmp where necessary */ /* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi); alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
/* alm_tmp->alm where necessary */ /* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi); almtmp2alm (&ljob, lmax, mi);
} }
sharp_Ylmgen_destroy(&generator); sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob); dealloc_almtmp(&ljob);
opcnt+=ljob.opcnt; #pragma omp critical
}); /* end of parallel region */ job->opcnt+=ljob.opcnt;
} /* end of parallel region */
/* phase->map where necessary */ /* phase->map where necessary */
phase2map (job, mmax, llim, ulim); phase2map (job, mmax, llim, ulim);
@ -911,7 +909,6 @@ MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
DEALLOC(job->norm_l); DEALLOC(job->norm_l);
dealloc_phase (job); dealloc_phase (job);
job->opcnt = opcnt;
job->time=sharp_wallTime()-timer; job->time=sharp_wallTime()-timer;
} }
@ -937,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS; job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.; job->time = 0.;
job->opcnt = 0; job->opcnt = 0;
job->alm=(void **)alm; job->alm=alm;
job->map=(void **)map; job->map=map;
} }
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

View file

@ -27,7 +27,7 @@
#define ARCH default #define ARCH default
#define GENERIC_ARCH #define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.cc" #include "libsharp2/sharp_core_inc.c"
#undef GENERIC_ARCH #undef GENERIC_ARCH
#undef ARCH #undef ARCH

View file

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b) #define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH) #define XARCH(a) XCONCATX2(a,ARCH)
#include <complex> #include <complex.h>
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include "libsharp2/sharp_vecsupport.h" #include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,7 @@
// Unfortunately, most compilers don't act on this pragma yet. // Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON #pragma STDC FP_CONTRACT ON
typedef complex<double> dcmplx; typedef complex double dcmplx;
#define nv0 (128/VLEN) #define nv0 (128/VLEN)
#define nvx (64/VLEN) #define nvx (64/VLEN)
@ -94,35 +94,35 @@ typedef union
sxdata_s s; sxdata_s s;
} sxdata_u; } sxdata_u;
static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale, static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
double maxval) double maxval)
{ {
const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig; const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
auto mask = abs(*val)>vfmax; Tm mask = vgt(vabs(*val),vfmax);
while (any_of(mask)) while (vanyTrue(mask))
{ {
where(mask,*val)*=vfsmall; vmuleq_mask(mask,*val,vfsmall);
where(mask,*scale)+=1; vaddeq_mask(mask,*scale,vone);
mask = abs(*val)>vfmax; mask = vgt(vabs(*val),vfmax);
} }
mask = (abs(*val)<vfmin) & (*val!=0); mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
while (any_of(mask)) while (vanyTrue(mask))
{ {
where(mask,*val)*=vfbig; vmuleq_mask(mask,*val,vfbig);
where(mask,*scale)-=1; vsubeq_mask(mask,*scale,vone);
mask = (abs(*val)<vfmin) & (*val!=0); mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
} }
} }
static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit, static void mypow(Tv val, int npow, const double * restrict powlimit,
Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress) Tv * restrict resd, Tv * restrict ress)
{ {
Tv vminv=powlimit[npow]; Tv vminv=vload(powlimit[npow]);
auto mask = abs(val)<vminv; Tm mask = vlt(vabs(val),vminv);
if (none_of(mask)) // no underflows possible, use quick algoritm if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
{ {
Tv res=1; Tv res=vone;
do do
{ {
if (npow&1) if (npow&1)
@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
} }
while(npow>>=1); while(npow>>=1);
*resd=res; *resd=res;
*ress=0; *ress=vzero;
} }
else else
{ {
Tv scale=0, scaleint=0, res=1; Tv scale=vzero, scaleint=vzero, res=vone;
Tvnormalize(&val,&scaleint,sharp_fbighalf); Tvnormalize(&val,&scaleint,sharp_fbighalf);
do do
{ {
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
} }
} }
static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac, static inline void getCorfac(Tv scale, Tv * restrict corfac,
const double * MRUTIL_RESTRICT cf) const double * restrict cf)
{ {
typedef union typedef union
{ Tv v; double s[VLEN]; } Tvu; { Tv v; double s[VLEN]; } Tvu;
@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
*corfac=corf.v; *corfac=corf.v;
} }
static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps) static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
{ {
auto mask = abs(*v2)>eps; Tm mask = vgt(vabs(*v2),eps);
if (any_of(mask)) if (vanyTrue(mask))
{ {
where(mask,*v1)*=sharp_fsmall; vmuleq_mask(mask,*v1,vload(sharp_fsmall));
where(mask,*v2)*=sharp_fsmall; vmuleq_mask(mask,*v2,vload(sharp_fsmall));
where(mask,*s)+=1; vaddeq_mask(mask,*s,vone);
return true; return 1;
} }
return false; return 0;
} }
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2) s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
{ {
int l=gen->m, il=0; int l=gen->m, il=0;
Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]; Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Tv limscale=sharp_limscale; Tv limscale=vload(sharp_limscale);
int below_limit = 1; int below_limit = 1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->lam1[i]=0; d->lam1[i]=vzero;
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
d->lam2[i] *= mfac; d->lam2[i] *= mfac;
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
below_limit &= all_of(d->scale[i]<limscale); below_limit &= vallTrue(vlt(d->scale[i],limscale));
} }
while (below_limit) while (below_limit)
{ {
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1; below_limit=1;
Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b; Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b);
Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b; Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
below_limit &= all_of(d->scale[i]<sharp_limscale); below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
} }
l+=4; il+=2; l+=4; il+=2;
} }
*l_=l; *il_=il; *l_=l; *il_=il;
} }
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, NOINLINE static void alm2map_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm, const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
int l, int il, int lmax, int nv2) int l, int il, int lmax, int nv2)
{ {
if (nv2==nv0) if (nv2==nv0)
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=coef[il ].a, b1=coef[il ].b; Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=coef[il+1].a, b2=coef[il+1].b; Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
for (int i=0; i<nv0; ++i) for (int i=0; i<nv0; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -249,12 +249,12 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=coef[il ].a, b1=coef[il ].b; Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=coef[il+1].a, b2=coef[il+1].b; Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -272,9 +272,9 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
} }
for (; l<=lmax; ++il, l+=2) for (; l<=lmax; ++il, l+=2)
{ {
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=coef[il].a, b=coef[il].b; Tv a=vload(coef[il].a), b=vload(coef[il].b);
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->p1r[i] += d->lam2[i]*ar1; d->p1r[i] += d->lam2[i]*ar1;
@ -288,8 +288,8 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
} }
} }
MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, NOINLINE static void calc_alm2map (sharp_job * restrict job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
{ {
int l,il,lmax=gen->lmax; int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -298,20 +298,20 @@ MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth; job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= all_of(d->scale[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=coef[il].a, b=coef[il].b; Tv a=vload(coef[il].a), b=vload(coef[il].b);
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -322,9 +322,9 @@ MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i]; d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp; d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= all_of(d->scale[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
} }
l+=2; ++il; l+=2; ++il;
} }
@ -338,16 +338,16 @@ MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
alm2map_kernel(d, coef, alm, l, il, lmax, nv2); alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
} }
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d, NOINLINE static void map2alm_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l, const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
int il, int lmax, int nv2) int il, int lmax, int nv2)
{ {
for (; l<=lmax-2; il+=2, l+=4) for (; l<=lmax-2; il+=2, l+=4)
{ {
Tv a1=coef[il ].a, b1=coef[il ].b; Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=coef[il+1].a, b2=coef[il+1].b; Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv atmp1[4] = {0,0,0,0}; Tv atmp1[4] = {vzero, vzero, vzero, vzero};
Tv atmp2[4] = {0,0,0,0}; Tv atmp2[4] = {vzero, vzero, vzero, vzero};
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
atmp1[0] += d->lam2[i]*d->p1r[i]; atmp1[0] += d->lam2[i]*d->p1r[i];
@ -366,8 +366,8 @@ MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
} }
for (; l<=lmax; ++il, l+=2) for (; l<=lmax; ++il, l+=2)
{ {
Tv a=coef[il].a, b=coef[il].b; Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {0,0,0,0}; Tv atmp[4] = {vzero, vzero, vzero, vzero};
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
atmp[0] += d->lam2[i]*d->p1r[i]; atmp[0] += d->lam2[i]*d->p1r[i];
@ -382,8 +382,8 @@ MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
} }
} }
MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job, NOINLINE static void calc_map2alm (sharp_job * restrict job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
{ {
int l,il,lmax=gen->lmax; int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -392,19 +392,19 @@ MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth; job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= all_of(d->scale[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv a=coef[il].a, b=coef[il].b; Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {0,0,0,0}; Tv atmp[4] = {vzero, vzero, vzero, vzero};
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -415,9 +415,9 @@ MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i]; Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i]; d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp; d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
getCorfac(d->scale[i], &d->corfac[i], gen->cf); getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= all_of(d->scale[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
} }
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
l+=2; ++il; l+=2; ++il;
@ -432,21 +432,21 @@ MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
map2alm_kernel(d, coef, alm, l, il, lmax, nv2); map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
} }
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2) sxdata_v * restrict d, int * restrict l_, int nv2)
{ {
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
Tv prefac=gen->prefac[gen->m], Tv prefac=vload(gen->prefac[gen->m]),
prescale=gen->fscale[gen->m]; prescale=vload(gen->fscale[gen->m]);
Tv limscale=sharp_limscale; Tv limscale=vload(sharp_limscale);
int below_limit=1; int below_limit=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5)); Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5)); Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
auto mask=d->sth[i]<0; Tm mask=vlt(d->sth[i],vzero);
where(mask&(d->cth[i]<0),cth2)*=-1.; vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
where(mask&(d->cth[i]<0),sth2)*=-1.; vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
@ -454,8 +454,8 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
d->l1p[i] = 0; d->l1p[i] = vzero;
d->l1m[i] = 0; d->l1m[i] = vzero;
d->l2p[i] = prefac*ccp; d->l2p[i] = prefac*ccp;
d->scp[i] = prescale+ccps; d->scp[i] = prescale+ccps;
d->l2m[i] = prefac*csp; d->l2m[i] = prefac*csp;
@ -467,17 +467,17 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES
d->l2m[i] *= scp; d->l2m[i] *= scp;
d->scm[i] += scps; d->scm[i] += scps;
if (gen->preMinus_p) if (gen->preMinus_p)
d->l2p[i] = -d->l2p[i]; d->l2p[i] = vneg(d->l2p[i]);
if (gen->preMinus_m) if (gen->preMinus_m)
d->l2m[i] = -d->l2m[i]; d->l2m[i] = vneg(d->l2m[i]);
if (gen->s&1) if (gen->s&1)
d->l2p[i] = -d->l2p[i]; d->l2p[i] = vneg(d->l2p[i]);
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
below_limit &= all_of(d->scm[i]<limscale) && below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
all_of(d->scp[i]<limscale); vallTrue(vlt(d->scp[i],limscale));
} }
int l=gen->mhi; int l=gen->mhi;
@ -486,18 +486,18 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES
{ {
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1; below_limit=1;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) || if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol)) rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
below_limit &= all_of(d->scp[i]<limscale) && below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
all_of(d->scm[i]<limscale); vallTrue(vlt(d->scm[i],limscale));
} }
l+=2; l+=2;
} }
@ -505,19 +505,19 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES
*l_=l; *l_=l;
} }
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave = l; int lsave = l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -537,12 +537,12 @@ MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -561,8 +561,8 @@ MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
} }
} }
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -571,25 +571,25 @@ MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth; job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale) && full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
all_of(d->scm[i]>=sharp_minscale); vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -611,12 +611,12 @@ MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scm[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
l+=2; l+=2;
} }
@ -641,17 +641,17 @@ MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
} }
} }
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm, const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave=l; int lsave=l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=0, agi1=0, acr1=0, aci1=0; Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=0, agi2=0, acr2=0, aci2=0; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -672,10 +672,10 @@ MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=0, agi1=0, acr1=0, aci1=0; Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=0, agi2=0, acr2=0, aci2=0; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -695,8 +695,8 @@ MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
} }
} }
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job, NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -705,15 +705,15 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth; job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale) && full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
all_of(d->scm[i]>=sharp_minscale); vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -726,10 +726,10 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=0, agi1=0, acr1=0, aci1=0; Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=0, agi2=0, acr2=0, aci2=0; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -748,12 +748,12 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scm[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@ -772,17 +772,17 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
} }
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d, NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
int l, int lmax, int nv2) int l, int lmax, int nv2)
{ {
int lsave=l; int lsave=l;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=alm[l+1].real(), ai2=alm[l+1].imag(); ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -798,10 +798,10 @@ MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
l=lsave; l=lsave;
while (l<=lmax) while (l<=lmax)
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=alm[l+1].real(), ai2=alm[l+1].imag(); ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -816,8 +816,8 @@ MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
} }
} }
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job, NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN; int nv2 = (nth+VLEN-1)/VLEN;
@ -826,23 +826,23 @@ MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
if (l>lmax) return; if (l>lmax) return;
job->opcnt += (lmax+1-l) * 15*nth; job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; const dcmplx * restrict alm=job->almtmp;
int full_ieee=1; int full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale) && full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
all_of(d->scm[i]>=sharp_minscale); vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
while((!full_ieee) && (l<=lmax)) while((!full_ieee) && (l<=lmax))
{ {
Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=alm[l+1].real(), ai2=alm[l+1].imag(); ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
full_ieee=1; full_ieee=1;
for (int i=0; i<nv2; ++i) for (int i=0; i<nv2; ++i)
{ {
@ -864,12 +864,12 @@ MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= all_of(d->scp[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
getCorfac(d->scm[i], &d->cfm[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= all_of(d->scm[i]>=sharp_minscale); full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
} }
l+=2; l+=2;
} }
@ -897,7 +897,7 @@ MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim) sharp_Ylmgen_C *gen, int mi, const int *mlim)
{ {
@ -912,7 +912,7 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
if (job->spin==0) if (job->spin==0)
{ {
//adjust the a_lm for the new algorithm //adjust the a_lm for the new algorithm
dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
{ {
dcmplx al = alm[l]; dcmplx al = alm[l];
@ -963,8 +963,8 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt]; d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt]; d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex<double> r1(d.s.p1r[i], d.s.p1i[i]), complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
r2(d.s.p2r[i], d.s.p2i[i]); r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
job->phase[phas_idx] = r1+r2; job->phase[phas_idx] = r1+r2;
if (ispair[tgt]) if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2; job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1027,10 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{ {
int tgt=itgt[i]; int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m; int phas_idx = tgt*job->s_th + mi*job->s_m;
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]), complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
q2(d.s.p2pr[i], d.s.p2pi[i]), q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
u1(d.s.p1mr[i], d.s.p1mi[i]), u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
u2(d.s.p2mr[i], d.s.p2mi[i]); u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
job->phase[phas_idx ] = q1+q2; job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2; job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt]) if (ispair[tgt])
@ -1056,7 +1056,7 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
} }
} }
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim) sharp_Ylmgen_C *gen, int mi, const int *mlim)
{ {
@ -1083,8 +1083,8 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
int phas_idx = ith*job->s_th + mi*job->s_m; int phas_idx = ith*job->s_th + mi*job->s_m;
dcmplx ph1=job->phase[phas_idx]; dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag(); d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag(); d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
//adjust for new algorithm //adjust for new algorithm
d.s.p2r[nth]*=cth_[ith]; d.s.p2r[nth]*=cth_[ith];
d.s.p2i[nth]*=cth_[ith]; d.s.p2i[nth]*=cth_[ith];
@ -1105,7 +1105,7 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
} }
} }
//adjust the a_lm for the new algorithm //adjust the a_lm for the new algorithm
dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx * restrict alm=job->almtmp;
dcmplx alm2 = 0.; dcmplx alm2 = 0.;
double alold=0; double alold=0;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
@ -1138,10 +1138,10 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1) if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; } { p2Q=-p2Q; p2U=-p2U; }
d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag(); d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag(); d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag(); d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag(); d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
++nth; ++nth;
} }
++ith; ++ith;

View file

@ -29,7 +29,7 @@
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_legendre_roots.h" #include "libsharp2/sharp_legendre_roots.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "mr_util/fft.h" #include "libsharp2/pocketfft.h"
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info) const int *rings, const double *weight, sharp_geom_info **geom_info)
@ -155,7 +155,9 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings); weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
} }
if ((nrings&1)==0) weight[nrings-1]=0.; if ((nrings&1)==0) weight[nrings-1]=0.;
mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
for (int m=0; m<(nrings+1)/2; ++m) for (int m=0; m<(nrings+1)/2; ++m)
{ {
@ -200,7 +202,9 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k) for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw; weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1); weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
weight[n]=weight[0]; weight[n]=weight[0];
for (int m=0; m<(nrings+1)/2; ++m) for (int m=0; m<(nrings+1)/2; ++m)
@ -246,7 +250,9 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k) for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k); weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.; weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
for (int m=0; m<nrings; ++m) for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1]; weight[m]=weight[m+1];

View file

@ -28,12 +28,14 @@
#ifndef SHARP2_INTERNAL_H #ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H #define SHARP2_INTERNAL_H
#include <complex> #ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include "libsharp2/sharp.h" #include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct typedef struct
{ {
sharp_jobtype type; sharp_jobtype type;
@ -43,9 +45,9 @@ typedef struct
void **map; void **map;
void **alm; void **alm;
int s_m, s_th; // strides in m and theta direction int s_m, s_th; // strides in m and theta direction
complex<double> *phase; complex double *phase;
double *norm_l; double *norm_l;
complex<double> *almtmp; complex double *almtmp;
const sharp_geom_info *ginfo; const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo; const sharp_alm_info *ainfo;
double time; double time;

View file

@ -122,4 +122,10 @@ double sharp_wallTime(void);
} }
#endif #endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif #endif

View file

@ -28,27 +28,193 @@
#ifndef SHARP2_VECSUPPORT_H #ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H
#include <cmath> #include <math.h>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;
#include "mr_util/useful_macros.h" #ifndef VLEN
using Tv=native_simd<double>; #if (defined(__AVX512F__))
using Tm=Tv::mask_type; #define VLEN 8
using Ts=Tv::value_type; #elif (defined (__AVX__))
static constexpr size_t VLEN=Tv::size(); #define VLEN 4
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif
#endif
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
typedef int Tm;
#define vload(a) (a) #define vload(a) (a)
#define vzero 0.
#define vone 1.
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vneg(a) (-(a))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) (a)
#define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex<double> * MRUTIL_RESTRICT cc) _Complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#endif
#if (VLEN==2)
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
typedef __m128d Tm;
#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vload(a) _mm_set1_pd(a)
#define vzero _mm_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm_xor_pd(vload(-0.),a)
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
{ {
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>())); union {Tv v; _Complex double c; } u1, u2;
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>())); #if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
#endif
cc[0]+=u1.c; cc[1]+=u2.c;
} }
#endif #endif
#if (VLEN==4)
#include <immintrin.h>
typedef __m256d Tv;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vload(a) _mm256_set1_pd(a)
#define vzero _mm256_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}
#endif
#if (VLEN==8)
#include <immintrin.h>
typedef __m512d Tv;
typedef __mmask8 Tm;
#define vload(a) _mm512_set1_pd(a)
#define vzero _mm512_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
}
#endif
#endif

View file

@ -26,8 +26,7 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <complex> #include <complex.h>
using std::complex;
#ifdef USE_MPI #ifdef USE_MPI
#include "mpi.h" #include "mpi.h"
#include "libsharp2/sharp_mpi.h" #include "libsharp2/sharp_mpi.h"
@ -39,7 +38,7 @@ using std::complex;
#include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.cc" #include "libsharp2/sharp_utils.c"
#include "test/memusage.h" #include "test/memusage.h"
static void OpenMP_status(void) static void OpenMP_status(void)
@ -95,7 +94,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1); exit(1);
} }
typedef complex<double> dcmplx; typedef complex double dcmplx;
int ntasks, mytask; int ntasks, mytask;
@ -123,7 +122,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{ {
double rv = drand(-1,1,&state); double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv); alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
} }
} }
} }
@ -231,7 +230,8 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0; sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
sqsum[i]+=norm(alm[i][j]); sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
alm[i][j]=-alm[i][j]; alm[i][j]=-alm[i][j];
} }
} }
@ -253,7 +253,8 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot; double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j) for (ptrdiff_t j=0; j<nalms; ++j)
{ {
double sqr=norm(alm[i][j]); double sqr=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
sum+=sqr; sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr; if (sqr>maxdiff) maxdiff=sqr;
} }
@ -413,7 +414,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms); ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i) for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j) for (int j=0; j<nalms; ++j)
alm[i][j]=dcmplx(1.,1.); alm[i][j]=1.+_Complex_I;
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP, sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL); NULL,NULL);
@ -519,7 +520,7 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
{ {
#ifdef USE_MPI #ifdef USE_MPI
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[itrans*ncomp],&map[itrans*ncomp],ginfo, sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[itrans*ncomp],&map[itrans*ncomp],ginfo,
ainfo,SHARP_DP|SHARP_ADD,&ttm2a,op_&tom2a); ainfo,SHARP_DP|SHARP_ADD,&ttm2a,&tom2a);
#else #else
sharp_execute(SHARP_MAP2ALM,spin,&alm[itrans*ncomp],&map[itrans*ncomp],ginfo,ainfo, sharp_execute(SHARP_MAP2ALM,spin,&alm[itrans*ncomp],&map[itrans*ncomp],ginfo,ainfo,
SHARP_DP|SHARP_ADD,&ttm2a,&tom2a); SHARP_DP|SHARP_ADD,&ttm2a,&tom2a);