Compare commits


6 commits
master...cpp

Author           SHA1        Message                                                 Date
Martin Reinecke  b27ce30cde  switch to new FFT                                       2020-01-04 12:59:59 +01:00
Martin Reinecke  f8a9c96acf  use MRUTIL macros                                       2020-01-03 22:55:52 +01:00
Martin Reinecke  75559e6894  vectorization cleanup                                   2020-01-03 22:47:49 +01:00
Martin Reinecke  54cbae0534  multithreading demonstrated to work, but not safe yet   2020-01-03 18:26:53 +01:00
Martin Reinecke  0f8051fc28  vectors work with experimental::simd                    2020-01-03 18:01:34 +01:00
Martin Reinecke  0378ce155a  runs as C++, no vector support yet                      2020-01-03 17:22:31 +01:00
19 changed files with 382 additions and 2817 deletions

README

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not
specified.)

Makefile.am

@ -3,16 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \
libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \
libsharp2/sharp_utils.cc \
libsharp2/sharp_utils.h \
libsharp2/sharp.c \
libsharp2/sharp_almhelpers.c \
libsharp2/sharp_core.c \
libsharp2/sharp_geomhelpers.c \
libsharp2/sharp_legendre_roots.c \
libsharp2/sharp_ylmgen_c.c \
libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \
@ -24,25 +22,25 @@ libsharp2_la_SOURCES = \
# any backward-compatible change: increase age
# any backward-incompatible change: age=0
# ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0
libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread
AM_CFLAGS = @AM_CFLAGS@
AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
libfma_la_SOURCES = libsharp2/sharp_core_inc.c
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +54,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h
EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh

configure.ac

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99
AC_OPENMP
# add math library
LIBS="-lm"
AC_PROG_CXX
AX_CXX_COMPILE_STDCXX([11])
AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl
dnl Create pkgconfig .pc file.
dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile])
AC_OUTPUT

File diff suppressed because it is too large.

libsharp2/pocketfft.h (deleted)

@ -1,50 +0,0 @@
/*
* This file is part of pocketfft.
* Licensed under a 3-clause BSD style license - see LICENSE.md
*/
/*! \file pocketfft.h
* Public interface of the pocketfft library
*
* Copyright (C) 2008-2019 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef POCKETFFT_H
#define POCKETFFT_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
struct pocketfft_plan_c_i;
typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
pocketfft_plan_c pocketfft_make_plan_c (size_t length);
void pocketfft_delete_plan_c (pocketfft_plan_c plan);
int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
size_t pocketfft_length_c(pocketfft_plan_c plan);
struct pocketfft_plan_r_i;
typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
pocketfft_plan_r pocketfft_make_plan_r (size_t length);
void pocketfft_delete_plan_r (pocketfft_plan_r plan);
/*! Computes a real backward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
- on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
/*! Computes a real forward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
- on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
size_t pocketfft_length_r(pocketfft_plan_r plan);
#ifdef __cplusplus
}
#endif
#endif
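
Note: this commit deletes the plan-based C pocketfft interface; its call sites in sharp.cc (below) move to mr_util's C++ FFT classes. A minimal sketch of the mapping, using only calls that appear in this diff; the example() wrapper is illustrative, and the raw new/delete mirrors the ringhelper code below (a std::unique_ptr would be more idiomatic):

#include <cstddef>
#include "mr_util/fft.h"

// old C API (removed here):
//   pocketfft_plan_r plan = pocketfft_make_plan_r(nph);
//   pocketfft_backward_r(plan, &data[1], 1.);
//   pocketfft_delete_plan_r(plan);
void example(double *data, std::size_t nph)
  {
  auto plan = new mr::detail_fft::rfftp<double>(nph); // replaces pocketfft_make_plan_r
  plan->exec(&data[1], 1., false); // last argument: true = forward, false = backward
  delete plan;                     // replaces pocketfft_delete_plan_r
  }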

libsharp2/sharp.cc

@ -25,17 +25,20 @@
* \author Martin Reinecke \author Dag Sverre Seljebotn
*/
#include <math.h>
#include <string.h>
#include "libsharp2/pocketfft.h"
#include <cmath>
#include <cstring>
#include <atomic>
#include "mr_util/fft.h"
#include "libsharp2/sharp_ylmgen_c.h"
#include "libsharp2/sharp_internal.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h"
#include "mr_util/threading.h"
#include "mr_util/useful_macros.h"
typedef complex double dcmplx;
typedef complex float fcmplx;
typedef complex<double> dcmplx;
typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739;
@ -57,7 +60,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize);
}
NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{
double ofs=lmax*0.01;
if (ofs<100.) ofs=100.;
@ -76,7 +79,7 @@ typedef struct
double phi0_;
dcmplx *shiftarr;
int s_shift;
pocketfft_plan_r plan;
mr::detail_fft::rfftp<double> *plan;
int length;
int norot;
} ringhelper;
@ -89,12 +92,12 @@ static void ringhelper_init (ringhelper *self)
static void ringhelper_destroy (ringhelper *self)
{
if (self->plan) pocketfft_delete_plan_r(self->plan);
delete self->plan;
DEALLOC(self->shiftarr);
ringhelper_init(self);
}
NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{
self->norot = (fabs(phi0)<1e-14);
if (!(self->norot))
@ -105,27 +108,27 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
}
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
if (nph!=(int)self->length)
{
if (self->plan) pocketfft_delete_plan_r(self->plan);
self->plan=pocketfft_make_plan_r(nph);
if (self->plan) delete self->plan;
self->plan=new mr::detail_fft::rfftp<double>(nph);
self->length=nph;
}
}
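
Note: with C's _Complex_I gone, the phase factor exp(i*m*phi0) is built from an explicit real/imaginary pair. A standalone restatement (phase_shift is a hypothetical helper; std::polar is an equivalent the commit itself does not use):

#include <cmath>
#include <complex>
using dcmplx = std::complex<double>;

inline dcmplx phase_shift(int m, double phi0)
  { return dcmplx(std::cos(m*phi0), std::sin(m*phi0)); } // == std::polar(1.0, m*phi0)
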
static int ringinfo_compare (const void *xa, const void *xb)
{
const sharp_ringinfo *a=xa, *b=xb;
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
}
static int ringpair_compare (const void *xa, const void *xb)
{
const sharp_ringpair *a=xa, *b=xb;
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -275,7 +278,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1;
}
NOINLINE static void ringhelper_phase2ring (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags)
{
@ -292,22 +295,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot)
for (int m=0; m<=mmax; ++m)
{
data[2*m]=creal(phase[m*pstride])*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt;
data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=phase[m*pstride].imag()*wgt;
}
else
for (int m=0; m<=mmax; ++m)
{
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt;
data[2*m+1]=cimag(tmp)*wgt;
data[2*m]=tmp.real()*wgt;
data[2*m+1]=tmp.imag()*wgt;
}
for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.;
}
else
{
data[0]=creal(phase[0])*wgt;
data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1;
@ -317,23 +320,23 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2)
{
data[2*idx1]+=creal(tmp);
data[2*idx1+1]+=cimag(tmp);
data[2*idx1]+=tmp.real();
data[2*idx1+1]+=tmp.imag();
}
if (idx2<(nph+2)/2)
{
data[2*idx2]+=creal(tmp);
data[2*idx2+1]-=cimag(tmp);
data[2*idx2]+=tmp.real();
data[2*idx2+1]-=tmp.imag();
}
if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1;
}
}
data[1]=data[0];
pocketfft_backward_r (self->plan, &(data[1]), 1.);
self->plan->exec(&(data[1]), 1., false);
}
NOINLINE static void ringhelper_ring2phase (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags)
{
@ -349,7 +352,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
if (flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_two;
pocketfft_forward_r (self->plan, &(data[1]), 1.);
self->plan->exec (&(data[1]), 1., true);
data[0]=data[1];
data[1]=data[nph+1]=0.;
@ -357,11 +360,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{
if (self->norot)
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
}
else
{
@ -370,9 +373,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph;
dcmplx val;
if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot)
val *= self->shiftarr[m];
phase[m*pstride]=val;
@ -383,7 +386,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.;
}
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags)
{
if (flags & SHARP_NO_FFT)
@ -440,7 +443,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
}
}
NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags)
{
#define CLEARLOOP(real_t,body) \
@ -477,7 +480,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
}
}
NOINLINE static void init_output (sharp_job *job)
MRUTIL_NOINLINE static void init_output (sharp_job *job)
{
if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM)
@ -488,7 +491,7 @@ NOINLINE static void init_output (sharp_job *job)
clear_map (job->ginfo,job->map[i],job->flags);
}
NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{
if (job->type==SHARP_MAP2ALM)
{
@ -514,7 +517,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); }
NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, source_t, expr_of_x) \
@ -577,7 +580,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else
COPY_LOOP(float, fcmplx, x*job->norm_l[l])
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
}
}
}
@ -588,7 +591,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, target_t, expr_of_x) \
@ -617,9 +620,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0)
COPY_LOOP(double, double, x.real()*norm_m0)
else
COPY_LOOP(float, float, crealf(x)*norm_m0)
COPY_LOOP(float, float, x.real()*norm_m0)
}
else
{
@ -634,9 +637,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
}
else
{
@ -650,7 +653,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
const double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
@ -658,8 +661,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
double **dmap = (double **)job->map;
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1];
double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
if (ri->stride==1)
{
if (job->flags&SHARP_ADD)
@ -682,14 +685,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
}
}
NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&ringtmp[i*rstride+1],
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
*MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double));
else
@ -721,7 +724,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
}
}
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -743,7 +746,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
}
//FIXME: set phase to zero if not SHARP_MAP2ALM?
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -760,35 +763,35 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
}
else
{
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
}
}
}
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
} /* end of parallel region */
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
}); /* end of parallel region */
}
}
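
Note: this hunk shows the commit's general OpenMP-to-mr_util translation (commit 54cbae0534, "multithreading demonstrated to work, but not safe yet"). A self-contained skeleton of the pattern; the 0 and 1 arguments are carried over from the calls above and are presumably a thread limit and a chunk size (not documented in this excerpt):

#include <cstdio>
#include "mr_util/threading.h"

void run_all(int nwork)
  {
  // replaces "#pragma omp parallel"
  mr::execDynamic(nwork, 0, 1, [&](mr::Scheduler &sched)
    {
    // per-thread setup (ringhelper, scratch buffers) went at the top of the
    // old parallel region and now goes at the top of the lambda
    while (auto rng = sched.getNext()) // replaces "#pragma omp for schedule(dynamic,1)"
      for (auto i = rng.lo; i < rng.hi; ++i)
        std::printf("item %d handled\n", int(i)); // placeholder work item
    // per-thread teardown runs before the lambda returns
    });
  }
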
NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -805,35 +808,35 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
}
else
{
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
{
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
}
}
}
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
} /* end of parallel region */
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
}); /* end of parallel region */
}
}
NOINLINE static void sharp_execute_job (sharp_job *job)
MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
{
double timer=sharp_wallTime();
job->opcnt=0;
@ -852,6 +855,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
&nchunks,&chunksize);
//FIXME: needs to be changed to "nm"
alloc_phase (job,mmax+1,chunksize);
std::atomic<size_t> opcnt{0};
/* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk)
@ -871,32 +875,30 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
/* map->phase where necessary */
map2phase (job, mmax, llim, ulim);
#pragma omp parallel
{
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
almtmp2alm (&ljob, lmax, mi);
}
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
opcnt+=ljob.opcnt;
}); /* end of parallel region */
/* phase->map where necessary */
phase2map (job, mmax, llim, ulim);
@ -909,6 +911,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
DEALLOC(job->norm_l);
dealloc_phase (job);
job->opcnt = opcnt;
job->time=sharp_wallTime()-timer;
}
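
Note: the per-thread operation counter now accumulates into a std::atomic instead of an OpenMP critical section. A self-contained sketch of the pattern, with std::thread standing in for mr_util's thread pool:

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main()
  {
  std::atomic<std::size_t> opcnt{0}; // shared counter, no critical section needed
  std::vector<std::thread> pool;
  for (int t=0; t<4; ++t)
    pool.emplace_back([&]{
      std::size_t local=0;           // accumulate thread-privately ...
      for (int i=0; i<1000; ++i) local += std::size_t(i);
      opcnt += local;                // ... then do one atomic add per thread
      });
  for (auto &th : pool) th.join();
  std::printf("%zu\n", opcnt.load());
  }
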
@ -934,8 +937,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.;
job->opcnt = 0;
job->alm=alm;
job->map=map;
job->alm=(void **)alm;
job->map=(void **)map;
}
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

libsharp2/sharp_core.cc

@ -27,7 +27,7 @@
#define ARCH default
#define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c"
#include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH
#undef ARCH

libsharp2/sharp_core_inc.cc

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h>
#include <complex>
#include <math.h>
#include <string.h>
#include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,7 @@
// Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON
typedef complex double dcmplx;
typedef complex<double> dcmplx;
#define nv0 (128/VLEN)
#define nvx (64/VLEN)
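
Note: this file is compiled once per instruction set (see the per-ARCH CXXFLAGS in Makefile.am above); XARCH pastes the ARCH token onto each entry point so the objects export distinct symbols. XCONCATX itself lies just outside this hunk, so its usual two-step-paste definition is assumed here:

#define XCONCATX(a,b) a##_##b        // assumed definition, not shown in the excerpt
#define XCONCATX2(a,b) XCONCATX(a,b) // extra level so ARCH is macro-expanded before pasting
#define XARCH(a) XCONCATX2(a,ARCH)
// with -DARCH=avx2:  XARCH(inner_loop) -> inner_loop_avx2   (function name illustrative)
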
@ -94,35 +94,35 @@ typedef union
sxdata_s s;
} sxdata_u;
static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
double maxval)
{
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
Tm mask = vgt(vabs(*val),vfmax);
while (vanyTrue(mask))
const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
auto mask = abs(*val)>vfmax;
while (any_of(mask))
{
vmuleq_mask(mask,*val,vfsmall);
vaddeq_mask(mask,*scale,vone);
mask = vgt(vabs(*val),vfmax);
where(mask,*val)*=vfsmall;
where(mask,*scale)+=1;
mask = abs(*val)>vfmax;
}
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
while (vanyTrue(mask))
mask = (abs(*val)<vfmin) & (*val!=0);
while (any_of(mask))
{
vmuleq_mask(mask,*val,vfbig);
vsubeq_mask(mask,*scale,vone);
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
where(mask,*val)*=vfbig;
where(mask,*scale)-=1;
mask = (abs(*val)<vfmin) & (*val!=0);
}
}
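
Note: the hand-rolled mask macros (vgt, vmuleq_mask, vanyTrue, ...) become std::experimental::simd where-expressions, as above. A minimal standalone sketch of the idiom (flush_tiny_to_zero and the 1e-10 threshold are illustrative):

#include <experimental/simd>
namespace stdx = std::experimental;
using Tv = stdx::native_simd<double>;

void flush_tiny_to_zero(Tv &val)
  {
  auto mask = abs(val) < 1e-10; // element-wise compare yields a simd_mask
  if (any_of(mask))             // found via ADL, as in the code above
    where(mask, val) = 0.;      // masked assignment replaces the vmuleq_mask-style macros
  }
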
static void mypow(Tv val, int npow, const double * restrict powlimit,
Tv * restrict resd, Tv * restrict ress)
static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
{
Tv vminv=vload(powlimit[npow]);
Tm mask = vlt(vabs(val),vminv);
if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
Tv vminv=powlimit[npow];
auto mask = abs(val)<vminv;
if (none_of(mask)) // no underflows possible, use quick algorithm
{
Tv res=vone;
Tv res=1;
do
{
if (npow&1)
@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
}
while(npow>>=1);
*resd=res;
*ress=vzero;
*ress=0;
}
else
{
Tv scale=vzero, scaleint=vzero, res=vone;
Tv scale=0, scaleint=0, res=1;
Tvnormalize(&val,&scaleint,sharp_fbighalf);
do
{
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
}
}
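
Note: mypow is binary exponentiation plus a dynamic-rescaling branch for values that would underflow. Its fast path, restated as scalar code for reference (the rescaling branch is omitted; pow_by_squaring is a hypothetical name):

double pow_by_squaring(double val, int npow) // npow >= 0
  {
  double res = 1.;
  do
    {
    if (npow & 1) res *= val;
    val *= val;
    } while (npow >>= 1);
  return res;
  }
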
static inline void getCorfac(Tv scale, Tv * restrict corfac,
const double * restrict cf)
static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
const double * MRUTIL_RESTRICT cf)
{
typedef union
{ Tv v; double s[VLEN]; } Tvu;
@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
*corfac=corf.v;
}
static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
{
Tm mask = vgt(vabs(*v2),eps);
if (vanyTrue(mask))
auto mask = abs(*v2)>eps;
if (any_of(mask))
{
vmuleq_mask(mask,*v1,vload(sharp_fsmall));
vmuleq_mask(mask,*v2,vload(sharp_fsmall));
vaddeq_mask(mask,*s,vone);
return 1;
where(mask,*v1)*=sharp_fsmall;
where(mask,*v2)*=sharp_fsmall;
where(mask,*s)+=1;
return true;
}
return 0;
return false;
}
NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
{
int l=gen->m, il=0;
Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Tv limscale=vload(sharp_limscale);
Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
Tv limscale=sharp_limscale;
int below_limit = 1;
for (int i=0; i<nv2; ++i)
{
d->lam1[i]=vzero;
d->lam1[i]=0;
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
d->lam2[i] *= mfac;
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scale[i],limscale));
below_limit &= all_of(d->scale[i]<limscale);
}
while (below_limit)
{
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1;
Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b);
Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b;
Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b;
for (int i=0; i<nv2; ++i)
{
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
below_limit &= all_of(d->scale[i]<sharp_limscale);
}
l+=4; il+=2;
}
*l_=l; *il_=il;
}
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
int l, int il, int lmax, int nv2)
{
if (nv2==nv0)
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv0; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -249,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv2; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -272,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
}
for (; l<=lmax; ++il, l+=2)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=coef[il].a, b=coef[il].b;
for (int i=0; i<nv2; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_alm2map (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -298,20 +298,20 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=coef[il].a, b=coef[il].b;
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -322,9 +322,9 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
l+=2; ++il;
}
@ -338,16 +338,16 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void map2alm_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
int il, int lmax, int nv2)
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv atmp1[4] = {vzero, vzero, vzero, vzero};
Tv atmp2[4] = {vzero, vzero, vzero, vzero};
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
Tv atmp1[4] = {0,0,0,0};
Tv atmp2[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i)
{
atmp1[0] += d->lam2[i]*d->p1r[i];
@ -366,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
}
for (; l<=lmax; ++il, l+=2)
{
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {vzero, vzero, vzero, vzero};
Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i)
{
atmp[0] += d->lam2[i]*d->p1r[i];
@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_map2alm (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -392,19 +392,19 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {vzero, vzero, vzero, vzero};
Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {0,0,0,0};
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -415,9 +415,9 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
l+=2; ++il;
@ -432,21 +432,21 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
sxdata_v * restrict d, int * restrict l_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
{
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
Tv prefac=vload(gen->prefac[gen->m]),
prescale=vload(gen->fscale[gen->m]);
Tv limscale=vload(sharp_limscale);
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
Tv prefac=gen->prefac[gen->m],
prescale=gen->fscale[gen->m];
Tv limscale=sharp_limscale;
int below_limit=1;
for (int i=0; i<nv2; ++i)
{
Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
Tm mask=vlt(d->sth[i],vzero);
vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5));
Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5));
auto mask=d->sth[i]<0;
where(mask&(d->cth[i]<0),cth2)*=-1.;
where(mask&(d->cth[i]>0),sth2)*=-1.;
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
@ -454,8 +454,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
d->l1p[i] = vzero;
d->l1m[i] = vzero;
d->l1p[i] = 0;
d->l1m[i] = 0;
d->l2p[i] = prefac*ccp;
d->scp[i] = prescale+ccps;
d->l2m[i] = prefac*csp;
@ -467,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
d->l2m[i] *= scp;
d->scm[i] += scps;
if (gen->preMinus_p)
d->l2p[i] = vneg(d->l2p[i]);
d->l2p[i] = -d->l2p[i];
if (gen->preMinus_m)
d->l2m[i] = vneg(d->l2m[i]);
d->l2m[i] = -d->l2m[i];
if (gen->s&1)
d->l2p[i] = vneg(d->l2p[i]);
d->l2p[i] = -d->l2p[i];
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
vallTrue(vlt(d->scp[i],limscale));
below_limit &= all_of(d->scm[i]<limscale) &&
all_of(d->scp[i]<limscale);
}
int l=gen->mhi;
@ -486,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1;
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
vallTrue(vlt(d->scm[i],limscale));
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol))
below_limit &= all_of(d->scp[i]<limscale) &&
all_of(d->scm[i]<limscale);
}
l+=2;
}
@ -505,19 +505,19 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
*l_=l;
}
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave = l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -537,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -571,25 +571,25 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -611,12 +611,12 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
l+=2;
}
@ -641,17 +641,17 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
}
}
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -672,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -705,15 +705,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
for (int i=0; i<nv2; ++i)
{
@ -726,10 +726,10 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -748,12 +748,12 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@ -772,17 +772,17 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
}
NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -798,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -826,23 +826,23 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -864,12 +864,12 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
l+=2;
}
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
if (job->spin==0)
{
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
{
dcmplx al = alm[l];
@ -963,8 +963,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2;
if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1027,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{
int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2(d.s.p2pr[i], d.s.p2pi[i]),
u1(d.s.p1mr[i], d.s.p1mi[i]),
u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt])
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
}
}
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -1083,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
int phas_idx = ith*job->s_th + mi*job->s_m;
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag();
d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag();
//adjust for new algorithm
d.s.p2r[nth]*=cth_[ith];
d.s.p2i[nth]*=cth_[ith];
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
}
}
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
dcmplx alm2 = 0.;
double alold=0;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
@ -1138,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag();
d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag();
d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag();
d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag();
++nth;
}
++ith;

libsharp2/sharp_geomhelpers.cc

@ -29,7 +29,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_legendre_roots.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/pocketfft.h"
#include "mr_util/fft.h"
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info)
@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
}
if ((nrings&1)==0) weight[nrings-1]=0.;
pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
for (int m=0; m<(nrings+1)/2; ++m)
{
@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
weight[n]=weight[0];
for (int m=0; m<(nrings+1)/2; ++m)
@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1];
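
Note: all three quadrature-weight computations drop their plan objects in favour of a single r2r_fftpack call, in place and in the same FFTPACK ("halfcomplex") storage the old backward transform used. The argument interpretation below is inferred from the call sites, not from mr_util documentation:

#include <cstddef>
#include "mr_util/fft.h"

// hypothetical wrapper mirroring the three call sites above:
void fftpack_backward_inplace(double *weight, int n)
  {
  mr::r2r_fftpack({std::size_t(n)},      // shape
    {sizeof(double)}, {sizeof(double)},  // input/output strides, apparently in bytes
    {0}, false, false,                   // axis list and two direction/format flags
    weight, weight, 1.);                 // in place, scale factor 1
  }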

libsharp2/sharp_internal.h

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include <complex>
#include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct
{
sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map;
void **alm;
int s_m, s_th; // strides in m and theta direction
complex double *phase;
complex<double> *phase;
double *norm_l;
complex double *almtmp;
complex<double> *almtmp;
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
double time;

libsharp2/sharp_utils.h

@ -122,10 +122,4 @@ double sharp_wallTime(void);
}
#endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif
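
Note: the GCC-only NOINLINE macro removed here is superseded by MRUTIL_NOINLINE (and plain C99 "restrict" by MRUTIL_RESTRICT) from "mr_util/useful_macros.h". Their definitions are not part of this diff; presumably they follow the same idea, along these lines:

#ifdef __GNUC__
#define MRUTIL_NOINLINE __attribute__((noinline))
#define MRUTIL_RESTRICT __restrict__
#else
#define MRUTIL_NOINLINE
#define MRUTIL_RESTRICT
#endif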

libsharp2/sharp_vecsupport.h

@ -28,193 +28,27 @@
#ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H
#include <math.h>
#include <cmath>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;
#ifndef VLEN
#include "mr_util/useful_macros.h"
#if (defined(__AVX512F__))
#define VLEN 8
#elif (defined (__AVX__))
#define VLEN 4
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif
#endif
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
typedef int Tm;
using Tv=native_simd<double>;
using Tm=Tv::mask_type;
using Ts=Tv::value_type;
static constexpr size_t VLEN=Tv::size();
#define vload(a) (a)
#define vzero 0.
#define vone 1.
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vneg(a) (-(a))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) (a)
#define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#endif
#if (VLEN==2)
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
typedef __m128d Tm;
#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vload(a) _mm_set1_pd(a)
#define vzero _mm_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm_xor_pd(vload(-0.),a)
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
complex<double> * MRUTIL_RESTRICT cc)
{
union {Tv v; _Complex double c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
#endif
cc[0]+=u1.c; cc[1]+=u2.c;
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));
}
#endif
#if (VLEN==4)
#include <immintrin.h>
typedef __m256d Tv;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vload(a) _mm256_set1_pd(a)
#define vzero _mm256_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}
#endif
#if (VLEN==8)
#include <immintrin.h>
typedef __m512d Tv;
typedef __mmask8 Tm;
#define vload(a) _mm512_set1_pd(a)
#define vzero _mm512_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
}
#endif
#endif
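
This hunk is the heart of the vectorization cleanup: four hand-written back ends (scalar, SSE2, AVX, AVX-512) are replaced by the Parallelism TS v2 types, so VLEN is now simply whatever native_simd<double> provides on the target. A minimal sketch of the replacement idioms — where() expressions instead of the v*eq_mask macros, and reduce() instead of the per-ISA horizontal sums — assuming an <experimental/simd> implementation is available:

#include <complex>
#include <experimental/simd>
namespace stdx = std::experimental;

using Tv = stdx::native_simd<double>;
using Tm = Tv::mask_type;

// Masked in-place add: replaces vaddeq_mask (vsubeq_mask and vmuleq_mask
// work the same way with -= and *=).
static void addeq_mask(Tm mask, Tv &a, Tv b)
  { where(mask, a) += b; }

// Horizontal sums into two complex accumulators: replaces the per-ISA
// vhsum_cmplx_special variants, matching the new code above.
static void hsum_cmplx(Tv a, Tv b, Tv c, Tv d, std::complex<double> *cc)
  {
  cc[0] += std::complex<double>(reduce(a, std::plus<>()), reduce(b, std::plus<>()));
  cc[1] += std::complex<double>(reduce(c, std::plus<>()), reduce(d, std::plus<>()));
  }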

View file

@@ -26,7 +26,8 @@
#include <stdio.h>
#include <string.h>
#include <complex.h>
#include <complex>
using std::complex;
#ifdef USE_MPI
#include "mpi.h"
#include "libsharp2/sharp_mpi.h"
@@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c"
#include "libsharp2/sharp_utils.cc"
#include "test/memusage.h"
static void OpenMP_status(void)
@@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1);
}
typedef complex double dcmplx;
typedef complex<double> dcmplx;
int ntasks, mytask;
@@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{
double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
}
}
}
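
Worth noting in the hunk above: C's rv+_Complex_I*iv has no direct std::complex counterpart, since C++ provides no _Complex_I macro (C++14's 1.0i literal would be an alternative); the two-argument constructor is the drop-in replacement, and the same change recurs in check_sign_scale below. A tiny sketch:

#include <complex>
using dcmplx = std::complex<double>;

// was: alm[...] = rv + _Complex_I*iv;
static dcmplx make_alm(double rv, double iv)
  { return dcmplx(rv, iv); }
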
@@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j)
{
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
sqsum[i]+=norm(alm[i][j]);
alm[i][j]=-alm[i][j];
}
}
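
std::norm() returns the squared magnitude re*re+im*im — despite the name, no square root is involved — so it is an exact replacement for the hand-written sums here and in get_errors() below. A quick self-contained check, as a sketch:

#include <cassert>
#include <complex>

int main()
  {
  std::complex<double> z(3., 4.);
  assert(std::norm(z) == 25.);  // re*re + im*im, not the magnitude 5
  }
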
@@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j)
{
double sqr=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
double sqr=norm(alm[i][j]);
sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr;
}
@@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);