Merge branch 'simplify' of gitlab.mpcdf.mpg.de:mtr/libsharp into simplify
This commit is contained in:
commit
57da18d154
17 changed files with 972 additions and 723 deletions
|
@ -16,17 +16,14 @@ src_sharp = \
|
|||
libsharp/sharp_ylmgen_c.c \
|
||||
libsharp/sharp_announce.h \
|
||||
libsharp/sharp_complex_hacks.h \
|
||||
libsharp/sharp_core.h \
|
||||
libsharp/sharp_internal.h \
|
||||
libsharp/sharp_legendre_roots.h \
|
||||
libsharp/sharp_lowlevel.h \
|
||||
libsharp/sharp_vecsupport.h \
|
||||
libsharp/sharp_vecutil.h \
|
||||
libsharp/sharp_ylmgen_c.h
|
||||
|
||||
include_HEADERS = \
|
||||
libsharp/sharp.h \
|
||||
libsharp/sharp_lowlevel.h \
|
||||
libsharp/sharp_geomhelpers.h \
|
||||
libsharp/sharp_almhelpers.h \
|
||||
libsharp/sharp_cxx.h
|
||||
|
|
|
@ -35,7 +35,6 @@
|
|||
#include "sharp_ylmgen_c.h"
|
||||
#include "sharp_internal.h"
|
||||
#include "c_utils.h"
|
||||
#include "sharp_core.h"
|
||||
#include "walltime_c.h"
|
||||
#include "sharp_almhelpers.h"
|
||||
#include "sharp_geomhelpers.h"
|
||||
|
@ -588,7 +587,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
|
|||
}
|
||||
else
|
||||
memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
|
||||
job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
|
||||
job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));
|
||||
|
||||
#undef COPY_LOOP
|
||||
}
|
||||
|
@ -960,9 +959,6 @@ void sharp_set_chunksize_min(int new_chunksize_min)
|
|||
void sharp_set_nchunks_max(int new_nchunks_max)
|
||||
{ nchunks_max=new_nchunks_max; }
|
||||
|
||||
int sharp_get_nv_max (void)
|
||||
{ return 6; }
|
||||
|
||||
#ifdef USE_MPI
|
||||
#include "sharp_mpi.c"
|
||||
|
||||
|
|
243
libsharp/sharp.h
243
libsharp/sharp.h
|
@ -23,21 +23,248 @@
|
|||
*/
|
||||
|
||||
/*! \file sharp.h
|
||||
* Interface for the spherical transform library.
|
||||
* Portable interface for the spherical transform library.
|
||||
*
|
||||
* Copyright (C) 2006-2012 Max-Planck-Society
|
||||
* \author Martin Reinecke
|
||||
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||
*/
|
||||
|
||||
#ifndef PLANCK_SHARP_H
|
||||
#define PLANCK_SHARP_H
|
||||
#ifndef PLANCK_SHARP_LOWLEVEL_H
|
||||
#define PLANCK_SHARP_LOWLEVEL_H
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#error This header file cannot be included from C++, only from C
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <complex.h>
|
||||
/*! \internal
|
||||
Helper type containing information about a single ring. */
|
||||
typedef struct
|
||||
{
|
||||
double theta, phi0, weight, cth, sth;
|
||||
ptrdiff_t ofs;
|
||||
int nph, stride;
|
||||
} sharp_ringinfo;
|
||||
|
||||
#include "sharp_lowlevel.h"
|
||||
/*! \internal
|
||||
Helper type containing information about a pair of rings with colatitudes
|
||||
symmetric around the equator. */
|
||||
typedef struct
|
||||
{
|
||||
sharp_ringinfo r1,r2;
|
||||
} sharp_ringpair;
|
||||
|
||||
/*! \internal
|
||||
Type holding all required information about a map geometry. */
|
||||
typedef struct
|
||||
{
|
||||
sharp_ringpair *pair;
|
||||
int npairs, nphmax;
|
||||
} sharp_geom_info;
|
||||
|
||||
/*! \defgroup almgroup Helpers for dealing with a_lm */
|
||||
/*! \{ */
|
||||
|
||||
/*! \internal
|
||||
Helper type for index calculation in a_lm arrays. */
|
||||
typedef struct
|
||||
{
|
||||
/*! Maximum \a l index of the array */
|
||||
int lmax;
|
||||
/*! Number of different \a m values in this object */
|
||||
int nm;
|
||||
/*! Array with \a nm entries containing the individual m values */
|
||||
int *mval;
|
||||
/*! Combination of flags from sharp_almflags */
|
||||
int flags;
|
||||
/*! Array with \a nm entries containing the (hypothetical) indices of
|
||||
the coefficients with quantum numbers 0,\a mval[i] */
|
||||
ptrdiff_t *mvstart;
|
||||
/*! Stride between a_lm and a_(l+1),m */
|
||||
ptrdiff_t stride;
|
||||
} sharp_alm_info;
|
||||
|
||||
/*! alm_info flags */
|
||||
typedef enum { SHARP_PACKED = 1,
|
||||
/*!< m=0-coefficients are packed so that the (zero) imaginary part is
|
||||
not present. mvstart is in units of *real* float/double for all
|
||||
m; stride is in units of reals for m=0 and complex for m!=0 */
|
||||
SHARP_REAL_HARMONICS = 1<<6
|
||||
/*!< Use the real spherical harmonic convention. For
|
||||
m==0, the alm are treated exactly the same as in
|
||||
the complex case. For m!=0, alm[i] represent a
|
||||
pair (+abs(m), -abs(m)) instead of (real, imag),
|
||||
and the coefficients are scaled by a factor of
|
||||
sqrt(2) relative to the complex case. In other
|
||||
words, (sqrt(.5) * alm[i]) recovers the
|
||||
corresponding complex coefficient (when accessed
|
||||
as complex).
|
||||
*/
|
||||
} sharp_almflags;
|
||||
|
||||
|
||||
|
||||
/*! Creates an a_lm data structure from the following parameters:
|
||||
\param lmax maximum \a l quantum number (>=0)
|
||||
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
|
||||
\param stride the stride between entries with identical \a m, and \a l
|
||||
differing by 1.
|
||||
\param mstart the index of the (hypothetical) coefficient with the
|
||||
quantum numbers 0,\a m. Must have \a mmax+1 entries.
|
||||
\param alm_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_alm_info (int lmax, int mmax, int stride,
|
||||
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
|
||||
/*! Creates an a_lm data structure which from the following parameters:
|
||||
\param lmax maximum \a l quantum number (\a >=0)
|
||||
\param nm number of different \a m (\a 0<=nm<=lmax+1)
|
||||
\param stride the stride between entries with identical \a m, and \a l
|
||||
differing by 1.
|
||||
\param mval array with \a nm entries containing the individual m values
|
||||
\param mvstart array with \a nm entries containing the (hypothetical)
|
||||
indices of the coefficients with the quantum numbers 0,\a mval[i]
|
||||
\param flags a combination of sharp_almflags (pass 0 unless you know you need this)
|
||||
\param alm_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
|
||||
const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
|
||||
/*! Returns the index of the coefficient with quantum numbers \a l,
|
||||
\a mval[mi].
|
||||
\note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
|
||||
the index for the coefficient with the quantum numbers \a l, \a mi. */
|
||||
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
|
||||
/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
|
||||
flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
|
||||
otherwise it is the number of complex coefficients (with m>=0). */
|
||||
ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
|
||||
/*! Deallocates the a_lm info object. */
|
||||
void sharp_destroy_alm_info (sharp_alm_info *info);
|
||||
|
||||
/*! \} */
|
||||
|
||||
/*! \defgroup geominfogroup Functions for dealing with geometry information */
|
||||
/*! \{ */
|
||||
|
||||
/*! Creates a geometry information from a set of ring descriptions.
|
||||
All arrays passed to this function must have \a nrings elements.
|
||||
\param nrings the number of rings in the map
|
||||
\param nph the number of pixels in each ring
|
||||
\param ofs the index of the first pixel in each ring in the map array
|
||||
\param stride the stride between consecutive pixels
|
||||
\param phi0 the azimuth (in radians) of the first pixel in each ring
|
||||
\param theta the colatitude (in radians) of each ring
|
||||
\param wgt the pixel weight to be used for the ring in map2alm
|
||||
and adjoint map2alm transforms.
|
||||
Pass NULL to use 1.0 as weight for all rings.
|
||||
\param geom_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
|
||||
const int *stride, const double *phi0, const double *theta,
|
||||
const double *wgt, sharp_geom_info **geom_info);
|
||||
|
||||
/*! Counts the number of grid points needed for (the local part of) a map described
|
||||
by \a info.
|
||||
*/
|
||||
ptrdiff_t sharp_map_size(const sharp_geom_info *info);
|
||||
|
||||
/*! Deallocates the geometry information in \a info. */
|
||||
void sharp_destroy_geom_info (sharp_geom_info *info);
|
||||
|
||||
/*! \} */
|
||||
|
||||
/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
|
||||
/*! \{ */
|
||||
|
||||
/*! Enumeration of SHARP job types. */
|
||||
typedef enum { SHARP_YtW=0, /*!< analysis */
|
||||
SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */
|
||||
SHARP_Y=1, /*!< synthesis */
|
||||
SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */
|
||||
SHARP_Yt=2, /*!< adjoint synthesis */
|
||||
SHARP_WY=3, /*!< adjoint analysis */
|
||||
SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */
|
||||
} sharp_jobtype;
|
||||
|
||||
/*! Job flags */
|
||||
typedef enum { SHARP_DP = 1<<4,
|
||||
/*!< map and a_lm are in double precision */
|
||||
SHARP_ADD = 1<<5,
|
||||
/*!< results are added to the output arrays, instead of
|
||||
overwriting them */
|
||||
|
||||
/* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
|
||||
but its use here is deprecated in favor of having it in the sharp_alm_info */
|
||||
|
||||
SHARP_NO_FFT = 1<<7,
|
||||
|
||||
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
|
||||
SHARP_NO_OPENMP = 1<<21, /* internal use only */
|
||||
} sharp_jobflags;
|
||||
|
||||
/*! Performs a libsharp SHT job. The interface deliberately does not use
|
||||
the C99 "complex" data type, in order to be callable from C89 and C++.
|
||||
\param type the type of SHT
|
||||
\param spin the spin of the quantities to be transformed
|
||||
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
|
||||
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
|
||||
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
|
||||
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
|
||||
depends on whether the SHARP_DP flag is set.
|
||||
\param map contains pointers to the maps. If \a spin==0,
|
||||
map[0] points to the map of the first SHT, map[1] to that of the second
|
||||
etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
|
||||
point to the maps of the first SHT, map[2] and map[3] to those of the
|
||||
second, etc. The exact data type of \a map depends on whether the SHARP_DP
|
||||
flag is set.
|
||||
\param geom_info A \c sharp_geom_info object compatible with the provided
|
||||
\a map arrays.
|
||||
\param alm_info A \c sharp_alm_info object compatible with the provided
|
||||
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
|
||||
exactly once.
|
||||
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
|
||||
\a alm is expected to have the type "complex double **" and \a map is
|
||||
expected to have the type "double **"; otherwise, the expected
|
||||
types are "complex float **" and "float **", respectively.
|
||||
\param time If not NULL, the wall clock time required for this SHT
|
||||
(in seconds) will be written here.
|
||||
\param opcnt If not NULL, a conservative estimate of the total floating point
|
||||
operation count for this SHT will be written here. */
|
||||
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
||||
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
|
||||
int flags, double *time, unsigned long long *opcnt);
|
||||
|
||||
void sharp_set_chunksize_min(int new_chunksize_min);
|
||||
void sharp_set_nchunks_max(int new_nchunks_max);
|
||||
|
||||
|
||||
typedef enum { SHARP_ERROR_NO_MPI = 1,
|
||||
/*!< libsharp not compiled with MPI support */
|
||||
} sharp_errors;
|
||||
|
||||
/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
|
||||
is compiled with USE_MPI. This is primarily useful for wrapper code etc.
|
||||
|
||||
Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
|
||||
pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
|
||||
is *(MPI_Comm*)pcomm.
|
||||
|
||||
Other parameters are the same as sharp_execute_mpi.
|
||||
|
||||
Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
|
||||
(in which case nothing is done).
|
||||
*/
|
||||
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
|
||||
void *alm, void *map, const sharp_geom_info *geom_info,
|
||||
const sharp_alm_info *alm_info, int flags, double *time,
|
||||
unsigned long long *opcnt);
|
||||
|
||||
|
||||
|
||||
/*! \} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
#ifndef PLANCK_SHARP_ALMHELPERS_H
|
||||
#define PLANCK_SHARP_ALMHELPERS_H
|
||||
|
||||
#include "sharp_lowlevel.h"
|
||||
#include "sharp.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -40,7 +40,7 @@
|
|||
#endif
|
||||
|
||||
#include "sharp_announce.h"
|
||||
#include "sharp_core.h"
|
||||
#include "sharp_internal.h"
|
||||
|
||||
static void OpenMP_status(void)
|
||||
{
|
||||
|
|
|
@ -25,131 +25,126 @@
|
|||
/* \file sharp_complex_hacks.h
|
||||
* support for converting vector types and complex numbers
|
||||
*
|
||||
* Copyright (C) 2012-2016 Max-Planck-Society
|
||||
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||
* Author: Martin Reinecke
|
||||
*/
|
||||
|
||||
#ifndef SHARP_COMPLEX_HACKS_H
|
||||
#define SHARP_COMPLEX_HACKS_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
#error This header file cannot be included from C++, only from C
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
#include "sharp_vecsupport.h"
|
||||
|
||||
#define UNSAFE_CODE
|
||||
|
||||
#if (VLEN==1)
|
||||
|
||||
static inline complex double vhsum_cmplx(Tv a, Tv b)
|
||||
static inline _Complex double vhsum_cmplx(Tv a, Tv b)
|
||||
{ return a+_Complex_I*b; }
|
||||
|
||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict c1, complex double * restrict c2)
|
||||
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||
{ *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
|
||||
|
||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict cc)
|
||||
_Complex double * restrict cc)
|
||||
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
|
||||
|
||||
#endif
|
||||
|
||||
#if (VLEN==2)
|
||||
|
||||
static inline complex double vhsum_cmplx (Tv a, Tv b)
|
||||
static inline _Complex double vhsum_cmplx (Tv a, Tv b)
|
||||
{
|
||||
#if defined(__SSE3__)
|
||||
Tv tmp = _mm_hadd_pd(a,b);
|
||||
#else
|
||||
Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
|
||||
Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||
#endif
|
||||
union {Tv v; complex double c; } u;
|
||||
union {Tv v; _Complex double c; } u;
|
||||
u.v=tmp; return u.c;
|
||||
}
|
||||
|
||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
|
||||
Tv d, complex double * restrict c1, complex double * restrict c2)
|
||||
Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
|
||||
{
|
||||
#ifdef UNSAFE_CODE
|
||||
#if defined(__SSE3__)
|
||||
vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
|
||||
vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
|
||||
*((__m128d *)c1) += _mm_hadd_pd(a,b);
|
||||
*((__m128d *)c2) += _mm_hadd_pd(c,d);
|
||||
#else
|
||||
vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
|
||||
vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
|
||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
|
||||
*((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||
*((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
|
||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
|
||||
#endif
|
||||
#else
|
||||
union {Tv v; complex double c; } u1, u2;
|
||||
union {Tv v; _Complex double c; } u1, u2;
|
||||
#if defined(__SSE3__)
|
||||
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
||||
#else
|
||||
u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
|
||||
u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
|
||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
|
||||
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
|
||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
|
||||
#endif
|
||||
*c1+=u1.c; *c2+=u2.c;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict cc)
|
||||
_Complex double * restrict cc)
|
||||
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
||||
|
||||
#endif
|
||||
|
||||
#if (VLEN==4)
|
||||
|
||||
static inline complex double vhsum_cmplx (Tv a, Tv b)
|
||||
static inline _Complex double vhsum_cmplx (Tv a, Tv b)
|
||||
{
|
||||
Tv tmp=_mm256_hadd_pd(a,b);
|
||||
Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
|
||||
tmp=_mm256_add_pd(tmp,tmp2);
|
||||
#ifdef UNSAFE_CODE
|
||||
complex double ret;
|
||||
_Complex double ret;
|
||||
*((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
|
||||
return ret;
|
||||
#else
|
||||
union {Tv v; complex double c[2]; } u;
|
||||
union {Tv v; _Complex double c[2]; } u;
|
||||
u.v=tmp; return u.c[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict c1, complex double * restrict c2)
|
||||
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||
{
|
||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
||||
tmp1=vadd(tmp3,tmp4);
|
||||
tmp1=tmp3+tmp4;
|
||||
#ifdef UNSAFE_CODE
|
||||
*((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
|
||||
*((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
|
||||
#else
|
||||
union {Tv v; complex double c[2]; } u;
|
||||
union {Tv v; _Complex double c[2]; } u;
|
||||
u.v=tmp1;
|
||||
*c1+=u.c[0]; *c2+=u.c[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict cc)
|
||||
_Complex double * restrict cc)
|
||||
{
|
||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
||||
tmp1=vadd(tmp3,tmp4);
|
||||
tmp1=tmp3+tmp4;
|
||||
#ifdef UNSAFE_CODE
|
||||
_mm256_storeu_pd((double *)cc,
|
||||
_mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
|
||||
#else
|
||||
union {Tv v; complex double c[2]; } u;
|
||||
union {Tv v; _Complex double c[2]; } u;
|
||||
u.v=tmp1;
|
||||
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
||||
#endif
|
||||
|
@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
|||
|
||||
#if (VLEN==8)
|
||||
|
||||
static inline complex double vhsum_cmplx(Tv a, Tv b)
|
||||
static inline _Complex double vhsum_cmplx(Tv a, Tv b)
|
||||
{ return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
|
||||
|
||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict c1, complex double * restrict c2)
|
||||
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||
{
|
||||
*c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
||||
*c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
||||
}
|
||||
|
||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||
complex double * restrict cc)
|
||||
_Complex double * restrict cc)
|
||||
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,53 +0,0 @@
|
|||
/*
|
||||
* This file is part of libsharp.
|
||||
*
|
||||
* libsharp is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* libsharp is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with libsharp; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/*
|
||||
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
|
||||
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
||||
* (DLR).
|
||||
*/
|
||||
|
||||
/*! \file sharp_core.h
|
||||
* Interface for the computational core
|
||||
*
|
||||
* Copyright (C) 2012-2013 Max-Planck-Society
|
||||
* \author Martin Reinecke
|
||||
*/
|
||||
|
||||
#ifndef PLANCK_SHARP_CORE_H
|
||||
#define PLANCK_SHARP_CORE_H
|
||||
|
||||
#include "sharp_internal.h"
|
||||
#include "sharp_ylmgen_c.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
|
||||
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
|
||||
const int *mlim);
|
||||
|
||||
int sharp_veclen(void);
|
||||
int sharp_max_nvec(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -33,7 +33,7 @@
|
|||
#define PLANCK_SHARP_CXX_H
|
||||
|
||||
#include <complex>
|
||||
#include "sharp_lowlevel.h"
|
||||
#include "sharp.h"
|
||||
#include "sharp_geomhelpers.h"
|
||||
#include "sharp_almhelpers.h"
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
#ifndef PLANCK_SHARP_GEOMHELPERS_H
|
||||
#define PLANCK_SHARP_GEOMHELPERS_H
|
||||
|
||||
#include "sharp_lowlevel.h"
|
||||
#include "sharp.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
/*! \file sharp_internal.h
|
||||
* Internally used functionality for the spherical transform library.
|
||||
*
|
||||
* Copyright (C) 2006-2013 Max-Planck-Society
|
||||
* Copyright (C) 2006-2018 Max-Planck-Society
|
||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||
*/
|
||||
|
||||
|
@ -36,7 +36,9 @@
|
|||
#error This header file cannot be included from C++, only from C
|
||||
#endif
|
||||
|
||||
#include <complex.h>
|
||||
#include "sharp.h"
|
||||
#include "sharp_ylmgen_c.h"
|
||||
|
||||
#define SHARP_MAXTRANS 100
|
||||
|
||||
|
@ -58,8 +60,13 @@ typedef struct
|
|||
unsigned long long opcnt;
|
||||
} sharp_job;
|
||||
|
||||
int sharp_get_nv_max (void);
|
||||
int sharp_nv_oracle (sharp_jobtype type, int spin);
|
||||
int sharp_get_mlim (int lmax, int spin, double sth, double cth);
|
||||
|
||||
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
|
||||
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
|
||||
const int *mlim);
|
||||
|
||||
int sharp_veclen(void);
|
||||
int sharp_max_nvec(void);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,270 +0,0 @@
|
|||
/*
|
||||
* This file is part of libsharp.
|
||||
*
|
||||
* libsharp is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* libsharp is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with libsharp; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/*
|
||||
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
|
||||
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
||||
* (DLR).
|
||||
*/
|
||||
|
||||
/*! \file sharp_lowlevel.h
|
||||
* Low-level, portable interface for the spherical transform library.
|
||||
*
|
||||
* Copyright (C) 2012-2013 Max-Planck-Society
|
||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||
*/
|
||||
|
||||
#ifndef PLANCK_SHARP_LOWLEVEL_H
|
||||
#define PLANCK_SHARP_LOWLEVEL_H
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*! \internal
|
||||
Helper type containing information about a single ring. */
|
||||
typedef struct
|
||||
{
|
||||
double theta, phi0, weight, cth, sth;
|
||||
ptrdiff_t ofs;
|
||||
int nph, stride;
|
||||
} sharp_ringinfo;
|
||||
|
||||
/*! \internal
|
||||
Helper type containing information about a pair of rings with colatitudes
|
||||
symmetric around the equator. */
|
||||
typedef struct
|
||||
{
|
||||
sharp_ringinfo r1,r2;
|
||||
} sharp_ringpair;
|
||||
|
||||
/*! \internal
|
||||
Type holding all required information about a map geometry. */
|
||||
typedef struct
|
||||
{
|
||||
sharp_ringpair *pair;
|
||||
int npairs, nphmax;
|
||||
} sharp_geom_info;
|
||||
|
||||
/*! \defgroup almgroup Helpers for dealing with a_lm */
|
||||
/*! \{ */
|
||||
|
||||
/*! \internal
|
||||
Helper type for index calculation in a_lm arrays. */
|
||||
typedef struct
|
||||
{
|
||||
/*! Maximum \a l index of the array */
|
||||
int lmax;
|
||||
/*! Number of different \a m values in this object */
|
||||
int nm;
|
||||
/*! Array with \a nm entries containing the individual m values */
|
||||
int *mval;
|
||||
/*! Combination of flags from sharp_almflags */
|
||||
int flags;
|
||||
/*! Array with \a nm entries containing the (hypothetical) indices of
|
||||
the coefficients with quantum numbers 0,\a mval[i] */
|
||||
ptrdiff_t *mvstart;
|
||||
/*! Stride between a_lm and a_(l+1),m */
|
||||
ptrdiff_t stride;
|
||||
} sharp_alm_info;
|
||||
|
||||
/*! alm_info flags */
|
||||
typedef enum { SHARP_PACKED = 1,
|
||||
/*!< m=0-coefficients are packed so that the (zero) imaginary part is
|
||||
not present. mvstart is in units of *real* float/double for all
|
||||
m; stride is in units of reals for m=0 and complex for m!=0 */
|
||||
SHARP_REAL_HARMONICS = 1<<6
|
||||
/*!< Use the real spherical harmonic convention. For
|
||||
m==0, the alm are treated exactly the same as in
|
||||
the complex case. For m!=0, alm[i] represent a
|
||||
pair (+abs(m), -abs(m)) instead of (real, imag),
|
||||
and the coefficients are scaled by a factor of
|
||||
sqrt(2) relative to the complex case. In other
|
||||
words, (sqrt(.5) * alm[i]) recovers the
|
||||
corresponding complex coefficient (when accessed
|
||||
as complex).
|
||||
*/
|
||||
} sharp_almflags;
|
||||
|
||||
|
||||
|
||||
/*! Creates an a_lm data structure from the following parameters:
|
||||
\param lmax maximum \a l quantum number (>=0)
|
||||
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
|
||||
\param stride the stride between entries with identical \a m, and \a l
|
||||
differing by 1.
|
||||
\param mstart the index of the (hypothetical) coefficient with the
|
||||
quantum numbers 0,\a m. Must have \a mmax+1 entries.
|
||||
\param alm_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_alm_info (int lmax, int mmax, int stride,
|
||||
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
|
||||
/*! Creates an a_lm data structure which from the following parameters:
|
||||
\param lmax maximum \a l quantum number (\a >=0)
|
||||
\param nm number of different \a m (\a 0<=nm<=lmax+1)
|
||||
\param stride the stride between entries with identical \a m, and \a l
|
||||
differing by 1.
|
||||
\param mval array with \a nm entries containing the individual m values
|
||||
\param mvstart array with \a nm entries containing the (hypothetical)
|
||||
indices of the coefficients with the quantum numbers 0,\a mval[i]
|
||||
\param flags a combination of sharp_almflags (pass 0 unless you know you need this)
|
||||
\param alm_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
|
||||
const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
|
||||
/*! Returns the index of the coefficient with quantum numbers \a l,
|
||||
\a mval[mi].
|
||||
\note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
|
||||
the index for the coefficient with the quantum numbers \a l, \a mi. */
|
||||
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
|
||||
/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
|
||||
flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
|
||||
otherwise it is the number of complex coefficients (with m>=0). */
|
||||
ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
|
||||
/*! Deallocates the a_lm info object. */
|
||||
void sharp_destroy_alm_info (sharp_alm_info *info);
|
||||
|
||||
/*! \} */
|
||||
|
||||
/*! \defgroup geominfogroup Functions for dealing with geometry information */
|
||||
/*! \{ */
|
||||
|
||||
/*! Creates a geometry information from a set of ring descriptions.
|
||||
All arrays passed to this function must have \a nrings elements.
|
||||
\param nrings the number of rings in the map
|
||||
\param nph the number of pixels in each ring
|
||||
\param ofs the index of the first pixel in each ring in the map array
|
||||
\param stride the stride between consecutive pixels
|
||||
\param phi0 the azimuth (in radians) of the first pixel in each ring
|
||||
\param theta the colatitude (in radians) of each ring
|
||||
\param wgt the pixel weight to be used for the ring in map2alm
|
||||
and adjoint map2alm transforms.
|
||||
Pass NULL to use 1.0 as weight for all rings.
|
||||
\param geom_info will hold a pointer to the newly created data structure
|
||||
*/
|
||||
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
|
||||
const int *stride, const double *phi0, const double *theta,
|
||||
const double *wgt, sharp_geom_info **geom_info);
|
||||
|
||||
/*! Counts the number of grid points needed for (the local part of) a map described
|
||||
by \a info.
|
||||
*/
|
||||
ptrdiff_t sharp_map_size(const sharp_geom_info *info);
|
||||
|
||||
/*! Deallocates the geometry information in \a info. */
|
||||
void sharp_destroy_geom_info (sharp_geom_info *info);
|
||||
|
||||
/*! \} */
|
||||
|
||||
/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
|
||||
/*! \{ */
|
||||
|
||||
/*! Enumeration of SHARP job types. */
|
||||
typedef enum { SHARP_YtW=0, /*!< analysis */
|
||||
SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */
|
||||
SHARP_Y=1, /*!< synthesis */
|
||||
SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */
|
||||
SHARP_Yt=2, /*!< adjoint synthesis */
|
||||
SHARP_WY=3, /*!< adjoint analysis */
|
||||
SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */
|
||||
} sharp_jobtype;
|
||||
|
||||
/*! Job flags */
|
||||
typedef enum { SHARP_DP = 1<<4,
|
||||
/*!< map and a_lm are in double precision */
|
||||
SHARP_ADD = 1<<5,
|
||||
/*!< results are added to the output arrays, instead of
|
||||
overwriting them */
|
||||
|
||||
/* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
|
||||
but its use here is deprecated in favor of having it in the sharp_alm_info */
|
||||
|
||||
SHARP_NO_FFT = 1<<7,
|
||||
|
||||
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
|
||||
SHARP_NO_OPENMP = 1<<21, /* internal use only */
|
||||
} sharp_jobflags;
|
||||
|
||||
/*! Performs a libsharp SHT job. The interface deliberately does not use
|
||||
the C99 "complex" data type, in order to be callable from C89 and C++.
|
||||
\param type the type of SHT
|
||||
\param spin the spin of the quantities to be transformed
|
||||
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
|
||||
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
|
||||
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
|
||||
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
|
||||
depends on whether the SHARP_DP flag is set.
|
||||
\param map contains pointers to the maps. If \a spin==0,
|
||||
map[0] points to the map of the first SHT, map[1] to that of the second
|
||||
etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
|
||||
point to the maps of the first SHT, map[2] and map[3] to those of the
|
||||
second, etc. The exact data type of \a map depends on whether the SHARP_DP
|
||||
flag is set.
|
||||
\param geom_info A \c sharp_geom_info object compatible with the provided
|
||||
\a map arrays.
|
||||
\param alm_info A \c sharp_alm_info object compatible with the provided
|
||||
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
|
||||
exactly once.
|
||||
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
|
||||
\a alm is expected to have the type "complex double **" and \a map is
|
||||
expected to have the type "double **"; otherwise, the expected
|
||||
types are "complex float **" and "float **", respectively.
|
||||
\param time If not NULL, the wall clock time required for this SHT
|
||||
(in seconds) will be written here.
|
||||
\param opcnt If not NULL, a conservative estimate of the total floating point
|
||||
operation count for this SHT will be written here. */
|
||||
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
||||
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
|
||||
int flags, double *time, unsigned long long *opcnt);
|
||||
|
||||
void sharp_set_chunksize_min(int new_chunksize_min);
|
||||
void sharp_set_nchunks_max(int new_nchunks_max);
|
||||
|
||||
|
||||
typedef enum { SHARP_ERROR_NO_MPI = 1,
|
||||
/*!< libsharp not compiled with MPI support */
|
||||
} sharp_errors;
|
||||
|
||||
/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
|
||||
is compiled with USE_MPI. This is primarily useful for wrapper code etc.
|
||||
|
||||
Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
|
||||
pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
|
||||
is *(MPI_Comm*)pcomm.
|
||||
|
||||
Other parameters are the same as sharp_execute_mpi.
|
||||
|
||||
Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
|
||||
(in which case nothing is done).
|
||||
*/
|
||||
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
|
||||
void *alm, void *map, const sharp_geom_info *geom_info,
|
||||
const sharp_alm_info *alm_info, int flags, double *time,
|
||||
unsigned long long *opcnt);
|
||||
|
||||
|
||||
|
||||
/*! \} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -33,7 +33,7 @@
|
|||
#define PLANCK_SHARP_MPI_H
|
||||
|
||||
#include <mpi.h>
|
||||
#include "sharp_lowlevel.h"
|
||||
#include "sharp.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
/* \file sharp_testsuite.c
|
||||
*
|
||||
* Copyright (C) 2012-2013 Max-Planck-Society
|
||||
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||
* \author Martin Reinecke
|
||||
*/
|
||||
|
||||
|
@ -375,7 +375,6 @@ static void check_sign_scale(void)
|
|||
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
|
||||
"error");
|
||||
|
||||
#if 0
|
||||
sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
||||
NULL,NULL);
|
||||
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),
|
||||
|
@ -420,7 +419,6 @@ static void check_sign_scale(void)
|
|||
"error");
|
||||
UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
|
||||
"error");
|
||||
#endif
|
||||
|
||||
DEALLOC2D(map);
|
||||
DEALLOC2D(alm);
|
||||
|
@ -430,7 +428,7 @@ static void check_sign_scale(void)
|
|||
}
|
||||
|
||||
static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
|
||||
int spin, int nv, double **err_abs, double **err_rel,
|
||||
int spin, double **err_abs, double **err_rel,
|
||||
double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
|
||||
unsigned long long *op_m2a)
|
||||
{
|
||||
|
@ -450,20 +448,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
|
|||
|
||||
#ifdef USE_MPI
|
||||
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
|
||||
ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
|
||||
ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m);
|
||||
#else
|
||||
sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
|
||||
SHARP_DP|nv,t_a2m,op_a2m);
|
||||
SHARP_DP,t_a2m,op_a2m);
|
||||
#endif
|
||||
if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
|
||||
if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
|
||||
double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
|
||||
#ifdef USE_MPI
|
||||
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
|
||||
ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
|
||||
ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
|
||||
#else
|
||||
sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
|
||||
SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
|
||||
SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
|
||||
#endif
|
||||
if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
|
||||
if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
|
||||
|
@ -475,11 +473,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
|
|||
}
|
||||
|
||||
static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
|
||||
int spin, int nv)
|
||||
int spin)
|
||||
{
|
||||
int ncomp = (spin==0) ? 1 : 2;
|
||||
double *err_abs, *err_rel;
|
||||
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
|
||||
do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL,
|
||||
NULL, NULL);
|
||||
for (int i=0; i<ncomp; ++i)
|
||||
UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
|
||||
|
@ -501,16 +499,11 @@ static void sharp_acctest(void)
|
|||
sharp_alm_info *ainfo;
|
||||
int lmax=127, mmax=127, nlat=128, nlon=256;
|
||||
get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
|
||||
for (int nv=1; nv<=6; ++nv)
|
||||
{
|
||||
check_accuracy(ginfo,ainfo,0,nv);
|
||||
#if 0
|
||||
check_accuracy(ginfo,ainfo,1,nv);
|
||||
check_accuracy(ginfo,ainfo,2,nv);
|
||||
check_accuracy(ginfo,ainfo,3,nv);
|
||||
check_accuracy(ginfo,ainfo,30,nv);
|
||||
#endif
|
||||
}
|
||||
check_accuracy(ginfo,ainfo,0);
|
||||
check_accuracy(ginfo,ainfo,1);
|
||||
check_accuracy(ginfo,ainfo,2);
|
||||
check_accuracy(ginfo,ainfo,3);
|
||||
check_accuracy(ginfo,ainfo,30);
|
||||
sharp_destroy_alm_info(ainfo);
|
||||
sharp_destroy_geom_info(ginfo);
|
||||
if (mytask==0) printf("Passed.\n\n");
|
||||
|
@ -544,7 +537,7 @@ static void sharp_test (int argc, const char **argv)
|
|||
{
|
||||
++nrpt;
|
||||
double ta2m2, tm2a2;
|
||||
do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
|
||||
do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, &ta2m2, &tm2a2,
|
||||
&op_a2m, &op_m2a);
|
||||
if (ta2m2<t_a2m) t_a2m=ta2m2;
|
||||
if (tm2a2<t_m2a) t_m2a=tm2a2;
|
||||
|
@ -594,7 +587,7 @@ static void sharp_test (int argc, const char **argv)
|
|||
}
|
||||
|
||||
if (mytask==0)
|
||||
printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
|
||||
printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %.2e %7.2f %.2e %7.2f"
|
||||
" %9.2f %6.2f %.2e %.2e\n",
|
||||
getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2,
|
||||
t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
|
||||
|
@ -604,68 +597,6 @@ static void sharp_test (int argc, const char **argv)
|
|||
DEALLOC(err_rel);
|
||||
}
|
||||
|
||||
static void sharp_bench (int argc, const char **argv)
|
||||
{
|
||||
if (mytask==0) sharp_announce("sharp_bench");
|
||||
UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
|
||||
int lmax=atoi(argv[3]);
|
||||
int mmax=atoi(argv[4]);
|
||||
int gpar1=atoi(argv[5]);
|
||||
int gpar2=atoi(argv[6]);
|
||||
int spin=atoi(argv[7]);
|
||||
|
||||
if (mytask==0) printf("Testing map analysis accuracy.\n");
|
||||
if (mytask==0) printf("spin=%d\n", spin);
|
||||
|
||||
sharp_geom_info *ginfo;
|
||||
sharp_alm_info *ainfo;
|
||||
get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
|
||||
|
||||
double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30;
|
||||
unsigned long long opa2m_min=0, opm2a_min=0;
|
||||
int nvmin_a2m=-1, nvmin_m2a=-1;
|
||||
for (int nv=0; nv<=6; ++nv)
|
||||
{
|
||||
int ntries=0;
|
||||
double tacc=0;
|
||||
do
|
||||
{
|
||||
double t_a2m, t_m2a;
|
||||
unsigned long long op_a2m, op_m2a;
|
||||
double *err_abs,*err_rel;
|
||||
do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
|
||||
&t_a2m, &t_m2a, &op_a2m, &op_m2a);
|
||||
|
||||
DEALLOC(err_abs);
|
||||
DEALLOC(err_rel);
|
||||
tacc+=t_a2m+t_m2a;
|
||||
++ntries;
|
||||
if (nv==0)
|
||||
{
|
||||
if (t_a2m<ta2m_auto) ta2m_auto=t_a2m;
|
||||
if (t_m2a<tm2a_auto) tm2a_auto=t_m2a;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (t_a2m<ta2m_min) { nvmin_a2m=nv; ta2m_min=t_a2m; opa2m_min=op_a2m; }
|
||||
if (t_m2a<tm2a_min) { nvmin_m2a=nv; tm2a_min=t_m2a; opm2a_min=op_m2a; }
|
||||
}
|
||||
} while((ntries<2)||(tacc<3.));
|
||||
}
|
||||
if (mytask==0)
|
||||
{
|
||||
printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
|
||||
nvmin_a2m,ta2m_min,100.*(ta2m_auto-ta2m_min)/ta2m_auto,
|
||||
1e-9*opa2m_min/ta2m_min);
|
||||
printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
|
||||
nvmin_m2a,tm2a_min,100.*(tm2a_auto-tm2a_min)/tm2a_auto,
|
||||
1e-9*opm2a_min/tm2a_min);
|
||||
}
|
||||
|
||||
sharp_destroy_alm_info(ainfo);
|
||||
sharp_destroy_geom_info(ginfo);
|
||||
}
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
#ifdef USE_MPI
|
||||
|
@ -682,8 +613,6 @@ int main(int argc, const char **argv)
|
|||
sharp_acctest();
|
||||
else if (strcmp(argv[1],"test")==0)
|
||||
sharp_test(argc,argv);
|
||||
else if (strcmp(argv[1],"bench")==0)
|
||||
sharp_bench(argc,argv);
|
||||
else
|
||||
UTIL_FAIL("unknown command");
|
||||
|
||||
|
|
|
@ -40,32 +40,13 @@ typedef double Ts;
|
|||
#if (VLEN==1)
|
||||
|
||||
typedef double Tv;
|
||||
typedef float Tv_s;
|
||||
typedef int Tm;
|
||||
|
||||
#define vadd(a,b) ((a)+(b))
|
||||
#define vadd_s(a,b) ((a)+(b))
|
||||
#define vaddeq(a,b) ((a)+=(b))
|
||||
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
|
||||
#define vsub(a,b) ((a)-(b))
|
||||
#define vsub_s(a,b) ((a)-(b))
|
||||
#define vsubeq(a,b) ((a)-=(b))
|
||||
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
|
||||
#define vmul(a,b) ((a)*(b))
|
||||
#define vmul_s(a,b) ((a)*(b))
|
||||
#define vmuleq(a,b) ((a)*=(b))
|
||||
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
|
||||
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
|
||||
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
|
||||
#define vfmseq(a,b,c) ((a)-=(b)*(c))
|
||||
#define vabmc(a,b,c) ((a)*(b)-(c))
|
||||
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
|
||||
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
|
||||
#define vneg(a) (-(a))
|
||||
#define vload(a) (a)
|
||||
#define vload_s(a) (a)
|
||||
#define vloadu(p) (*(p))
|
||||
#define vloadu_s(p) (*(p))
|
||||
#define vabs(a) fabs(a)
|
||||
#define vsqrt(a) sqrt(a)
|
||||
#define vlt(a,b) ((a)<(b))
|
||||
|
@ -74,8 +55,6 @@ typedef int Tm;
|
|||
#define vne(a,b) ((a)!=(b))
|
||||
#define vand_mask(a,b) ((a)&&(b))
|
||||
#define vor_mask(a,b) ((a)||(b))
|
||||
#define vstoreu(p, a) (*(p)=a)
|
||||
#define vstoreu_s(p, a) (*(p)=a)
|
||||
|
||||
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
|
||||
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
||||
|
@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
|||
#endif
|
||||
|
||||
typedef __m128d Tv;
|
||||
typedef __m128 Tv_s;
|
||||
typedef __m128d Tm;
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
|||
#define vzero _mm_setzero_pd()
|
||||
#define vone _mm_set1_pd(1.)
|
||||
|
||||
#define vadd(a,b) _mm_add_pd(a,b)
|
||||
#define vadd_s(a,b) _mm_add_ps(a,b)
|
||||
#define vaddeq(a,b) a=_mm_add_pd(a,b)
|
||||
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
|
||||
#define vsub(a,b) _mm_sub_pd(a,b)
|
||||
#define vsub_s(a,b) _mm_sub_ps(a,b)
|
||||
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
|
||||
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
|
||||
#define vmul(a,b) _mm_mul_pd(a,b)
|
||||
#define vmul_s(a,b) _mm_mul_ps(a,b)
|
||||
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
|
||||
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
|
||||
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
|
||||
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
|
||||
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
|
||||
#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
|
||||
#define vfmaaeq(a,b,c,d,e) \
|
||||
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
|
||||
#define vfmaseq(a,b,c,d,e) \
|
||||
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
|
||||
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
|
||||
#define vload(a) _mm_set1_pd(a)
|
||||
#define vload_s(a) _mm_set1_ps(a)
|
||||
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
|
||||
#define vsqrt(a) _mm_sqrt_pd(a)
|
||||
#define vlt(a,b) _mm_cmplt_pd(a,b)
|
||||
|
@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
|||
#define vmax(a,b) _mm_max_pd(a,b);
|
||||
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
|
||||
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
||||
#define vloadu(p) _mm_loadu_pd(p)
|
||||
#define vloadu_s(p) _mm_loadu_ps(p)
|
||||
#define vstoreu(p, v) _mm_storeu_pd(p, v)
|
||||
#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
|||
#endif
|
||||
|
||||
typedef __m256d Tv;
|
||||
typedef __m256 Tv_s;
|
||||
typedef __m256d Tm;
|
||||
|
||||
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
|
||||
#define vzero _mm256_setzero_pd()
|
||||
#define vone _mm256_set1_pd(1.)
|
||||
|
||||
#define vadd(a,b) _mm256_add_pd(a,b)
|
||||
#define vadd_s(a,b) _mm256_add_ps(a,b)
|
||||
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
|
||||
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
|
||||
#define vsub(a,b) _mm256_sub_pd(a,b)
|
||||
#define vsub_s(a,b) _mm256_sub_ps(a,b)
|
||||
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
|
||||
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
|
||||
#define vmul(a,b) _mm256_mul_pd(a,b)
|
||||
#define vmul_s(a,b) _mm256_mul_ps(a,b)
|
||||
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
|
||||
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
|
||||
#if (USE_FMA4)
|
||||
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
|
||||
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
|
||||
#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||
#else
|
||||
#if (USE_FMA)
|
||||
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
|
||||
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
|
||||
#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
||||
#else
|
||||
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
|
||||
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
|
||||
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
|
||||
#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
|
||||
#define vfmaaeq(a,b,c,d,e) \
|
||||
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
|
||||
#define vfmaseq(a,b,c,d,e) \
|
||||
a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
|
||||
#endif
|
||||
#endif
|
||||
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
|
||||
#define vload(a) _mm256_set1_pd(a)
|
||||
#define vload_s(a) _mm256_set1_ps(a)
|
||||
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
|
||||
#define vsqrt(a) _mm256_sqrt_pd(a)
|
||||
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
|
||||
|
@ -222,11 +141,6 @@ typedef __m256d Tm;
|
|||
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
|
||||
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
||||
|
||||
#define vloadu(p) _mm256_loadu_pd(p)
|
||||
#define vloadu_s(p) _mm256_loadu_ps(p)
|
||||
#define vstoreu(p, v) _mm256_storeu_pd(p, v)
|
||||
#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
|
||||
|
||||
#endif
|
||||
|
||||
#if (VLEN==8)
|
||||
|
@ -236,20 +150,8 @@ typedef __m256d Tm;
|
|||
typedef __m512d Tv;
|
||||
typedef __mmask8 Tm;
|
||||
|
||||
#define vadd(a,b) _mm512_add_pd(a,b)
|
||||
#define vaddeq(a,b) a=_mm512_add_pd(a,b)
|
||||
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
|
||||
#define vsub(a,b) _mm512_sub_pd(a,b)
|
||||
#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
|
||||
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
|
||||
#define vmul(a,b) _mm512_mul_pd(a,b)
|
||||
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
|
||||
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
|
||||
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
|
||||
//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
|
||||
//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
|
||||
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
||||
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
||||
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
|
||||
#define vload(a) _mm512_set1_pd(a)
|
||||
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
/*! \file sharp_vecutil.h
|
||||
* Functionality related to vector instruction support
|
||||
*
|
||||
* Copyright (C) 2012,2013 Max-Planck-Society
|
||||
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||
* \author Martin Reinecke
|
||||
*/
|
||||
|
||||
|
|
|
@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
|
|||
else
|
||||
{
|
||||
gen->m=gen->mlo=gen->mhi=-1234567890;
|
||||
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
|
||||
for (int m=0; m<gen->lmax+2; ++m)
|
||||
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
|
||||
for (int m=0; m<gen->lmax+3; ++m)
|
||||
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
|
||||
ALLOC(gen->inv,double,gen->lmax+1);
|
||||
ALLOC(gen->inv,double,gen->lmax+2);
|
||||
gen->inv[0]=0;
|
||||
for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
|
||||
ALLOC(gen->flm1,double,2*gen->lmax+1);
|
||||
ALLOC(gen->flm2,double,2*gen->lmax+1);
|
||||
for (int m=0; m<2*gen->lmax+1; ++m)
|
||||
for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
|
||||
ALLOC(gen->flm1,double,2*gen->lmax+3);
|
||||
ALLOC(gen->flm2,double,2*gen->lmax+3);
|
||||
for (int m=0; m<2*gen->lmax+3; ++m)
|
||||
{
|
||||
gen->flm1[m] = sqrt(1./(m+1.));
|
||||
gen->flm2[m] = sqrt(m/(m+1.));
|
||||
|
@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
|
|||
|
||||
if (!ms_similar)
|
||||
{
|
||||
for (int l=gen->mhi; l<gen->lmax; ++l)
|
||||
for (int l=gen->mhi; l<gen->lmax+1; ++l)
|
||||
{
|
||||
double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
|
||||
*gen->flm1[l+gen->s]*gen->flm1[l-gen->s];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue