cleanup
This commit is contained in:
parent
c750162e2b
commit
0b8222393f
12 changed files with 281 additions and 428 deletions
|
@ -19,14 +19,12 @@ src_sharp = \
|
||||||
libsharp/sharp_core.h \
|
libsharp/sharp_core.h \
|
||||||
libsharp/sharp_internal.h \
|
libsharp/sharp_internal.h \
|
||||||
libsharp/sharp_legendre_roots.h \
|
libsharp/sharp_legendre_roots.h \
|
||||||
libsharp/sharp_lowlevel.h \
|
|
||||||
libsharp/sharp_vecsupport.h \
|
libsharp/sharp_vecsupport.h \
|
||||||
libsharp/sharp_vecutil.h \
|
libsharp/sharp_vecutil.h \
|
||||||
libsharp/sharp_ylmgen_c.h
|
libsharp/sharp_ylmgen_c.h
|
||||||
|
|
||||||
include_HEADERS = \
|
include_HEADERS = \
|
||||||
libsharp/sharp.h \
|
libsharp/sharp.h \
|
||||||
libsharp/sharp_lowlevel.h \
|
|
||||||
libsharp/sharp_geomhelpers.h \
|
libsharp/sharp_geomhelpers.h \
|
||||||
libsharp/sharp_almhelpers.h \
|
libsharp/sharp_almhelpers.h \
|
||||||
libsharp/sharp_cxx.h
|
libsharp/sharp_cxx.h
|
||||||
|
|
243
libsharp/sharp.h
243
libsharp/sharp.h
|
@ -23,21 +23,248 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*! \file sharp.h
|
/*! \file sharp.h
|
||||||
* Interface for the spherical transform library.
|
* Portable interface for the spherical transform library.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2006-2012 Max-Planck-Society
|
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||||
* \author Martin Reinecke
|
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef PLANCK_SHARP_H
|
#ifndef PLANCK_SHARP_LOWLEVEL_H
|
||||||
#define PLANCK_SHARP_H
|
#define PLANCK_SHARP_LOWLEVEL_H
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
#error This header file cannot be included from C++, only from C
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <complex.h>
|
/*! \internal
|
||||||
|
Helper type containing information about a single ring. */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
double theta, phi0, weight, cth, sth;
|
||||||
|
ptrdiff_t ofs;
|
||||||
|
int nph, stride;
|
||||||
|
} sharp_ringinfo;
|
||||||
|
|
||||||
#include "sharp_lowlevel.h"
|
/*! \internal
|
||||||
|
Helper type containing information about a pair of rings with colatitudes
|
||||||
|
symmetric around the equator. */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
sharp_ringinfo r1,r2;
|
||||||
|
} sharp_ringpair;
|
||||||
|
|
||||||
|
/*! \internal
|
||||||
|
Type holding all required information about a map geometry. */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
sharp_ringpair *pair;
|
||||||
|
int npairs, nphmax;
|
||||||
|
} sharp_geom_info;
|
||||||
|
|
||||||
|
/*! \defgroup almgroup Helpers for dealing with a_lm */
|
||||||
|
/*! \{ */
|
||||||
|
|
||||||
|
/*! \internal
|
||||||
|
Helper type for index calculation in a_lm arrays. */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
/*! Maximum \a l index of the array */
|
||||||
|
int lmax;
|
||||||
|
/*! Number of different \a m values in this object */
|
||||||
|
int nm;
|
||||||
|
/*! Array with \a nm entries containing the individual m values */
|
||||||
|
int *mval;
|
||||||
|
/*! Combination of flags from sharp_almflags */
|
||||||
|
int flags;
|
||||||
|
/*! Array with \a nm entries containing the (hypothetical) indices of
|
||||||
|
the coefficients with quantum numbers 0,\a mval[i] */
|
||||||
|
ptrdiff_t *mvstart;
|
||||||
|
/*! Stride between a_lm and a_(l+1),m */
|
||||||
|
ptrdiff_t stride;
|
||||||
|
} sharp_alm_info;
|
||||||
|
|
||||||
|
/*! alm_info flags */
|
||||||
|
typedef enum { SHARP_PACKED = 1,
|
||||||
|
/*!< m=0-coefficients are packed so that the (zero) imaginary part is
|
||||||
|
not present. mvstart is in units of *real* float/double for all
|
||||||
|
m; stride is in units of reals for m=0 and complex for m!=0 */
|
||||||
|
SHARP_REAL_HARMONICS = 1<<6
|
||||||
|
/*!< Use the real spherical harmonic convention. For
|
||||||
|
m==0, the alm are treated exactly the same as in
|
||||||
|
the complex case. For m!=0, alm[i] represent a
|
||||||
|
pair (+abs(m), -abs(m)) instead of (real, imag),
|
||||||
|
and the coefficients are scaled by a factor of
|
||||||
|
sqrt(2) relative to the complex case. In other
|
||||||
|
words, (sqrt(.5) * alm[i]) recovers the
|
||||||
|
corresponding complex coefficient (when accessed
|
||||||
|
as complex).
|
||||||
|
*/
|
||||||
|
} sharp_almflags;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! Creates an a_lm data structure from the following parameters:
|
||||||
|
\param lmax maximum \a l quantum number (>=0)
|
||||||
|
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
|
||||||
|
\param stride the stride between entries with identical \a m, and \a l
|
||||||
|
differing by 1.
|
||||||
|
\param mstart the index of the (hypothetical) coefficient with the
|
||||||
|
quantum numbers 0,\a m. Must have \a mmax+1 entries.
|
||||||
|
\param alm_info will hold a pointer to the newly created data structure
|
||||||
|
*/
|
||||||
|
void sharp_make_alm_info (int lmax, int mmax, int stride,
|
||||||
|
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
|
||||||
|
/*! Creates an a_lm data structure which from the following parameters:
|
||||||
|
\param lmax maximum \a l quantum number (\a >=0)
|
||||||
|
\param nm number of different \a m (\a 0<=nm<=lmax+1)
|
||||||
|
\param stride the stride between entries with identical \a m, and \a l
|
||||||
|
differing by 1.
|
||||||
|
\param mval array with \a nm entries containing the individual m values
|
||||||
|
\param mvstart array with \a nm entries containing the (hypothetical)
|
||||||
|
indices of the coefficients with the quantum numbers 0,\a mval[i]
|
||||||
|
\param flags a combination of sharp_almflags (pass 0 unless you know you need this)
|
||||||
|
\param alm_info will hold a pointer to the newly created data structure
|
||||||
|
*/
|
||||||
|
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
|
||||||
|
const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
|
||||||
|
/*! Returns the index of the coefficient with quantum numbers \a l,
|
||||||
|
\a mval[mi].
|
||||||
|
\note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
|
||||||
|
the index for the coefficient with the quantum numbers \a l, \a mi. */
|
||||||
|
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
|
||||||
|
/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
|
||||||
|
flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
|
||||||
|
otherwise it is the number of complex coefficients (with m>=0). */
|
||||||
|
ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
|
||||||
|
/*! Deallocates the a_lm info object. */
|
||||||
|
void sharp_destroy_alm_info (sharp_alm_info *info);
|
||||||
|
|
||||||
|
/*! \} */
|
||||||
|
|
||||||
|
/*! \defgroup geominfogroup Functions for dealing with geometry information */
|
||||||
|
/*! \{ */
|
||||||
|
|
||||||
|
/*! Creates a geometry information from a set of ring descriptions.
|
||||||
|
All arrays passed to this function must have \a nrings elements.
|
||||||
|
\param nrings the number of rings in the map
|
||||||
|
\param nph the number of pixels in each ring
|
||||||
|
\param ofs the index of the first pixel in each ring in the map array
|
||||||
|
\param stride the stride between consecutive pixels
|
||||||
|
\param phi0 the azimuth (in radians) of the first pixel in each ring
|
||||||
|
\param theta the colatitude (in radians) of each ring
|
||||||
|
\param wgt the pixel weight to be used for the ring in map2alm
|
||||||
|
and adjoint map2alm transforms.
|
||||||
|
Pass NULL to use 1.0 as weight for all rings.
|
||||||
|
\param geom_info will hold a pointer to the newly created data structure
|
||||||
|
*/
|
||||||
|
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
|
||||||
|
const int *stride, const double *phi0, const double *theta,
|
||||||
|
const double *wgt, sharp_geom_info **geom_info);
|
||||||
|
|
||||||
|
/*! Counts the number of grid points needed for (the local part of) a map described
|
||||||
|
by \a info.
|
||||||
|
*/
|
||||||
|
ptrdiff_t sharp_map_size(const sharp_geom_info *info);
|
||||||
|
|
||||||
|
/*! Deallocates the geometry information in \a info. */
|
||||||
|
void sharp_destroy_geom_info (sharp_geom_info *info);
|
||||||
|
|
||||||
|
/*! \} */
|
||||||
|
|
||||||
|
/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
|
||||||
|
/*! \{ */
|
||||||
|
|
||||||
|
/*! Enumeration of SHARP job types. */
|
||||||
|
typedef enum { SHARP_YtW=0, /*!< analysis */
|
||||||
|
SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */
|
||||||
|
SHARP_Y=1, /*!< synthesis */
|
||||||
|
SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */
|
||||||
|
SHARP_Yt=2, /*!< adjoint synthesis */
|
||||||
|
SHARP_WY=3, /*!< adjoint analysis */
|
||||||
|
SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */
|
||||||
|
} sharp_jobtype;
|
||||||
|
|
||||||
|
/*! Job flags */
|
||||||
|
typedef enum { SHARP_DP = 1<<4,
|
||||||
|
/*!< map and a_lm are in double precision */
|
||||||
|
SHARP_ADD = 1<<5,
|
||||||
|
/*!< results are added to the output arrays, instead of
|
||||||
|
overwriting them */
|
||||||
|
|
||||||
|
/* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
|
||||||
|
but its use here is deprecated in favor of having it in the sharp_alm_info */
|
||||||
|
|
||||||
|
SHARP_NO_FFT = 1<<7,
|
||||||
|
|
||||||
|
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
|
||||||
|
SHARP_NO_OPENMP = 1<<21, /* internal use only */
|
||||||
|
} sharp_jobflags;
|
||||||
|
|
||||||
|
/*! Performs a libsharp SHT job. The interface deliberately does not use
|
||||||
|
the C99 "complex" data type, in order to be callable from C89 and C++.
|
||||||
|
\param type the type of SHT
|
||||||
|
\param spin the spin of the quantities to be transformed
|
||||||
|
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
|
||||||
|
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
|
||||||
|
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
|
||||||
|
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
|
||||||
|
depends on whether the SHARP_DP flag is set.
|
||||||
|
\param map contains pointers to the maps. If \a spin==0,
|
||||||
|
map[0] points to the map of the first SHT, map[1] to that of the second
|
||||||
|
etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
|
||||||
|
point to the maps of the first SHT, map[2] and map[3] to those of the
|
||||||
|
second, etc. The exact data type of \a map depends on whether the SHARP_DP
|
||||||
|
flag is set.
|
||||||
|
\param geom_info A \c sharp_geom_info object compatible with the provided
|
||||||
|
\a map arrays.
|
||||||
|
\param alm_info A \c sharp_alm_info object compatible with the provided
|
||||||
|
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
|
||||||
|
exactly once.
|
||||||
|
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
|
||||||
|
\a alm is expected to have the type "complex double **" and \a map is
|
||||||
|
expected to have the type "double **"; otherwise, the expected
|
||||||
|
types are "complex float **" and "float **", respectively.
|
||||||
|
\param time If not NULL, the wall clock time required for this SHT
|
||||||
|
(in seconds) will be written here.
|
||||||
|
\param opcnt If not NULL, a conservative estimate of the total floating point
|
||||||
|
operation count for this SHT will be written here. */
|
||||||
|
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
||||||
|
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
|
||||||
|
int flags, double *time, unsigned long long *opcnt);
|
||||||
|
|
||||||
|
void sharp_set_chunksize_min(int new_chunksize_min);
|
||||||
|
void sharp_set_nchunks_max(int new_nchunks_max);
|
||||||
|
|
||||||
|
|
||||||
|
typedef enum { SHARP_ERROR_NO_MPI = 1,
|
||||||
|
/*!< libsharp not compiled with MPI support */
|
||||||
|
} sharp_errors;
|
||||||
|
|
||||||
|
/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
|
||||||
|
is compiled with USE_MPI. This is primarily useful for wrapper code etc.
|
||||||
|
|
||||||
|
Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
|
||||||
|
pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
|
||||||
|
is *(MPI_Comm*)pcomm.
|
||||||
|
|
||||||
|
Other parameters are the same as sharp_execute_mpi.
|
||||||
|
|
||||||
|
Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
|
||||||
|
(in which case nothing is done).
|
||||||
|
*/
|
||||||
|
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
|
||||||
|
void *alm, void *map, const sharp_geom_info *geom_info,
|
||||||
|
const sharp_alm_info *alm_info, int flags, double *time,
|
||||||
|
unsigned long long *opcnt);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \} */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -32,7 +32,7 @@
|
||||||
#ifndef PLANCK_SHARP_ALMHELPERS_H
|
#ifndef PLANCK_SHARP_ALMHELPERS_H
|
||||||
#define PLANCK_SHARP_ALMHELPERS_H
|
#define PLANCK_SHARP_ALMHELPERS_H
|
||||||
|
|
||||||
#include "sharp_lowlevel.h"
|
#include "sharp.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -25,131 +25,126 @@
|
||||||
/* \file sharp_complex_hacks.h
|
/* \file sharp_complex_hacks.h
|
||||||
* support for converting vector types and complex numbers
|
* support for converting vector types and complex numbers
|
||||||
*
|
*
|
||||||
* Copyright (C) 2012-2016 Max-Planck-Society
|
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||||
* Author: Martin Reinecke
|
* Author: Martin Reinecke
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef SHARP_COMPLEX_HACKS_H
|
#ifndef SHARP_COMPLEX_HACKS_H
|
||||||
#define SHARP_COMPLEX_HACKS_H
|
#define SHARP_COMPLEX_HACKS_H
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
#error This header file cannot be included from C++, only from C
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <complex.h>
|
|
||||||
#include "sharp_vecsupport.h"
|
#include "sharp_vecsupport.h"
|
||||||
|
|
||||||
#define UNSAFE_CODE
|
#define UNSAFE_CODE
|
||||||
|
|
||||||
#if (VLEN==1)
|
#if (VLEN==1)
|
||||||
|
|
||||||
static inline complex double vhsum_cmplx(Tv a, Tv b)
|
static inline _Complex double vhsum_cmplx(Tv a, Tv b)
|
||||||
{ return a+_Complex_I*b; }
|
{ return a+_Complex_I*b; }
|
||||||
|
|
||||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict c1, complex double * restrict c2)
|
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||||
{ *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
|
{ *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict cc)
|
_Complex double * restrict cc)
|
||||||
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
|
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (VLEN==2)
|
#if (VLEN==2)
|
||||||
|
|
||||||
static inline complex double vhsum_cmplx (Tv a, Tv b)
|
static inline _Complex double vhsum_cmplx (Tv a, Tv b)
|
||||||
{
|
{
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
Tv tmp = _mm_hadd_pd(a,b);
|
Tv tmp = _mm_hadd_pd(a,b);
|
||||||
#else
|
#else
|
||||||
Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
|
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||||
#endif
|
#endif
|
||||||
union {Tv v; complex double c; } u;
|
union {Tv v; _Complex double c; } u;
|
||||||
u.v=tmp; return u.c;
|
u.v=tmp; return u.c;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
|
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
|
||||||
Tv d, complex double * restrict c1, complex double * restrict c2)
|
Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
|
||||||
{
|
{
|
||||||
#ifdef UNSAFE_CODE
|
#ifdef UNSAFE_CODE
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
|
*((__m128d *)c1) += _mm_hadd_pd(a,b);
|
||||||
vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
|
*((__m128d *)c2) += _mm_hadd_pd(c,d);
|
||||||
#else
|
#else
|
||||||
vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
*((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
|
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||||
vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
|
*((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
|
||||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
|
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
union {Tv v; complex double c; } u1, u2;
|
union {Tv v; _Complex double c; } u1, u2;
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
|
||||||
#else
|
#else
|
||||||
u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
|
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
|
||||||
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
|
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
|
||||||
u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
|
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
|
||||||
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
|
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
|
||||||
#endif
|
#endif
|
||||||
*c1+=u1.c; *c2+=u2.c;
|
*c1+=u1.c; *c2+=u2.c;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict cc)
|
_Complex double * restrict cc)
|
||||||
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (VLEN==4)
|
#if (VLEN==4)
|
||||||
|
|
||||||
static inline complex double vhsum_cmplx (Tv a, Tv b)
|
static inline _Complex double vhsum_cmplx (Tv a, Tv b)
|
||||||
{
|
{
|
||||||
Tv tmp=_mm256_hadd_pd(a,b);
|
Tv tmp=_mm256_hadd_pd(a,b);
|
||||||
Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
|
Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
|
||||||
tmp=_mm256_add_pd(tmp,tmp2);
|
tmp=_mm256_add_pd(tmp,tmp2);
|
||||||
#ifdef UNSAFE_CODE
|
#ifdef UNSAFE_CODE
|
||||||
complex double ret;
|
_Complex double ret;
|
||||||
*((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
|
*((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
|
||||||
return ret;
|
return ret;
|
||||||
#else
|
#else
|
||||||
union {Tv v; complex double c[2]; } u;
|
union {Tv v; _Complex double c[2]; } u;
|
||||||
u.v=tmp; return u.c[0];
|
u.v=tmp; return u.c[0];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict c1, complex double * restrict c2)
|
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||||
{
|
{
|
||||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
||||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
||||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
||||||
tmp1=vadd(tmp3,tmp4);
|
tmp1=tmp3+tmp4;
|
||||||
#ifdef UNSAFE_CODE
|
#ifdef UNSAFE_CODE
|
||||||
*((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
|
*((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
|
||||||
*((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
|
*((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
|
||||||
#else
|
#else
|
||||||
union {Tv v; complex double c[2]; } u;
|
union {Tv v; _Complex double c[2]; } u;
|
||||||
u.v=tmp1;
|
u.v=tmp1;
|
||||||
*c1+=u.c[0]; *c2+=u.c[1];
|
*c1+=u.c[0]; *c2+=u.c[1];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict cc)
|
_Complex double * restrict cc)
|
||||||
{
|
{
|
||||||
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
|
||||||
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
|
||||||
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
|
||||||
tmp1=vadd(tmp3,tmp4);
|
tmp1=tmp3+tmp4;
|
||||||
#ifdef UNSAFE_CODE
|
#ifdef UNSAFE_CODE
|
||||||
_mm256_storeu_pd((double *)cc,
|
_mm256_storeu_pd((double *)cc,
|
||||||
_mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
|
_mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
|
||||||
#else
|
#else
|
||||||
union {Tv v; complex double c[2]; } u;
|
union {Tv v; _Complex double c[2]; } u;
|
||||||
u.v=tmp1;
|
u.v=tmp1;
|
||||||
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
cc[0]+=u.c[0]; cc[1]+=u.c[1];
|
||||||
#endif
|
#endif
|
||||||
|
@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
|
|
||||||
#if (VLEN==8)
|
#if (VLEN==8)
|
||||||
|
|
||||||
static inline complex double vhsum_cmplx(Tv a, Tv b)
|
static inline _Complex double vhsum_cmplx(Tv a, Tv b)
|
||||||
{ return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
|
{ return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
|
||||||
|
|
||||||
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict c1, complex double * restrict c2)
|
_Complex double * restrict c1, _Complex double * restrict c2)
|
||||||
{
|
{
|
||||||
*c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
*c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
|
||||||
*c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
*c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
|
||||||
complex double * restrict cc)
|
_Complex double * restrict cc)
|
||||||
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -114,8 +114,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (npow&1)
|
if (npow&1)
|
||||||
vmuleq(res,val);
|
res*=val;
|
||||||
vmuleq(val,val);
|
val*=val;
|
||||||
}
|
}
|
||||||
while(npow>>=1);
|
while(npow>>=1);
|
||||||
*resd=res;
|
*resd=res;
|
||||||
|
@ -129,12 +129,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
|
||||||
{
|
{
|
||||||
if (npow&1)
|
if (npow&1)
|
||||||
{
|
{
|
||||||
vmuleq(res,val);
|
res*=val;
|
||||||
vaddeq(scale,scaleint);
|
scale+=scaleint;
|
||||||
Tvnormalize(&res,&scale,sharp_fbighalf);
|
Tvnormalize(&res,&scale,sharp_fbighalf);
|
||||||
}
|
}
|
||||||
vmuleq(val,val);
|
val*=val;
|
||||||
vaddeq(scaleint,scaleint);
|
scaleint+=scaleint;
|
||||||
Tvnormalize(&val,&scaleint,sharp_fbighalf);
|
Tvnormalize(&val,&scaleint,sharp_fbighalf);
|
||||||
}
|
}
|
||||||
while(npow>>=1);
|
while(npow>>=1);
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
#define PLANCK_SHARP_CXX_H
|
#define PLANCK_SHARP_CXX_H
|
||||||
|
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include "sharp_lowlevel.h"
|
#include "sharp.h"
|
||||||
#include "sharp_geomhelpers.h"
|
#include "sharp_geomhelpers.h"
|
||||||
#include "sharp_almhelpers.h"
|
#include "sharp_almhelpers.h"
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@
|
||||||
#ifndef PLANCK_SHARP_GEOMHELPERS_H
|
#ifndef PLANCK_SHARP_GEOMHELPERS_H
|
||||||
#define PLANCK_SHARP_GEOMHELPERS_H
|
#define PLANCK_SHARP_GEOMHELPERS_H
|
||||||
|
|
||||||
#include "sharp_lowlevel.h"
|
#include "sharp.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
/*! \file sharp_internal.h
|
/*! \file sharp_internal.h
|
||||||
* Internally used functionality for the spherical transform library.
|
* Internally used functionality for the spherical transform library.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2006-2013 Max-Planck-Society
|
* Copyright (C) 2006-2018 Max-Planck-Society
|
||||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -36,6 +36,7 @@
|
||||||
#error This header file cannot be included from C++, only from C
|
#error This header file cannot be included from C++, only from C
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <complex.h>
|
||||||
#include "sharp.h"
|
#include "sharp.h"
|
||||||
|
|
||||||
#define SHARP_MAXTRANS 100
|
#define SHARP_MAXTRANS 100
|
||||||
|
|
|
@ -1,270 +0,0 @@
|
||||||
/*
|
|
||||||
* This file is part of libsharp.
|
|
||||||
*
|
|
||||||
* libsharp is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* libsharp is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with libsharp; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
|
|
||||||
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
|
|
||||||
* (DLR).
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*! \file sharp_lowlevel.h
|
|
||||||
* Low-level, portable interface for the spherical transform library.
|
|
||||||
*
|
|
||||||
* Copyright (C) 2012-2013 Max-Planck-Society
|
|
||||||
* \author Martin Reinecke \author Dag Sverre Seljebotn
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef PLANCK_SHARP_LOWLEVEL_H
|
|
||||||
#define PLANCK_SHARP_LOWLEVEL_H
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*! \internal
|
|
||||||
Helper type containing information about a single ring. */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
double theta, phi0, weight, cth, sth;
|
|
||||||
ptrdiff_t ofs;
|
|
||||||
int nph, stride;
|
|
||||||
} sharp_ringinfo;
|
|
||||||
|
|
||||||
/*! \internal
|
|
||||||
Helper type containing information about a pair of rings with colatitudes
|
|
||||||
symmetric around the equator. */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
sharp_ringinfo r1,r2;
|
|
||||||
} sharp_ringpair;
|
|
||||||
|
|
||||||
/*! \internal
|
|
||||||
Type holding all required information about a map geometry. */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
sharp_ringpair *pair;
|
|
||||||
int npairs, nphmax;
|
|
||||||
} sharp_geom_info;
|
|
||||||
|
|
||||||
/*! \defgroup almgroup Helpers for dealing with a_lm */
|
|
||||||
/*! \{ */
|
|
||||||
|
|
||||||
/*! \internal
|
|
||||||
Helper type for index calculation in a_lm arrays. */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
/*! Maximum \a l index of the array */
|
|
||||||
int lmax;
|
|
||||||
/*! Number of different \a m values in this object */
|
|
||||||
int nm;
|
|
||||||
/*! Array with \a nm entries containing the individual m values */
|
|
||||||
int *mval;
|
|
||||||
/*! Combination of flags from sharp_almflags */
|
|
||||||
int flags;
|
|
||||||
/*! Array with \a nm entries containing the (hypothetical) indices of
|
|
||||||
the coefficients with quantum numbers 0,\a mval[i] */
|
|
||||||
ptrdiff_t *mvstart;
|
|
||||||
/*! Stride between a_lm and a_(l+1),m */
|
|
||||||
ptrdiff_t stride;
|
|
||||||
} sharp_alm_info;
|
|
||||||
|
|
||||||
/*! alm_info flags */
|
|
||||||
typedef enum { SHARP_PACKED = 1,
|
|
||||||
/*!< m=0-coefficients are packed so that the (zero) imaginary part is
|
|
||||||
not present. mvstart is in units of *real* float/double for all
|
|
||||||
m; stride is in units of reals for m=0 and complex for m!=0 */
|
|
||||||
SHARP_REAL_HARMONICS = 1<<6
|
|
||||||
/*!< Use the real spherical harmonic convention. For
|
|
||||||
m==0, the alm are treated exactly the same as in
|
|
||||||
the complex case. For m!=0, alm[i] represent a
|
|
||||||
pair (+abs(m), -abs(m)) instead of (real, imag),
|
|
||||||
and the coefficients are scaled by a factor of
|
|
||||||
sqrt(2) relative to the complex case. In other
|
|
||||||
words, (sqrt(.5) * alm[i]) recovers the
|
|
||||||
corresponding complex coefficient (when accessed
|
|
||||||
as complex).
|
|
||||||
*/
|
|
||||||
} sharp_almflags;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*! Creates an a_lm data structure from the following parameters:
|
|
||||||
\param lmax maximum \a l quantum number (>=0)
|
|
||||||
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
|
|
||||||
\param stride the stride between entries with identical \a m, and \a l
|
|
||||||
differing by 1.
|
|
||||||
\param mstart the index of the (hypothetical) coefficient with the
|
|
||||||
quantum numbers 0,\a m. Must have \a mmax+1 entries.
|
|
||||||
\param alm_info will hold a pointer to the newly created data structure
|
|
||||||
*/
|
|
||||||
void sharp_make_alm_info (int lmax, int mmax, int stride,
|
|
||||||
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
|
|
||||||
/*! Creates an a_lm data structure which from the following parameters:
|
|
||||||
\param lmax maximum \a l quantum number (\a >=0)
|
|
||||||
\param nm number of different \a m (\a 0<=nm<=lmax+1)
|
|
||||||
\param stride the stride between entries with identical \a m, and \a l
|
|
||||||
differing by 1.
|
|
||||||
\param mval array with \a nm entries containing the individual m values
|
|
||||||
\param mvstart array with \a nm entries containing the (hypothetical)
|
|
||||||
indices of the coefficients with the quantum numbers 0,\a mval[i]
|
|
||||||
\param flags a combination of sharp_almflags (pass 0 unless you know you need this)
|
|
||||||
\param alm_info will hold a pointer to the newly created data structure
|
|
||||||
*/
|
|
||||||
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
|
|
||||||
const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
|
|
||||||
/*! Returns the index of the coefficient with quantum numbers \a l,
|
|
||||||
\a mval[mi].
|
|
||||||
\note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
|
|
||||||
the index for the coefficient with the quantum numbers \a l, \a mi. */
|
|
||||||
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
|
|
||||||
/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
|
|
||||||
flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
|
|
||||||
otherwise it is the number of complex coefficients (with m>=0). */
|
|
||||||
ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
|
|
||||||
/*! Deallocates the a_lm info object. */
|
|
||||||
void sharp_destroy_alm_info (sharp_alm_info *info);
|
|
||||||
|
|
||||||
/*! \} */
|
|
||||||
|
|
||||||
/*! \defgroup geominfogroup Functions for dealing with geometry information */
|
|
||||||
/*! \{ */
|
|
||||||
|
|
||||||
/*! Creates a geometry information from a set of ring descriptions.
|
|
||||||
All arrays passed to this function must have \a nrings elements.
|
|
||||||
\param nrings the number of rings in the map
|
|
||||||
\param nph the number of pixels in each ring
|
|
||||||
\param ofs the index of the first pixel in each ring in the map array
|
|
||||||
\param stride the stride between consecutive pixels
|
|
||||||
\param phi0 the azimuth (in radians) of the first pixel in each ring
|
|
||||||
\param theta the colatitude (in radians) of each ring
|
|
||||||
\param wgt the pixel weight to be used for the ring in map2alm
|
|
||||||
and adjoint map2alm transforms.
|
|
||||||
Pass NULL to use 1.0 as weight for all rings.
|
|
||||||
\param geom_info will hold a pointer to the newly created data structure
|
|
||||||
*/
|
|
||||||
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
|
|
||||||
const int *stride, const double *phi0, const double *theta,
|
|
||||||
const double *wgt, sharp_geom_info **geom_info);
|
|
||||||
|
|
||||||
/*! Counts the number of grid points needed for (the local part of) a map described
|
|
||||||
by \a info.
|
|
||||||
*/
|
|
||||||
ptrdiff_t sharp_map_size(const sharp_geom_info *info);
|
|
||||||
|
|
||||||
/*! Deallocates the geometry information in \a info. */
|
|
||||||
void sharp_destroy_geom_info (sharp_geom_info *info);
|
|
||||||
|
|
||||||
/*! \} */
|
|
||||||
|
|
||||||
/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
|
|
||||||
/*! \{ */
|
|
||||||
|
|
||||||
/*! Enumeration of SHARP job types. */
|
|
||||||
typedef enum { SHARP_YtW=0, /*!< analysis */
|
|
||||||
SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */
|
|
||||||
SHARP_Y=1, /*!< synthesis */
|
|
||||||
SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */
|
|
||||||
SHARP_Yt=2, /*!< adjoint synthesis */
|
|
||||||
SHARP_WY=3, /*!< adjoint analysis */
|
|
||||||
SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */
|
|
||||||
} sharp_jobtype;
|
|
||||||
|
|
||||||
/*! Job flags */
|
|
||||||
typedef enum { SHARP_DP = 1<<4,
|
|
||||||
/*!< map and a_lm are in double precision */
|
|
||||||
SHARP_ADD = 1<<5,
|
|
||||||
/*!< results are added to the output arrays, instead of
|
|
||||||
overwriting them */
|
|
||||||
|
|
||||||
/* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
|
|
||||||
but its use here is deprecated in favor of having it in the sharp_alm_info */
|
|
||||||
|
|
||||||
SHARP_NO_FFT = 1<<7,
|
|
||||||
|
|
||||||
SHARP_USE_WEIGHTS = 1<<20, /* internal use only */
|
|
||||||
SHARP_NO_OPENMP = 1<<21, /* internal use only */
|
|
||||||
} sharp_jobflags;
|
|
||||||
|
|
||||||
/*! Performs a libsharp SHT job. The interface deliberately does not use
|
|
||||||
the C99 "complex" data type, in order to be callable from C89 and C++.
|
|
||||||
\param type the type of SHT
|
|
||||||
\param spin the spin of the quantities to be transformed
|
|
||||||
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
|
|
||||||
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
|
|
||||||
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
|
|
||||||
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
|
|
||||||
depends on whether the SHARP_DP flag is set.
|
|
||||||
\param map contains pointers to the maps. If \a spin==0,
|
|
||||||
map[0] points to the map of the first SHT, map[1] to that of the second
|
|
||||||
etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
|
|
||||||
point to the maps of the first SHT, map[2] and map[3] to those of the
|
|
||||||
second, etc. The exact data type of \a map depends on whether the SHARP_DP
|
|
||||||
flag is set.
|
|
||||||
\param geom_info A \c sharp_geom_info object compatible with the provided
|
|
||||||
\a map arrays.
|
|
||||||
\param alm_info A \c sharp_alm_info object compatible with the provided
|
|
||||||
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
|
|
||||||
exactly once.
|
|
||||||
\param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
|
|
||||||
\a alm is expected to have the type "complex double **" and \a map is
|
|
||||||
expected to have the type "double **"; otherwise, the expected
|
|
||||||
types are "complex float **" and "float **", respectively.
|
|
||||||
\param time If not NULL, the wall clock time required for this SHT
|
|
||||||
(in seconds) will be written here.
|
|
||||||
\param opcnt If not NULL, a conservative estimate of the total floating point
|
|
||||||
operation count for this SHT will be written here. */
|
|
||||||
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
|
|
||||||
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
|
|
||||||
int flags, double *time, unsigned long long *opcnt);
|
|
||||||
|
|
||||||
void sharp_set_chunksize_min(int new_chunksize_min);
|
|
||||||
void sharp_set_nchunks_max(int new_nchunks_max);
|
|
||||||
|
|
||||||
|
|
||||||
typedef enum { SHARP_ERROR_NO_MPI = 1,
|
|
||||||
/*!< libsharp not compiled with MPI support */
|
|
||||||
} sharp_errors;
|
|
||||||
|
|
||||||
/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
|
|
||||||
is compiled with USE_MPI. This is primarily useful for wrapper code etc.
|
|
||||||
|
|
||||||
Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
|
|
||||||
pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
|
|
||||||
is *(MPI_Comm*)pcomm.
|
|
||||||
|
|
||||||
Other parameters are the same as sharp_execute_mpi.
|
|
||||||
|
|
||||||
Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
|
|
||||||
(in which case nothing is done).
|
|
||||||
*/
|
|
||||||
int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
|
|
||||||
void *alm, void *map, const sharp_geom_info *geom_info,
|
|
||||||
const sharp_alm_info *alm_info, int flags, double *time,
|
|
||||||
unsigned long long *opcnt);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*! \} */
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -33,7 +33,7 @@
|
||||||
#define PLANCK_SHARP_MPI_H
|
#define PLANCK_SHARP_MPI_H
|
||||||
|
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#include "sharp_lowlevel.h"
|
#include "sharp.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -40,32 +40,13 @@ typedef double Ts;
|
||||||
#if (VLEN==1)
|
#if (VLEN==1)
|
||||||
|
|
||||||
typedef double Tv;
|
typedef double Tv;
|
||||||
typedef float Tv_s;
|
|
||||||
typedef int Tm;
|
typedef int Tm;
|
||||||
|
|
||||||
#define vadd(a,b) ((a)+(b))
|
|
||||||
#define vadd_s(a,b) ((a)+(b))
|
|
||||||
#define vaddeq(a,b) ((a)+=(b))
|
|
||||||
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
|
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
|
||||||
#define vsub(a,b) ((a)-(b))
|
|
||||||
#define vsub_s(a,b) ((a)-(b))
|
|
||||||
#define vsubeq(a,b) ((a)-=(b))
|
|
||||||
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
|
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
|
||||||
#define vmul(a,b) ((a)*(b))
|
|
||||||
#define vmul_s(a,b) ((a)*(b))
|
|
||||||
#define vmuleq(a,b) ((a)*=(b))
|
|
||||||
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
|
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
|
||||||
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
|
|
||||||
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
|
|
||||||
#define vfmseq(a,b,c) ((a)-=(b)*(c))
|
|
||||||
#define vabmc(a,b,c) ((a)*(b)-(c))
|
|
||||||
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
|
|
||||||
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
|
|
||||||
#define vneg(a) (-(a))
|
#define vneg(a) (-(a))
|
||||||
#define vload(a) (a)
|
#define vload(a) (a)
|
||||||
#define vload_s(a) (a)
|
|
||||||
#define vloadu(p) (*(p))
|
|
||||||
#define vloadu_s(p) (*(p))
|
|
||||||
#define vabs(a) fabs(a)
|
#define vabs(a) fabs(a)
|
||||||
#define vsqrt(a) sqrt(a)
|
#define vsqrt(a) sqrt(a)
|
||||||
#define vlt(a,b) ((a)<(b))
|
#define vlt(a,b) ((a)<(b))
|
||||||
|
@ -74,8 +55,6 @@ typedef int Tm;
|
||||||
#define vne(a,b) ((a)!=(b))
|
#define vne(a,b) ((a)!=(b))
|
||||||
#define vand_mask(a,b) ((a)&&(b))
|
#define vand_mask(a,b) ((a)&&(b))
|
||||||
#define vor_mask(a,b) ((a)||(b))
|
#define vor_mask(a,b) ((a)||(b))
|
||||||
#define vstoreu(p, a) (*(p)=a)
|
|
||||||
#define vstoreu_s(p, a) (*(p)=a)
|
|
||||||
|
|
||||||
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
|
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
|
||||||
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
||||||
|
@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef __m128d Tv;
|
typedef __m128d Tv;
|
||||||
typedef __m128 Tv_s;
|
|
||||||
typedef __m128d Tm;
|
typedef __m128d Tm;
|
||||||
|
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
||||||
#define vzero _mm_setzero_pd()
|
#define vzero _mm_setzero_pd()
|
||||||
#define vone _mm_set1_pd(1.)
|
#define vone _mm_set1_pd(1.)
|
||||||
|
|
||||||
#define vadd(a,b) _mm_add_pd(a,b)
|
|
||||||
#define vadd_s(a,b) _mm_add_ps(a,b)
|
|
||||||
#define vaddeq(a,b) a=_mm_add_pd(a,b)
|
|
||||||
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
|
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
|
||||||
#define vsub(a,b) _mm_sub_pd(a,b)
|
|
||||||
#define vsub_s(a,b) _mm_sub_ps(a,b)
|
|
||||||
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
|
|
||||||
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
|
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
|
||||||
#define vmul(a,b) _mm_mul_pd(a,b)
|
|
||||||
#define vmul_s(a,b) _mm_mul_ps(a,b)
|
|
||||||
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
|
|
||||||
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
|
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
|
||||||
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
|
|
||||||
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
|
|
||||||
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
|
|
||||||
#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
|
|
||||||
#define vfmaaeq(a,b,c,d,e) \
|
|
||||||
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
|
|
||||||
#define vfmaseq(a,b,c,d,e) \
|
|
||||||
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
|
|
||||||
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
|
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
|
||||||
#define vload(a) _mm_set1_pd(a)
|
#define vload(a) _mm_set1_pd(a)
|
||||||
#define vload_s(a) _mm_set1_ps(a)
|
|
||||||
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
|
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
|
||||||
#define vsqrt(a) _mm_sqrt_pd(a)
|
#define vsqrt(a) _mm_sqrt_pd(a)
|
||||||
#define vlt(a,b) _mm_cmplt_pd(a,b)
|
#define vlt(a,b) _mm_cmplt_pd(a,b)
|
||||||
|
@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
||||||
#define vmax(a,b) _mm_max_pd(a,b);
|
#define vmax(a,b) _mm_max_pd(a,b);
|
||||||
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
|
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
|
||||||
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
#define vallTrue(a) (_mm_movemask_pd(a)==3)
|
||||||
#define vloadu(p) _mm_loadu_pd(p)
|
|
||||||
#define vloadu_s(p) _mm_loadu_ps(p)
|
|
||||||
#define vstoreu(p, v) _mm_storeu_pd(p, v)
|
|
||||||
#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef __m256d Tv;
|
typedef __m256d Tv;
|
||||||
typedef __m256 Tv_s;
|
|
||||||
typedef __m256d Tm;
|
typedef __m256d Tm;
|
||||||
|
|
||||||
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
|
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
|
||||||
#define vzero _mm256_setzero_pd()
|
#define vzero _mm256_setzero_pd()
|
||||||
#define vone _mm256_set1_pd(1.)
|
#define vone _mm256_set1_pd(1.)
|
||||||
|
|
||||||
#define vadd(a,b) _mm256_add_pd(a,b)
|
|
||||||
#define vadd_s(a,b) _mm256_add_ps(a,b)
|
|
||||||
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
|
|
||||||
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
|
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
|
||||||
#define vsub(a,b) _mm256_sub_pd(a,b)
|
|
||||||
#define vsub_s(a,b) _mm256_sub_ps(a,b)
|
|
||||||
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
|
|
||||||
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
|
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
|
||||||
#define vmul(a,b) _mm256_mul_pd(a,b)
|
|
||||||
#define vmul_s(a,b) _mm256_mul_ps(a,b)
|
|
||||||
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
|
|
||||||
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
|
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
|
||||||
#if (USE_FMA4)
|
|
||||||
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
|
|
||||||
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
|
|
||||||
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
|
|
||||||
#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
|
|
||||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
|
|
||||||
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
|
|
||||||
#else
|
|
||||||
#if (USE_FMA)
|
|
||||||
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
|
|
||||||
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
|
|
||||||
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
|
|
||||||
#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
|
|
||||||
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
|
||||||
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
|
|
||||||
#else
|
|
||||||
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
|
|
||||||
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
|
|
||||||
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
|
|
||||||
#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
|
|
||||||
#define vfmaaeq(a,b,c,d,e) \
|
|
||||||
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
|
|
||||||
#define vfmaseq(a,b,c,d,e) \
|
|
||||||
a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
|
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
|
||||||
#define vload(a) _mm256_set1_pd(a)
|
#define vload(a) _mm256_set1_pd(a)
|
||||||
#define vload_s(a) _mm256_set1_ps(a)
|
|
||||||
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
|
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
|
||||||
#define vsqrt(a) _mm256_sqrt_pd(a)
|
#define vsqrt(a) _mm256_sqrt_pd(a)
|
||||||
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
|
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
|
||||||
|
@ -222,11 +141,6 @@ typedef __m256d Tm;
|
||||||
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
|
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
|
||||||
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
|
||||||
|
|
||||||
#define vloadu(p) _mm256_loadu_pd(p)
|
|
||||||
#define vloadu_s(p) _mm256_loadu_ps(p)
|
|
||||||
#define vstoreu(p, v) _mm256_storeu_pd(p, v)
|
|
||||||
#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (VLEN==8)
|
#if (VLEN==8)
|
||||||
|
@ -236,20 +150,8 @@ typedef __m256d Tm;
|
||||||
typedef __m512d Tv;
|
typedef __m512d Tv;
|
||||||
typedef __mmask8 Tm;
|
typedef __mmask8 Tm;
|
||||||
|
|
||||||
#define vadd(a,b) _mm512_add_pd(a,b)
|
|
||||||
#define vaddeq(a,b) a=_mm512_add_pd(a,b)
|
|
||||||
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
|
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
|
||||||
#define vsub(a,b) _mm512_sub_pd(a,b)
|
|
||||||
#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
|
|
||||||
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
|
|
||||||
#define vmul(a,b) _mm512_mul_pd(a,b)
|
|
||||||
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
|
|
||||||
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
|
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
|
||||||
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
|
|
||||||
//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
|
|
||||||
//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
|
|
||||||
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
|
||||||
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
|
|
||||||
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
|
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
|
||||||
#define vload(a) _mm512_set1_pd(a)
|
#define vload(a) _mm512_set1_pd(a)
|
||||||
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
|
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
/*! \file sharp_vecutil.h
|
/*! \file sharp_vecutil.h
|
||||||
* Functionality related to vector instruction support
|
* Functionality related to vector instruction support
|
||||||
*
|
*
|
||||||
* Copyright (C) 2012,2013 Max-Planck-Society
|
* Copyright (C) 2012-2018 Max-Planck-Society
|
||||||
* \author Martin Reinecke
|
* \author Martin Reinecke
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue