diff --git a/Makefile.am b/Makefile.am index c738f29..6370d95 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,17 +16,14 @@ src_sharp = \ libsharp/sharp_ylmgen_c.c \ libsharp/sharp_announce.h \ libsharp/sharp_complex_hacks.h \ - libsharp/sharp_core.h \ libsharp/sharp_internal.h \ libsharp/sharp_legendre_roots.h \ - libsharp/sharp_lowlevel.h \ libsharp/sharp_vecsupport.h \ libsharp/sharp_vecutil.h \ libsharp/sharp_ylmgen_c.h include_HEADERS = \ libsharp/sharp.h \ - libsharp/sharp_lowlevel.h \ libsharp/sharp_geomhelpers.h \ libsharp/sharp_almhelpers.h \ libsharp/sharp_cxx.h diff --git a/libsharp/sharp.c b/libsharp/sharp.c index f312fc3..bbb3872 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -35,7 +35,6 @@ #include "sharp_ylmgen_c.h" #include "sharp_internal.h" #include "c_utils.h" -#include "sharp_core.h" #include "walltime_c.h" #include "sharp_almhelpers.h" #include "sharp_geomhelpers.h" @@ -588,7 +587,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) } else memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0, - job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx)); + job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx)); #undef COPY_LOOP } @@ -960,9 +959,6 @@ void sharp_set_chunksize_min(int new_chunksize_min) void sharp_set_nchunks_max(int new_nchunks_max) { nchunks_max=new_nchunks_max; } -int sharp_get_nv_max (void) -{ return 6; } - #ifdef USE_MPI #include "sharp_mpi.c" diff --git a/libsharp/sharp.h b/libsharp/sharp.h index 9c5dd57..35a0cb5 100644 --- a/libsharp/sharp.h +++ b/libsharp/sharp.h @@ -23,21 +23,248 @@ */ /*! \file sharp.h - * Interface for the spherical transform library. + * Portable interface for the spherical transform library. * - * Copyright (C) 2006-2012 Max-Planck-Society - * \author Martin Reinecke + * Copyright (C) 2012-2018 Max-Planck-Society + * \author Martin Reinecke \author Dag Sverre Seljebotn */ -#ifndef PLANCK_SHARP_H -#define PLANCK_SHARP_H +#ifndef PLANCK_SHARP_LOWLEVEL_H +#define PLANCK_SHARP_LOWLEVEL_H + +#include #ifdef __cplusplus -#error This header file cannot be included from C++, only from C +extern "C" { #endif -#include +/*! \internal + Helper type containing information about a single ring. */ +typedef struct + { + double theta, phi0, weight, cth, sth; + ptrdiff_t ofs; + int nph, stride; + } sharp_ringinfo; -#include "sharp_lowlevel.h" +/*! \internal + Helper type containing information about a pair of rings with colatitudes + symmetric around the equator. */ +typedef struct + { + sharp_ringinfo r1,r2; + } sharp_ringpair; + +/*! \internal + Type holding all required information about a map geometry. */ +typedef struct + { + sharp_ringpair *pair; + int npairs, nphmax; + } sharp_geom_info; + +/*! \defgroup almgroup Helpers for dealing with a_lm */ +/*! \{ */ + +/*! \internal + Helper type for index calculation in a_lm arrays. */ +typedef struct + { + /*! Maximum \a l index of the array */ + int lmax; + /*! Number of different \a m values in this object */ + int nm; + /*! Array with \a nm entries containing the individual m values */ + int *mval; + /*! Combination of flags from sharp_almflags */ + int flags; + /*! Array with \a nm entries containing the (hypothetical) indices of + the coefficients with quantum numbers 0,\a mval[i] */ + ptrdiff_t *mvstart; + /*! Stride between a_lm and a_(l+1),m */ + ptrdiff_t stride; + } sharp_alm_info; + +/*! alm_info flags */ +typedef enum { SHARP_PACKED = 1, + /*!< m=0-coefficients are packed so that the (zero) imaginary part is + not present. mvstart is in units of *real* float/double for all + m; stride is in units of reals for m=0 and complex for m!=0 */ + SHARP_REAL_HARMONICS = 1<<6 + /*!< Use the real spherical harmonic convention. For + m==0, the alm are treated exactly the same as in + the complex case. For m!=0, alm[i] represent a + pair (+abs(m), -abs(m)) instead of (real, imag), + and the coefficients are scaled by a factor of + sqrt(2) relative to the complex case. In other + words, (sqrt(.5) * alm[i]) recovers the + corresponding complex coefficient (when accessed + as complex). + */ + } sharp_almflags; + + + +/*! Creates an a_lm data structure from the following parameters: + \param lmax maximum \a l quantum number (>=0) + \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax) + \param stride the stride between entries with identical \a m, and \a l + differing by 1. + \param mstart the index of the (hypothetical) coefficient with the + quantum numbers 0,\a m. Must have \a mmax+1 entries. + \param alm_info will hold a pointer to the newly created data structure + */ +void sharp_make_alm_info (int lmax, int mmax, int stride, + const ptrdiff_t *mstart, sharp_alm_info **alm_info); +/*! Creates an a_lm data structure which from the following parameters: + \param lmax maximum \a l quantum number (\a >=0) + \param nm number of different \a m (\a 0<=nm<=lmax+1) + \param stride the stride between entries with identical \a m, and \a l + differing by 1. + \param mval array with \a nm entries containing the individual m values + \param mvstart array with \a nm entries containing the (hypothetical) + indices of the coefficients with the quantum numbers 0,\a mval[i] + \param flags a combination of sharp_almflags (pass 0 unless you know you need this) + \param alm_info will hold a pointer to the newly created data structure + */ +void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, + const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info); +/*! Returns the index of the coefficient with quantum numbers \a l, + \a mval[mi]. + \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is + the index for the coefficient with the quantum numbers \a l, \a mi. */ +ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi); +/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED + flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0), + otherwise it is the number of complex coefficients (with m>=0). */ +ptrdiff_t sharp_alm_count(const sharp_alm_info *self); +/*! Deallocates the a_lm info object. */ +void sharp_destroy_alm_info (sharp_alm_info *info); + +/*! \} */ + +/*! \defgroup geominfogroup Functions for dealing with geometry information */ +/*! \{ */ + +/*! Creates a geometry information from a set of ring descriptions. + All arrays passed to this function must have \a nrings elements. + \param nrings the number of rings in the map + \param nph the number of pixels in each ring + \param ofs the index of the first pixel in each ring in the map array + \param stride the stride between consecutive pixels + \param phi0 the azimuth (in radians) of the first pixel in each ring + \param theta the colatitude (in radians) of each ring + \param wgt the pixel weight to be used for the ring in map2alm + and adjoint map2alm transforms. + Pass NULL to use 1.0 as weight for all rings. + \param geom_info will hold a pointer to the newly created data structure + */ +void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, + const int *stride, const double *phi0, const double *theta, + const double *wgt, sharp_geom_info **geom_info); + +/*! Counts the number of grid points needed for (the local part of) a map described + by \a info. + */ +ptrdiff_t sharp_map_size(const sharp_geom_info *info); + +/*! Deallocates the geometry information in \a info. */ +void sharp_destroy_geom_info (sharp_geom_info *info); + +/*! \} */ + +/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */ +/*! \{ */ + +/*! Enumeration of SHARP job types. */ +typedef enum { SHARP_YtW=0, /*!< analysis */ + SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */ + SHARP_Y=1, /*!< synthesis */ + SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */ + SHARP_Yt=2, /*!< adjoint synthesis */ + SHARP_WY=3, /*!< adjoint analysis */ + SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */ + } sharp_jobtype; + +/*! Job flags */ +typedef enum { SHARP_DP = 1<<4, + /*!< map and a_lm are in double precision */ + SHARP_ADD = 1<<5, + /*!< results are added to the output arrays, instead of + overwriting them */ + + /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags, + but its use here is deprecated in favor of having it in the sharp_alm_info */ + + SHARP_NO_FFT = 1<<7, + + SHARP_USE_WEIGHTS = 1<<20, /* internal use only */ + SHARP_NO_OPENMP = 1<<21, /* internal use only */ + } sharp_jobflags; + +/*! Performs a libsharp SHT job. The interface deliberately does not use + the C99 "complex" data type, in order to be callable from C89 and C++. + \param type the type of SHT + \param spin the spin of the quantities to be transformed + \param alm contains pointers to the a_lm coefficients. If \a spin==0, + alm[0] points to the a_lm of the first SHT, alm[1] to those of the second + etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT, + alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm + depends on whether the SHARP_DP flag is set. + \param map contains pointers to the maps. If \a spin==0, + map[0] points to the map of the first SHT, map[1] to that of the second + etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1] + point to the maps of the first SHT, map[2] and map[3] to those of the + second, etc. The exact data type of \a map depends on whether the SHARP_DP + flag is set. + \param geom_info A \c sharp_geom_info object compatible with the provided + \a map arrays. + \param alm_info A \c sharp_alm_info object compatible with the provided + \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present + exactly once. + \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then + \a alm is expected to have the type "complex double **" and \a map is + expected to have the type "double **"; otherwise, the expected + types are "complex float **" and "float **", respectively. + \param time If not NULL, the wall clock time required for this SHT + (in seconds) will be written here. + \param opcnt If not NULL, a conservative estimate of the total floating point + operation count for this SHT will be written here. */ +void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, + const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, + int flags, double *time, unsigned long long *opcnt); + +void sharp_set_chunksize_min(int new_chunksize_min); +void sharp_set_nchunks_max(int new_nchunks_max); + + +typedef enum { SHARP_ERROR_NO_MPI = 1, + /*!< libsharp not compiled with MPI support */ + } sharp_errors; + +/*! Works like sharp_execute_mpi, but is always present whether or not libsharp + is compiled with USE_MPI. This is primarily useful for wrapper code etc. + + Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid + pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi + is *(MPI_Comm*)pcomm. + + Other parameters are the same as sharp_execute_mpi. + + Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available + (in which case nothing is done). + */ +int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin, + void *alm, void *map, const sharp_geom_info *geom_info, + const sharp_alm_info *alm_info, int flags, double *time, + unsigned long long *opcnt); + + + +/*! \} */ + +#ifdef __cplusplus +} +#endif #endif diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h index 67016d7..c17028a 100644 --- a/libsharp/sharp_almhelpers.h +++ b/libsharp/sharp_almhelpers.h @@ -32,7 +32,7 @@ #ifndef PLANCK_SHARP_ALMHELPERS_H #define PLANCK_SHARP_ALMHELPERS_H -#include "sharp_lowlevel.h" +#include "sharp.h" #ifdef __cplusplus extern "C" { diff --git a/libsharp/sharp_announce.c b/libsharp/sharp_announce.c index 7027167..a028258 100644 --- a/libsharp/sharp_announce.c +++ b/libsharp/sharp_announce.c @@ -40,7 +40,7 @@ #endif #include "sharp_announce.h" -#include "sharp_core.h" +#include "sharp_internal.h" static void OpenMP_status(void) { diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h index 86d1153..6ec27bb 100644 --- a/libsharp/sharp_complex_hacks.h +++ b/libsharp/sharp_complex_hacks.h @@ -25,131 +25,126 @@ /* \file sharp_complex_hacks.h * support for converting vector types and complex numbers * - * Copyright (C) 2012-2016 Max-Planck-Society + * Copyright (C) 2012-2018 Max-Planck-Society * Author: Martin Reinecke */ #ifndef SHARP_COMPLEX_HACKS_H #define SHARP_COMPLEX_HACKS_H -#ifdef __cplusplus -#error This header file cannot be included from C++, only from C -#endif - #include -#include #include "sharp_vecsupport.h" #define UNSAFE_CODE #if (VLEN==1) -static inline complex double vhsum_cmplx(Tv a, Tv b) +static inline _Complex double vhsum_cmplx(Tv a, Tv b) { return a+_Complex_I*b; } static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, - complex double * restrict c1, complex double * restrict c2) + _Complex double * restrict c1, _Complex double * restrict c2) { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; } static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex double * restrict cc) + _Complex double * restrict cc) { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } #endif #if (VLEN==2) -static inline complex double vhsum_cmplx (Tv a, Tv b) +static inline _Complex double vhsum_cmplx (Tv a, Tv b) { #if defined(__SSE3__) Tv tmp = _mm_hadd_pd(a,b); #else - Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), - _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))); + Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); #endif - union {Tv v; complex double c; } u; + union {Tv v; _Complex double c; } u; u.v=tmp; return u.c; } static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, - Tv d, complex double * restrict c1, complex double * restrict c2) + Tv d, _Complex double * restrict c1, _Complex double * restrict c2) { #ifdef UNSAFE_CODE #if defined(__SSE3__) - vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b)); - vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d)); + *((__m128d *)c1) += _mm_hadd_pd(a,b); + *((__m128d *)c2) += _mm_hadd_pd(c,d); #else - vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), - _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)))); - vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)), - _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)))); + *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); + *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); #endif #else - union {Tv v; complex double c; } u1, u2; + union {Tv v; _Complex double c; } u1, u2; #if defined(__SSE3__) u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); #else - u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), - _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))); - u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)), - _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))); + u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); + u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); #endif *c1+=u1.c; *c2+=u2.c; #endif } static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex double * restrict cc) + _Complex double * restrict cc) { vhsum_cmplx2(a,b,c,d,cc,cc+1); } #endif #if (VLEN==4) -static inline complex double vhsum_cmplx (Tv a, Tv b) +static inline _Complex double vhsum_cmplx (Tv a, Tv b) { Tv tmp=_mm256_hadd_pd(a,b); Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1); tmp=_mm256_add_pd(tmp,tmp2); #ifdef UNSAFE_CODE - complex double ret; + _Complex double ret; *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0); return ret; #else - union {Tv v; complex double c[2]; } u; + union {Tv v; _Complex double c[2]; } u; u.v=tmp; return u.c[0]; #endif } static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, - complex double * restrict c1, complex double * restrict c2) + _Complex double * restrict c1, _Complex double * restrict c2) { Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); - tmp1=vadd(tmp3,tmp4); + tmp1=tmp3+tmp4; #ifdef UNSAFE_CODE *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0)); *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1)); #else - union {Tv v; complex double c[2]; } u; + union {Tv v; _Complex double c[2]; } u; u.v=tmp1; *c1+=u.c[0]; *c2+=u.c[1]; #endif } static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex double * restrict cc) + _Complex double * restrict cc) { Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); - tmp1=vadd(tmp3,tmp4); + tmp1=tmp3+tmp4; #ifdef UNSAFE_CODE _mm256_storeu_pd((double *)cc, _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1)); #else - union {Tv v; complex double c[2]; } u; + union {Tv v; _Complex double c[2]; } u; u.v=tmp1; cc[0]+=u.c[0]; cc[1]+=u.c[1]; #endif @@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, #if (VLEN==8) -static inline complex double vhsum_cmplx(Tv a, Tv b) +static inline _Complex double vhsum_cmplx(Tv a, Tv b) { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); } static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, - complex double * restrict c1, complex double * restrict c2) + _Complex double * restrict c1, _Complex double * restrict c2) { *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); } static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex double * restrict cc) + _Complex double * restrict cc) { vhsum_cmplx2(a,b,c,d,cc,cc+1); } #endif diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 57a3ecd..2dcee90 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -35,32 +35,55 @@ #include "sharp_vecsupport.h" #include "sharp_complex_hacks.h" #include "sharp.h" -#include "sharp_core.h" +#include "sharp_internal.h" #include "c_utils.h" typedef complex double dcmplx; -#define nvec (256/VLEN) +#define nv0 (128/VLEN) +#define nvx (64/VLEN) -typedef union - { Tv v; double s[VLEN]; } Tvu; +typedef Tv Tbv0[nv0]; +typedef double Tbs0[nv0*VLEN]; typedef struct - { Tv v[nvec]; } Tb; - -typedef union - { Tb b; double s[VLEN*nvec]; } Tbu; + { + Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; + } s0data_v; typedef struct - { Tb r, i; } Tbri; - -typedef struct - { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri; + { + Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i; + } s0data_s; typedef union - { Tbri b; Tsri s; } Tburi; + { + s0data_v v; + s0data_s s; + } s0data_u; -static void Tvnormalize (Tv * restrict val, Tv * restrict scale, +typedef Tv Tbvx[nvx]; +typedef double Tbsx[nvx*VLEN]; + +typedef struct + { + Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + } sxdata_v; + +typedef struct + { + Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, + p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; + } sxdata_s; + +typedef union + { + sxdata_v v; + sxdata_s s; + } sxdata_u; + +static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, double maxval) { const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); @@ -85,7 +108,6 @@ static void mypow(Tv val, int npow, const double * restrict powlimit, Tv * restrict resd, Tv * restrict ress) { Tv vminv=vload(powlimit[npow]); - Tv res=vone; Tm mask = vlt(vabs(val),vminv); if (!vanyTrue(mask)) // no underflows possible, use quick algoritm { @@ -93,8 +115,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit, do { if (npow&1) - vmuleq(res,val); - vmuleq(val,val); + res*=val; + val*=val; } while(npow>>=1); *resd=res; @@ -108,12 +130,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit, { if (npow&1) { - vmuleq(res,val); - vaddeq(scale,scaleint); + res*=val; + scale+=scaleint; Tvnormalize(&res,&scale,sharp_fbighalf); } - vmuleq(val,val); - vaddeq(scaleint,scaleint); + val*=val; + scaleint+=scaleint; Tvnormalize(&val,&scaleint,sharp_fbighalf); } while(npow>>=1); @@ -122,9 +144,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit, } } -static void getCorfac(Tv scale, Tv * restrict corfac, +static inline void getCorfac(Tv scale, Tv * restrict corfac, const double * restrict cf) { + typedef union + { Tv v; double s[VLEN]; } Tvu; + Tvu sc, corf; sc.v=scale; for (int i=0; im; Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale); + Tv limscale=vload(sharp_limscale); int below_limit = 1; for (int i=0; iv[i]=vzero; - mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]); - lam_2->v[i] *= mfac; - Tvnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol); - below_limit &= vallTrue(vlt(scale->v[i],limscale)); + d->lam1[i]=vzero; + mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]); + d->lam2[i] *= mfac; + Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); + below_limit &= vallTrue(vlt(d->scale[i],limscale)); } while (below_limit) @@ -158,25 +195,17 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_, r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]); for (int i=0; iv[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i]; - lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i]; - Tm mask = vgt(vabs(lam_2->v[i]),vload(sharp_ftol)); - if (vanyTrue(mask)) - { - vmuleq_mask(mask,lam_1->v[i],fsmall); - vmuleq_mask(mask,lam_2->v[i],fsmall); - vaddeq_mask(mask,scale->v[i],vone); - below_limit &= vallTrue(vlt(scale->v[i],limscale)); - } + d->lam1[i] = r10*d->cth[i]*d->lam2[i] - r11*d->lam1[i]; + d->lam2[i] = r20*d->cth[i]*d->lam1[i] - r21*d->lam2[i]; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scale[i],limscale)); } l+=2; } *l_=l; } - -NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1, - Tbri * restrict p2, Tb lam_1, Tb lam_2, +NOINLINE static void alm2map_kernel(s0data_v * restrict d, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, int l, int lmax, int nv2) { @@ -188,49 +217,23 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1, f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); for (int i=0; ir.v[i],lam_2.v[i],ar1); - vfmaeq(p1->i.v[i],lam_2.v[i],ai1); - lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i]; - vfmaeq(p2->r.v[i],lam_1.v[i],ar2); - vfmaeq(p2->i.v[i],lam_1.v[i],ai2); + d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i]; + d->p1r[i] += d->lam2[i]*ar1; + d->p1i[i] += d->lam2[i]*ai1; + d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i]; + d->p2r[i] += d->lam1[i]*ar2; + d->p2i[i] += d->lam1[i]*ai2; } l+=2; } } -NOINLINE static void map2alm_kernel (const Tb cth, - const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, - int lmax, int nv2) - { - while (l<=lmax) - { - Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), - f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); - Tv atmp[4] = {vzero, vzero, vzero, vzero}; - for (int i=0; ir.v[i]); - vfmaeq(atmp[1],lam_2.v[i],p1->i.v[i]); - lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i]; - vfmaeq(atmp[2],lam_1.v[i],p2->r.v[i]); - vfmaeq(atmp[3],lam_1.v[i],p2->i.v[i]); - } - vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); - l+=2; - } - } - -NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1, - Tbri * restrict p2, int nth) +NOINLINE static void calc_alm2map (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) { int l,lmax=gen->lmax; - Tb lam_1,lam_2,scale; int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2); + iter_to_ieee(gen, d, &l, nv2); job->opcnt += (l-gen->m) * 4*nth; if (l>lmax) return; job->opcnt += (lmax+1-l) * 8*nth; @@ -238,11 +241,10 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const dcmplx * restrict alm=job->almtmp; int full_ieee=1; - Tb corfac; for (int i=0; icf); - full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } while((!full_ieee) && (l<=lmax)) @@ -254,21 +256,17 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, full_ieee=1; for (int i=0; ir.v[i],lam_2.v[i]*corfac.v[i],ar1); - vfmaeq(p1->i.v[i],lam_2.v[i]*corfac.v[i],ai1); - lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i]; - Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol)); - if (vanyTrue(mask)) + d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i]; + d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1; + d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1; + d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i]; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) { - vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall)); - vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall)); - vaddeq_mask(mask,scale.v[i],vone); - getCorfac(scale.v[i], &corfac.v[i], gen->cf); - full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } - vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2); - vfmaeq(p2->i.v[i],lam_1.v[i]*corfac.v[i],ai2); + d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2; + d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2; } l+=2; } @@ -276,21 +274,42 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, for (int i=0; ilam1[i] *= d->corfac[i]; + d->lam2[i] *= d->corfac[i]; } - alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2); + alm2map_kernel(d, rf, alm, l, lmax, nv2); } -NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1, - const Tbri * restrict p2, int nth) +NOINLINE static void map2alm_kernel(s0data_v * restrict d, + const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, + int lmax, int nv2) + { + while (l<=lmax) + { + Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]), + f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]); + Tv atmp[4] = {vzero, vzero, vzero, vzero}; + for (int i=0; ilam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i]; + atmp[0] += d->lam2[i]*d->p1r[i]; + atmp[1] += d->lam2[i]*d->p1i[i]; + d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i]; + atmp[2] += d->lam1[i]*d->p2r[i]; + atmp[3] += d->lam1[i]*d->p2i[i]; + } + vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); + l+=2; + } + } + +NOINLINE static void calc_map2alm(sharp_job * restrict job, + const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth) { int lmax=gen->lmax; - Tb lam_1,lam_2,scale; int l=gen->m; int nv2 = (nth+VLEN-1)/VLEN; - iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2); + iter_to_ieee(gen, d, &l, nv2); job->opcnt += (l-gen->m) * 4*nth; if (l>lmax) return; job->opcnt += (lmax+1-l) * 8*nth; @@ -298,11 +317,10 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, const sharp_ylmgen_dbl2 * restrict rf = gen->rf; dcmplx * restrict alm=job->almtmp; int full_ieee=1; - Tb corfac; for (int i=0; icf); - full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } while ((!full_ieee) && (l<=lmax)) @@ -313,21 +331,17 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, Tv atmp[4] = {vzero, vzero, vzero, vzero}; for (int i=0; ir.v[i]); - vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]); - lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i]; - Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol)); - if (vanyTrue(mask)) + d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i]; + atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i]; + atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i]; + d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i]; + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) { - vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall)); - vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall)); - vaddeq_mask(mask,scale.v[i],vone); - getCorfac(scale.v[i], &corfac.v[i], gen->cf); - full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + getCorfac(d->scale[i], &d->corfac[i], gen->cf); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } - vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]); - vfmaeq(atmp[3],lam_1.v[i]*corfac.v[i],p2->i.v[i]); + atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i]; + atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i]; } vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); l+=2; @@ -335,10 +349,418 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, for (int i=0; ilam1[i] *= d->corfac[i]; + d->lam2[i] *= d->corfac[i]; } - map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2); + map2alm_kernel(d, rf, alm, l, lmax, nv2); + } + +NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, + sxdata_v * restrict d, int * restrict l_, int nv2) + { + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + Tv prefac=vload(gen->prefac[gen->m]), + prescale=vload(gen->fscale[gen->m]); + Tv limscale=vload(sharp_limscale); + int below_limit=1; + for (int i=0; icth[i])*vload(0.5))); + Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); + Tm mask=vlt(d->sth[i],vzero); + vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); + vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); + + Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; + mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); + mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); + mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); + mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); + + d->l1p[i] = vzero; + d->l1m[i] = vzero; + d->l2p[i] = prefac*ccp; + d->scp[i] = prescale+ccps; + d->l2m[i] = prefac*csp; + d->scm[i] = prescale+csps; + Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf); + Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf); + d->l2p[i] *= ssp; + d->scp[i] += ssps; + d->l2m[i] *= scp; + d->scm[i] += scps; + if (gen->preMinus_p) + d->l2p[i] = vneg(d->l2p[i]); + if (gen->preMinus_m) + d->l2m[i] = vneg(d->l2m[i]); + if (gen->s&1) + d->l2p[i] = vneg(d->l2p[i]); + + Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); + Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); + + below_limit &= vallTrue(vlt(d->scm[i],limscale)) && + vallTrue(vlt(d->scp[i],limscale)); + } + + int l=gen->mhi; + + while (below_limit) + { + if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} + below_limit=1; + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || + rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scp[i],limscale)) && + vallTrue(vlt(d->scm[i],limscale)); + } + l+=2; + } + + *l_=l; + } + +NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, + int l, int lmax, int nv2) + { + while (l<=lmax) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw1=d->l2p[i]+d->l2m[i]; + Tv lx2=d->l1m[i]-d->l1p[i]; + d->p1pr[i] += agr1*lw1 - aci2*lx2; + d->p1pi[i] += agi1*lw1 + acr2*lx2; + d->p1mr[i] += acr1*lw1 + agi2*lx2; + d->p1mi[i] += aci1*lw1 - agr2*lx2; + Tv lx1=d->l2m[i]-d->l2p[i]; + Tv lw2=d->l1p[i]+d->l1m[i]; + d->p2pr[i] += agr2*lw2 - aci1*lx1; + d->p2pi[i] += agi2*lw2 + acr1*lx1; + d->p2mr[i] += acr2*lw2 + agi1*lx1; + d->p2mi[i] += aci2*lw2 - agr1*lx1; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + } + l+=2; + } + } + +NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 10*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 28*nth; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + const dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i]; + Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i]; + d->p1pr[i] += agr1*lw1 - aci2*lx2; + d->p1pi[i] += agi1*lw1 + acr2*lx2; + d->p1mr[i] += acr1*lw1 + agi2*lx2; + d->p1mi[i] += aci1*lw1 - agr2*lx2; + Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i]; + Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i]; + d->p2pr[i] += agr2*lw2 - aci1*lx1; + d->p2pi[i] += agi2*lw2 + acr1*lx1; + d->p2mr[i] += acr2*lw2 + agi1*lx1; + d->p2mi[i] += aci2*lw2 - agr1*lx1; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + { + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + } + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + { + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + } + l+=2; + } + if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + alm2map_spin_kernel(d, fx, alm, l, lmax, nv2); + } + +NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, + int l, int lmax, int nv2) + { + while (l<=lmax) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw = d->l2p[i] + d->l2m[i]; + Tv lx = d->l2m[i] - d->l2p[i]; + agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;; + agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx; + acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx; + aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + lw = d->l1p[i] + d->l1m[i]; + lx = d->l1m[i] - d->l1p[i]; + agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx; + agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx; + acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx; + aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx; + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + } + +NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 10*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 28*nth; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i]; + Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i]; + agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx; + agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx; + acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx; + aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i]; + lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i]; + agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx; + agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx; + acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx; + aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + { + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + } + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + { + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + map2alm_spin_kernel(d, fx, alm, l, lmax, nv2); + } + + +NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, + int l, int lmax, int nv2) + { + while (l<=lmax) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw=d->l2p[i]+d->l2m[i]; + d->p1pr[i] += ar1*lw; + d->p1pi[i] += ai1*lw; + Tv lx=d->l2m[i]-d->l2p[i]; + d->p2mr[i] += ai1*lx; + d->p2mi[i] -= ar1*lx; + lw=d->l1p[i]+d->l1m[i]; + d->p2pr[i] += ar2*lw; + d->p2pi[i] += ai2*lw; + lx=d->l1m[i]-d->l1p[i]; + d->p1mr[i] += ai2*lx; + d->p1mi[i] -= ar2*lx; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + } + l+=2; + } + } + +NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) + { + int l,lmax=gen->lmax; + int nv2 = (nth+VLEN-1)/VLEN; + iter_to_ieee_spin(gen, d, &l, nv2); + job->opcnt += (l-gen->mhi) * 10*nth; + if (l>lmax) return; + job->opcnt += (lmax+1-l) * 20*nth; + + const sharp_ylmgen_dbl3 * restrict fx = gen->fx; + const dcmplx * restrict alm=job->almtmp; + int full_ieee=1; + for (int i=0; iscp[i], &d->cfp[i], gen->cf); + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + + while((!full_ieee) && (l<=lmax)) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]), + fx12=vload(fx[l+1].f[2]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]), + fx22=vload(fx[l+2].f[2]); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + full_ieee=1; + for (int i=0; il1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i]; + d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i]; + Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i]; + d->p1pr[i] += ar1*lw; + d->p1pi[i] += ai1*lw; + Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i]; + d->p2mr[i] += ai1*lx; + d->p2mi[i] -= ar1*lx; + lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i]; + d->p2pr[i] += ar2*lw; + d->p2pi[i] += ai2*lw; + lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i]; + d->p1mr[i] += ai2*lx; + d->p1mi[i] -= ar2*lx; + d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i]; + d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i]; + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + { + getCorfac(d->scp[i], &d->cfp[i], gen->cf); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + } + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + { + getCorfac(d->scm[i], &d->cfm[i], gen->cf); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + } + } + l+=2; + } + if (l>lmax) return; + + for (int i=0; il1p[i] *= d->cfp[i]; + d->l2p[i] *= d->cfp[i]; + d->l1m[i] *= d->cfm[i]; + d->l2m[i] *= d->cfm[i]; + } + alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2); } @@ -348,7 +770,6 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; sharp_Ylmgen_prepare (gen, m); @@ -359,19 +780,20 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, { if (job->spin==0) { + const int nval=nv0*VLEN; int ith=0; - int itgt[nvec*VLEN]; + int itgt[nval]; while (ith=m) { itgt[nth] = ith; - cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith]; + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; ++nth; } else @@ -386,16 +808,17 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, int i2=((nth+VLEN-1)/VLEN)*VLEN; for (int i=nth; is_th + mi*job->s_m; - complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I, - r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I; + complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, + r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; job->phase[phas_idx] = r1+r2; if (ispair[tgt]) job->phase[phas_idx+1] = r1-r2; @@ -405,7 +828,66 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, } else { - UTIL_FAIL("only spin==0 allowed at the moment"); + const int nval=nvx*VLEN; + int ith=0; + int itgt[nval]; + while (ith=m) + { + itgt[nth] = ith; + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; + ++nth; + } + else + { + int phas_idx = ith*job->s_th + mi*job->s_m; + job->phase[phas_idx ] = job->phase[phas_idx+1] = 0; + job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; itype==SHARP_ALM2MAP) ? + calc_alm2map_spin (job, gen, &d.v, nth) : + calc_alm2map_deriv1(job, gen, &d.v, nth); + for (int i=0; is_th + mi*job->s_m; + complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, + q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, + u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, + u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; + job->phase[phas_idx ] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[tgt]) + { + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + *phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } + } + } + } + } } break; } @@ -421,7 +903,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; sharp_Ylmgen_prepare (gen, m); @@ -431,22 +912,22 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, { if (job->spin==0) { + const int nval=nv0*VLEN; int ith=0; while (ith=m) { - cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith]; + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; int phas_idx = ith*job->s_th + mi*job->s_m; dcmplx ph1=job->phase[phas_idx]; dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; - p1.s.r[nth]=creal(ph1+ph2); p1.s.i[nth]=cimag(ph1+ph2); - p2.s.r[nth]=creal(ph1-ph2); p2.s.i[nth]=cimag(ph1-ph2); + d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); + d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); ++nth; } ++ith; @@ -456,17 +937,55 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, int i2=((nth+VLEN-1)/VLEN)*VLEN; for (int i=nth; i=m) + { + d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith]; + int phas_idx = ith*job->s_th + mi*job->s_m; + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[ith] ? job->phase[phas_idx+1]:0., + p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); + d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); + d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); + d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); + ++nth; + } + ++ith; + } + if (nth>0) + { + int i2=((nth+VLEN-1)/VLEN)*VLEN; + for (int i=nth; i -#include "sharp_lowlevel.h" +#include "sharp.h" #include "sharp_geomhelpers.h" #include "sharp_almhelpers.h" diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h index b7f98c4..1c77e27 100644 --- a/libsharp/sharp_geomhelpers.h +++ b/libsharp/sharp_geomhelpers.h @@ -32,7 +32,7 @@ #ifndef PLANCK_SHARP_GEOMHELPERS_H #define PLANCK_SHARP_GEOMHELPERS_H -#include "sharp_lowlevel.h" +#include "sharp.h" #ifdef __cplusplus extern "C" { diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h index 11f23cb..635aeb8 100644 --- a/libsharp/sharp_internal.h +++ b/libsharp/sharp_internal.h @@ -25,7 +25,7 @@ /*! \file sharp_internal.h * Internally used functionality for the spherical transform library. * - * Copyright (C) 2006-2013 Max-Planck-Society + * Copyright (C) 2006-2018 Max-Planck-Society * \author Martin Reinecke \author Dag Sverre Seljebotn */ @@ -36,7 +36,9 @@ #error This header file cannot be included from C++, only from C #endif +#include #include "sharp.h" +#include "sharp_ylmgen_c.h" #define SHARP_MAXTRANS 100 @@ -58,8 +60,13 @@ typedef struct unsigned long long opcnt; } sharp_job; -int sharp_get_nv_max (void); -int sharp_nv_oracle (sharp_jobtype type, int spin); int sharp_get_mlim (int lmax, int spin, double sth, double cth); +void inner_loop (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, + const int *mlim); + +int sharp_veclen(void); +int sharp_max_nvec(void); + #endif diff --git a/libsharp/sharp_lowlevel.h b/libsharp/sharp_lowlevel.h deleted file mode 100644 index 2e7ab24..0000000 --- a/libsharp/sharp_lowlevel.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ - -/*! \file sharp_lowlevel.h - * Low-level, portable interface for the spherical transform library. - * - * Copyright (C) 2012-2013 Max-Planck-Society - * \author Martin Reinecke \author Dag Sverre Seljebotn - */ - -#ifndef PLANCK_SHARP_LOWLEVEL_H -#define PLANCK_SHARP_LOWLEVEL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/*! \internal - Helper type containing information about a single ring. */ -typedef struct - { - double theta, phi0, weight, cth, sth; - ptrdiff_t ofs; - int nph, stride; - } sharp_ringinfo; - -/*! \internal - Helper type containing information about a pair of rings with colatitudes - symmetric around the equator. */ -typedef struct - { - sharp_ringinfo r1,r2; - } sharp_ringpair; - -/*! \internal - Type holding all required information about a map geometry. */ -typedef struct - { - sharp_ringpair *pair; - int npairs, nphmax; - } sharp_geom_info; - -/*! \defgroup almgroup Helpers for dealing with a_lm */ -/*! \{ */ - -/*! \internal - Helper type for index calculation in a_lm arrays. */ -typedef struct - { - /*! Maximum \a l index of the array */ - int lmax; - /*! Number of different \a m values in this object */ - int nm; - /*! Array with \a nm entries containing the individual m values */ - int *mval; - /*! Combination of flags from sharp_almflags */ - int flags; - /*! Array with \a nm entries containing the (hypothetical) indices of - the coefficients with quantum numbers 0,\a mval[i] */ - ptrdiff_t *mvstart; - /*! Stride between a_lm and a_(l+1),m */ - ptrdiff_t stride; - } sharp_alm_info; - -/*! alm_info flags */ -typedef enum { SHARP_PACKED = 1, - /*!< m=0-coefficients are packed so that the (zero) imaginary part is - not present. mvstart is in units of *real* float/double for all - m; stride is in units of reals for m=0 and complex for m!=0 */ - SHARP_REAL_HARMONICS = 1<<6 - /*!< Use the real spherical harmonic convention. For - m==0, the alm are treated exactly the same as in - the complex case. For m!=0, alm[i] represent a - pair (+abs(m), -abs(m)) instead of (real, imag), - and the coefficients are scaled by a factor of - sqrt(2) relative to the complex case. In other - words, (sqrt(.5) * alm[i]) recovers the - corresponding complex coefficient (when accessed - as complex). - */ - } sharp_almflags; - - - -/*! Creates an a_lm data structure from the following parameters: - \param lmax maximum \a l quantum number (>=0) - \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax) - \param stride the stride between entries with identical \a m, and \a l - differing by 1. - \param mstart the index of the (hypothetical) coefficient with the - quantum numbers 0,\a m. Must have \a mmax+1 entries. - \param alm_info will hold a pointer to the newly created data structure - */ -void sharp_make_alm_info (int lmax, int mmax, int stride, - const ptrdiff_t *mstart, sharp_alm_info **alm_info); -/*! Creates an a_lm data structure which from the following parameters: - \param lmax maximum \a l quantum number (\a >=0) - \param nm number of different \a m (\a 0<=nm<=lmax+1) - \param stride the stride between entries with identical \a m, and \a l - differing by 1. - \param mval array with \a nm entries containing the individual m values - \param mvstart array with \a nm entries containing the (hypothetical) - indices of the coefficients with the quantum numbers 0,\a mval[i] - \param flags a combination of sharp_almflags (pass 0 unless you know you need this) - \param alm_info will hold a pointer to the newly created data structure - */ -void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, - const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info); -/*! Returns the index of the coefficient with quantum numbers \a l, - \a mval[mi]. - \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is - the index for the coefficient with the quantum numbers \a l, \a mi. */ -ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi); -/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED - flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0), - otherwise it is the number of complex coefficients (with m>=0). */ -ptrdiff_t sharp_alm_count(const sharp_alm_info *self); -/*! Deallocates the a_lm info object. */ -void sharp_destroy_alm_info (sharp_alm_info *info); - -/*! \} */ - -/*! \defgroup geominfogroup Functions for dealing with geometry information */ -/*! \{ */ - -/*! Creates a geometry information from a set of ring descriptions. - All arrays passed to this function must have \a nrings elements. - \param nrings the number of rings in the map - \param nph the number of pixels in each ring - \param ofs the index of the first pixel in each ring in the map array - \param stride the stride between consecutive pixels - \param phi0 the azimuth (in radians) of the first pixel in each ring - \param theta the colatitude (in radians) of each ring - \param wgt the pixel weight to be used for the ring in map2alm - and adjoint map2alm transforms. - Pass NULL to use 1.0 as weight for all rings. - \param geom_info will hold a pointer to the newly created data structure - */ -void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, - const int *stride, const double *phi0, const double *theta, - const double *wgt, sharp_geom_info **geom_info); - -/*! Counts the number of grid points needed for (the local part of) a map described - by \a info. - */ -ptrdiff_t sharp_map_size(const sharp_geom_info *info); - -/*! Deallocates the geometry information in \a info. */ -void sharp_destroy_geom_info (sharp_geom_info *info); - -/*! \} */ - -/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */ -/*! \{ */ - -/*! Enumeration of SHARP job types. */ -typedef enum { SHARP_YtW=0, /*!< analysis */ - SHARP_MAP2ALM=SHARP_YtW, /*!< analysis */ - SHARP_Y=1, /*!< synthesis */ - SHARP_ALM2MAP=SHARP_Y, /*!< synthesis */ - SHARP_Yt=2, /*!< adjoint synthesis */ - SHARP_WY=3, /*!< adjoint analysis */ - SHARP_ALM2MAP_DERIV1=4 /*!< synthesis of first derivatives */ - } sharp_jobtype; - -/*! Job flags */ -typedef enum { SHARP_DP = 1<<4, - /*!< map and a_lm are in double precision */ - SHARP_ADD = 1<<5, - /*!< results are added to the output arrays, instead of - overwriting them */ - - /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags, - but its use here is deprecated in favor of having it in the sharp_alm_info */ - - SHARP_NO_FFT = 1<<7, - - SHARP_USE_WEIGHTS = 1<<20, /* internal use only */ - SHARP_NO_OPENMP = 1<<21, /* internal use only */ - } sharp_jobflags; - -/*! Performs a libsharp SHT job. The interface deliberately does not use - the C99 "complex" data type, in order to be callable from C89 and C++. - \param type the type of SHT - \param spin the spin of the quantities to be transformed - \param alm contains pointers to the a_lm coefficients. If \a spin==0, - alm[0] points to the a_lm of the first SHT, alm[1] to those of the second - etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT, - alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm - depends on whether the SHARP_DP flag is set. - \param map contains pointers to the maps. If \a spin==0, - map[0] points to the map of the first SHT, map[1] to that of the second - etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1] - point to the maps of the first SHT, map[2] and map[3] to those of the - second, etc. The exact data type of \a map depends on whether the SHARP_DP - flag is set. - \param geom_info A \c sharp_geom_info object compatible with the provided - \a map arrays. - \param alm_info A \c sharp_alm_info object compatible with the provided - \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present - exactly once. - \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then - \a alm is expected to have the type "complex double **" and \a map is - expected to have the type "double **"; otherwise, the expected - types are "complex float **" and "float **", respectively. - \param time If not NULL, the wall clock time required for this SHT - (in seconds) will be written here. - \param opcnt If not NULL, a conservative estimate of the total floating point - operation count for this SHT will be written here. */ -void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, - const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, - int flags, double *time, unsigned long long *opcnt); - -void sharp_set_chunksize_min(int new_chunksize_min); -void sharp_set_nchunks_max(int new_nchunks_max); - - -typedef enum { SHARP_ERROR_NO_MPI = 1, - /*!< libsharp not compiled with MPI support */ - } sharp_errors; - -/*! Works like sharp_execute_mpi, but is always present whether or not libsharp - is compiled with USE_MPI. This is primarily useful for wrapper code etc. - - Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid - pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi - is *(MPI_Comm*)pcomm. - - Other parameters are the same as sharp_execute_mpi. - - Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available - (in which case nothing is done). - */ -int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin, - void *alm, void *map, const sharp_geom_info *geom_info, - const sharp_alm_info *alm_info, int flags, double *time, - unsigned long long *opcnt); - - - -/*! \} */ - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h index df07117..73a8aa0 100644 --- a/libsharp/sharp_mpi.h +++ b/libsharp/sharp_mpi.h @@ -33,7 +33,7 @@ #define PLANCK_SHARP_MPI_H #include -#include "sharp_lowlevel.h" +#include "sharp.h" #ifdef __cplusplus extern "C" { diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c index 4b124af..f712ae4 100644 --- a/libsharp/sharp_testsuite.c +++ b/libsharp/sharp_testsuite.c @@ -24,7 +24,7 @@ /* \file sharp_testsuite.c * - * Copyright (C) 2012-2013 Max-Planck-Society + * Copyright (C) 2012-2018 Max-Planck-Society * \author Martin Reinecke */ @@ -375,7 +375,6 @@ static void check_sign_scale(void) UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12), "error"); -#if 0 sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP, NULL,NULL); UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12), @@ -420,7 +419,6 @@ static void check_sign_scale(void) "error"); UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12), "error"); -#endif DEALLOC2D(map); DEALLOC2D(alm); @@ -430,7 +428,7 @@ static void check_sign_scale(void) } static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo, - int spin, int nv, double **err_abs, double **err_rel, + int spin, double **err_abs, double **err_rel, double *t_a2m, double *t_m2a, unsigned long long *op_a2m, unsigned long long *op_m2a) { @@ -450,20 +448,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo, #ifdef USE_MPI sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo, - ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m); + ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m); #else sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo, - SHARP_DP|nv,t_a2m,op_a2m); + SHARP_DP,t_a2m,op_a2m); #endif if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m); if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m); double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp); #ifdef USE_MPI sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo, - ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a); + ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a); #else sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo, - SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a); + SHARP_DP|SHARP_ADD,t_m2a,op_m2a); #endif if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a); if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a); @@ -475,11 +473,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo, } static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo, - int spin, int nv) + int spin) { int ncomp = (spin==0) ? 1 : 2; double *err_abs, *err_rel; - do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL, + do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL, NULL, NULL); for (int i=0; i=8,"usage: grid lmax mmax geom1 geom2 spin"); - int lmax=atoi(argv[3]); - int mmax=atoi(argv[4]); - int gpar1=atoi(argv[5]); - int gpar2=atoi(argv[6]); - int spin=atoi(argv[7]); - - if (mytask==0) printf("Testing map analysis accuracy.\n"); - if (mytask==0) printf("spin=%d\n", spin); - - sharp_geom_info *ginfo; - sharp_alm_info *ainfo; - get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo); - - double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30; - unsigned long long opa2m_min=0, opm2a_min=0; - int nvmin_a2m=-1, nvmin_m2a=-1; - for (int nv=0; nv<=6; ++nv) - { - int ntries=0; - double tacc=0; - do - { - double t_a2m, t_m2a; - unsigned long long op_a2m, op_m2a; - double *err_abs,*err_rel; - do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, - &t_a2m, &t_m2a, &op_a2m, &op_m2a); - - DEALLOC(err_abs); - DEALLOC(err_rel); - tacc+=t_a2m+t_m2a; - ++ntries; - if (nv==0) - { - if (t_a2mb) ? a : b; } @@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } #endif typedef __m128d Tv; -typedef __m128 Tv_s; typedef __m128d Tm; #if defined(__SSE4_1__) @@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vzero _mm_setzero_pd() #define vone _mm_set1_pd(1.) -#define vadd(a,b) _mm_add_pd(a,b) -#define vadd_s(a,b) _mm_add_ps(a,b) -#define vaddeq(a,b) a=_mm_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero)) -#define vsub(a,b) _mm_sub_pd(a,b) -#define vsub_s(a,b) _mm_sub_ps(a,b) -#define vsubeq(a,b) a=_mm_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero)) -#define vmul(a,b) _mm_mul_pd(a,b) -#define vmul_s(a,b) _mm_mul_ps(a,b) -#define vmuleq(a,b) a=_mm_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone)) -#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c)) -#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c)) -#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c)) -#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c) -#define vfmaaeq(a,b,c,d,e) \ - a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) -#define vfmaseq(a,b,c,d,e) \ - a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))) #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) #define vload(a) _mm_set1_pd(a) -#define vload_s(a) _mm_set1_ps(a) #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) #define vsqrt(a) _mm_sqrt_pd(a) #define vlt(a,b) _mm_cmplt_pd(a,b) @@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vallTrue(a) (_mm_movemask_pd(a)==3) -#define vloadu(p) _mm_loadu_pd(p) -#define vloadu_s(p) _mm_loadu_ps(p) -#define vstoreu(p, v) _mm_storeu_pd(p, v) -#define vstoreu_s(p, v) _mm_storeu_ps(p, v) #endif @@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #endif typedef __m256d Tv; -typedef __m256 Tv_s; typedef __m256d Tm; #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) #define vzero _mm256_setzero_pd() #define vone _mm256_set1_pd(1.) -#define vadd(a,b) _mm256_add_pd(a,b) -#define vadd_s(a,b) _mm256_add_ps(a,b) -#define vaddeq(a,b) a=_mm256_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero)) -#define vsub(a,b) _mm256_sub_pd(a,b) -#define vsub_s(a,b) _mm256_sub_ps(a,b) -#define vsubeq(a,b) a=_mm256_sub_pd(a,b) #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero)) -#define vmul(a,b) _mm256_mul_pd(a,b) -#define vmul_s(a,b) _mm256_mul_ps(a,b) -#define vmuleq(a,b) a=_mm256_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone)) -#if (USE_FMA4) -#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) -#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a) -#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) -#define vabmc(a,b,c) _mm256_msub_pd(a,b,c) -#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) -#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) -#else -#if (USE_FMA) -#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a) -#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a) -#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a) -#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c) -#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a)) -#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a)) -#else -#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) -#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c)) -#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) -#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c) -#define vfmaaeq(a,b,c,d,e) \ - a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) -#define vfmaseq(a,b,c,d,e) \ - a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) -#endif -#endif #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vload(a) _mm256_set1_pd(a) -#define vload_s(a) _mm256_set1_ps(a) #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) #define vsqrt(a) _mm256_sqrt_pd(a) #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) @@ -222,11 +141,6 @@ typedef __m256d Tm; #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vallTrue(a) (_mm256_movemask_pd(a)==15) -#define vloadu(p) _mm256_loadu_pd(p) -#define vloadu_s(p) _mm256_loadu_ps(p) -#define vstoreu(p, v) _mm256_storeu_pd(p, v) -#define vstoreu_s(p, v) _mm256_storeu_ps(p, v) - #endif #if (VLEN==8) @@ -236,20 +150,8 @@ typedef __m256d Tm; typedef __m512d Tv; typedef __mmask8 Tm; -#define vadd(a,b) _mm512_add_pd(a,b) -#define vaddeq(a,b) a=_mm512_add_pd(a,b) #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b); -#define vsub(a,b) _mm512_sub_pd(a,b) -#define vsubeq(a,b) a=_mm512_sub_pd(a,b) -#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b); -#define vmul(a,b) _mm512_mul_pd(a,b) -#define vmuleq(a,b) a=_mm512_mul_pd(a,b) #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b); -#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a) -//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a) -//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a) -#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a)) -#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a)) #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.)) #define vload(a) _mm512_set1_pd(a) #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a) diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h index 24a2e94..522cc5f 100644 --- a/libsharp/sharp_vecutil.h +++ b/libsharp/sharp_vecutil.h @@ -25,7 +25,7 @@ /*! \file sharp_vecutil.h * Functionality related to vector instruction support * - * Copyright (C) 2012,2013 Max-Planck-Society + * Copyright (C) 2012-2018 Max-Planck-Society * \author Martin Reinecke */ diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c index 548f061..e2247c8 100644 --- a/libsharp/sharp_ylmgen_c.c +++ b/libsharp/sharp_ylmgen_c.c @@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin) else { gen->m=gen->mlo=gen->mhi=-1234567890; - ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2); - for (int m=0; mlmax+2; ++m) + ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3); + for (int m=0; mlmax+3; ++m) gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.; - ALLOC(gen->inv,double,gen->lmax+1); + ALLOC(gen->inv,double,gen->lmax+2); gen->inv[0]=0; - for (int m=1; mlmax+1; ++m) gen->inv[m]=1./m; - ALLOC(gen->flm1,double,2*gen->lmax+1); - ALLOC(gen->flm2,double,2*gen->lmax+1); - for (int m=0; m<2*gen->lmax+1; ++m) + for (int m=1; mlmax+2; ++m) gen->inv[m]=1./m; + ALLOC(gen->flm1,double,2*gen->lmax+3); + ALLOC(gen->flm2,double,2*gen->lmax+3); + for (int m=0; m<2*gen->lmax+3; ++m) { gen->flm1[m] = sqrt(1./(m+1.)); gen->flm2[m] = sqrt(m/(m+1.)); @@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m) if (!ms_similar) { - for (int l=gen->mhi; llmax; ++l) + for (int l=gen->mhi; llmax+1; ++l) { double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m] *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];