This commit is contained in:
Martin Reinecke 2018-10-26 14:36:25 +02:00
parent dce3c2b430
commit 18c82762c3
15 changed files with 424 additions and 219 deletions

View file

@ -25,7 +25,7 @@
/* /*
* Convenience functions * Convenience functions
* *
* Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society * Copyright (C) 2008-2017 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg)
/* This function tries to avoid allocations with a total size close to a high /* This function tries to avoid allocations with a total size close to a high
power of two (called the "critical stride" here), by adding a few more bytes power of two (called the "critical stride" here), by adding a few more bytes
if necssary. This lowers the probability that two arrays differ by a multiple if necessary. This lowers the probability that two arrays differ by a multiple
of the critical stride in their starting address, which in turn lowers the of the critical stride in their starting address, which in turn lowers the
risk of cache line contention. */ risk of cache line contention. */
static size_t manipsize(size_t sz) static size_t manipsize(size_t sz)
@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz)
{ {
void *res; void *res;
if (sz==0) return NULL; if (sz==0) return NULL;
res = _mm_malloc(manipsize(sz),16); res = _mm_malloc(manipsize(sz),32);
UTIL_ASSERT(res,"_mm_malloc() failed"); UTIL_ASSERT(res,"_mm_malloc() failed");
return res; return res;
} }

View file

@ -25,7 +25,7 @@
/*! \file c_utils.h /*! \file c_utils.h
* Convenience functions * Convenience functions
* *
* Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society * Copyright (C) 2008-2017 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
* \note This file should only be included from .c files, NOT from .h files. * \note This file should only be included from .c files, NOT from .h files.
*/ */
@ -144,4 +144,10 @@ void util_free_ (void *ptr);
} }
#endif #endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif #endif

View file

@ -25,11 +25,12 @@
/*! \file sharp.c /*! \file sharp.c
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2006-2013 Max-Planck-Society * Copyright (C) 2006-2016 Max-Planck-Society
* \author Martin Reinecke \author Dag Sverre Seljebotn * \author Martin Reinecke \author Dag Sverre Seljebotn
*/ */
#include <math.h> #include <math.h>
#include <string.h>
#include "pocketfft/pocketfft.h" #include "pocketfft/pocketfft.h"
#include "sharp_ylmgen_c.h" #include "sharp_ylmgen_c.h"
#include "sharp_internal.h" #include "sharp_internal.h"
@ -63,7 +64,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize); *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
} }
int sharp_get_mlim (int lmax, int spin, double sth, double cth) NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{ {
double ofs=lmax*0.01; double ofs=lmax*0.01;
if (ofs<100.) ofs=100.; if (ofs<100.) ofs=100.;
@ -83,12 +84,13 @@ typedef struct
dcmplx *shiftarr; dcmplx *shiftarr;
int s_shift; int s_shift;
rfft_plan plan; rfft_plan plan;
int length;
int norot; int norot;
} ringhelper; } ringhelper;
static void ringhelper_init (ringhelper *self) static void ringhelper_init (ringhelper *self)
{ {
static ringhelper rh_null = { 0, NULL, 0, NULL, 0 }; static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
*self = rh_null; *self = rh_null;
} }
@ -99,7 +101,7 @@ static void ringhelper_destroy (ringhelper *self)
ringhelper_init(self); ringhelper_init(self);
} }
static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{ {
self->norot = (fabs(phi0)<1e-14); self->norot = (fabs(phi0)<1e-14);
if (!(self->norot)) if (!(self->norot))
@ -110,12 +112,15 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
self->phi0_ = phi0; self->phi0_ = phi0;
for (int m=0; m<=mmax; ++m) for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
// double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
} }
if (!self->plan) self->plan=make_rfft_plan(nph); if (!self->plan) self->plan=make_rfft_plan(nph);
if (nph!=(int)rfft_length(self->plan)) if (nph!=(int)self->length)
{ {
destroy_rfft_plan(self->plan); destroy_rfft_plan(self->plan);
self->plan=make_rfft_plan(nph); self->plan=make_rfft_plan(nph);
self->length=nph;
} }
} }
@ -127,6 +132,7 @@ static int ringinfo_compare (const void *xa, const void *xb)
static int ringpair_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb)
{ {
const sharp_ringpair *a=xa, *b=xb; const sharp_ringpair *a=xa, *b=xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph) if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : return (a->r1.phi0 < b->r1.phi0) ? -1 :
((a->r1.phi0 > b->r1.phi0) ? 1 : ((a->r1.phi0 > b->r1.phi0) ? 1 :
@ -261,6 +267,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info)
distribution are permissible. */ distribution are permissible. */
static int sharp_get_mmax (int *mval, int nm) static int sharp_get_mmax (int *mval, int nm)
{ {
//FIXME: if gaps are allowed, we have to search the maximum m in the array
int *mcheck=RALLOC(int,nm); int *mcheck=RALLOC(int,nm);
SET_ARRAY(mcheck,0,nm,0); SET_ARRAY(mcheck,0,nm,0);
for (int i=0; i<nm; ++i) for (int i=0; i<nm; ++i)
@ -274,7 +281,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1; return nm-1;
} }
static void ringhelper_phase2ring (ringhelper *self, NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -288,13 +295,19 @@ static void ringhelper_phase2ring (ringhelper *self,
if (nph>=2*mmax+1) if (nph>=2*mmax+1)
{ {
for (int m=0; m<=mmax; ++m) if (self->norot)
{ for (int m=0; m<=mmax; ++m)
dcmplx tmp = phase[m*pstride]*wgt; {
if(!self->norot) tmp*=self->shiftarr[m]; data[2*m]=creal(phase[m*pstride])*wgt;
data[2*m]=creal(tmp); data[2*m+1]=cimag(phase[m*pstride])*wgt;
data[2*m+1]=cimag(tmp); }
} else
for (int m=0; m<=mmax; ++m)
{
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt;
data[2*m+1]=cimag(tmp)*wgt;
}
for (int m=2*(mmax+1); m<nph+2; ++m) for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.; data[m]=0.;
} }
@ -326,7 +339,7 @@ static void ringhelper_phase2ring (ringhelper *self,
rfft_backward (self->plan, &(data[1]), 1.); rfft_backward (self->plan, &(data[1]), 1.);
} }
static void ringhelper_ring2phase (ringhelper *self, NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags) int pstride, int flags)
{ {
@ -376,7 +389,7 @@ static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.; phase[m*pstride]=0.;
} }
static void fill_map (const sharp_geom_info *ginfo, void *map, double value, NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags) int flags)
{ {
if (flags & SHARP_NO_FFT) if (flags & SHARP_NO_FFT)
@ -386,50 +399,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
if (flags&SHARP_DP) if (flags&SHARP_DP)
{ {
for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
=value;
for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
=value;
} }
else else
{ {
for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
=(float)value;
for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
=(float)value;
} }
} }
} }
else else
{ {
for (int j=0;j<ginfo->npairs;++j) if (flags&SHARP_DP)
{ {
if (flags&SHARP_DP) for (int j=0;j<ginfo->npairs;++j)
{ {
for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) double *dmap=(double *)map;
((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] if (ginfo->pair[j].r1.stride==1)
=value; memset(&dmap[ginfo->pair[j].r1.ofs],0,
for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) ginfo->pair[j].r1.nph*sizeof(double));
((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] else
=value; for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
memset(&dmap[ginfo->pair[j].r2.ofs],0,
ginfo->pair[j].r2.nph*sizeof(double));
else
for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
} }
else }
else
{
for (int j=0;j<ginfo->npairs;++j)
{ {
for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
=(float)value;
for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
=(float)value;
} }
} }
} }
} }
static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags)
{ {
#define CLEARLOOP(real_t,body) \ #define CLEARLOOP(real_t,body) \
{ \ { \
@ -465,7 +483,7 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
} }
} }
static void init_output (sharp_job *job) NOINLINE static void init_output (sharp_job *job)
{ {
if (job->flags&SHARP_ADD) return; if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM) if (job->type == SHARP_MAP2ALM)
@ -473,21 +491,21 @@ static void init_output (sharp_job *job)
clear_alm (job->ainfo,job->alm[i],job->flags); clear_alm (job->ainfo,job->alm[i],job->flags);
else else
for (int i=0; i<job->ntrans*job->nmaps; ++i) for (int i=0; i<job->ntrans*job->nmaps; ++i)
fill_map (job->ginfo,job->map[i],0.,job->flags); clear_map (job->ginfo,job->map[i],job->flags);
} }
static void alloc_phase (sharp_job *job, int nm, int ntheta) NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{ {
if (job->type==SHARP_MAP2ALM) if (job->type==SHARP_MAP2ALM)
{ {
if ((nm&1023)==0) nm+=3; // hack to avoid critical strides
job->s_m=2*job->ntrans*job->nmaps; job->s_m=2*job->ntrans*job->nmaps;
if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
job->s_th=job->s_m*nm; job->s_th=job->s_m*nm;
} }
else else
{ {
if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides
job->s_th=2*job->ntrans*job->nmaps; job->s_th=2*job->ntrans*job->nmaps;
if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
job->s_m=job->s_th*ntheta; job->s_m=job->s_th*ntheta;
} }
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
@ -502,22 +520,28 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job) static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); } { DEALLOC(job->almtmp); }
static void alm2almtmp (sharp_job *job, int lmax, int mi) NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, source_t, expr_of_x) \ #define COPY_LOOP(real_t, source_t, expr_of_x) \
for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ { \
for (int l=m; l<lmin; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \
job->almtmp[job->ntrans*job->nalm*l+i] = 0; \
for (int l=lmin; l<=lmax; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \ for (int i=0; i<job->ntrans*job->nalm; ++i) \
{ \ { \
source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \ source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \ job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
} } \
}
if (job->type!=SHARP_MAP2ALM) if (job->type!=SHARP_MAP2ALM)
{ {
ptrdiff_t ofs=job->ainfo->mvstart[mi]; ptrdiff_t ofs=job->ainfo->mvstart[mi];
int stride=job->ainfo->stride; int stride=job->ainfo->stride;
int m=job->ainfo->mval[mi]; int m=job->ainfo->mval[mi];
int lmin=(m<job->spin) ? job->spin : m;
/* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
coefficients by sqrt_one_half; here we must compensate to avoid scaling coefficients by sqrt_one_half; here we must compensate to avoid scaling
m=0 */ m=0 */
@ -562,17 +586,17 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
} }
} }
else else
SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi], memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
job->ntrans*job->nalm*(lmax+1),0.); job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
#undef COPY_LOOP #undef COPY_LOOP
} }
static void almtmp2alm (sharp_job *job, int lmax, int mi) NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{ {
#define COPY_LOOP(real_t, target_t, expr_of_x) \ #define COPY_LOOP(real_t, target_t, expr_of_x) \
for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ for (int l=lmin; l<=lmax; ++l) \
for (int i=0; i<job->ntrans*job->nalm; ++i) \ for (int i=0; i<job->ntrans*job->nalm; ++i) \
{ \ { \
dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \ dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \
@ -583,6 +607,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
ptrdiff_t ofs=job->ainfo->mvstart[mi]; ptrdiff_t ofs=job->ainfo->mvstart[mi];
int stride=job->ainfo->stride; int stride=job->ainfo->stride;
int m=job->ainfo->mval[mi]; int m=job->ainfo->mval[mi];
int lmin=(m<job->spin) ? job->spin : m;
/* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
coefficients by sqrt_two; here we must compensate to avoid scaling coefficients by sqrt_two; here we must compensate to avoid scaling
m=0 */ m=0 */
@ -629,27 +654,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP #undef COPY_LOOP
} }
static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
int rstride) const double *ringtmp, int rstride)
{ {
double **dmap = (double **)job->map; if (job->flags & SHARP_DP)
float **fmap = (float **)job->map; {
for (int i=0; i<job->ntrans*job->nmaps; ++i) double **dmap = (double **)job->map;
for (int m=0; m<ri->nph; ++m) for (int i=0; i<job->ntrans*job->nmaps; ++i)
if (job->flags & SHARP_DP) {
dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1]; double *restrict p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1];
if (ri->stride==1)
{
if (job->flags&SHARP_ADD)
for (int m=0; m<ri->nph; ++m)
p1[m] += p2[m];
else
memcpy(p1,p2,ri->nph*sizeof(double));
}
else else
for (int m=0; m<ri->nph; ++m)
p1[m*ri->stride] += p2[m];
}
}
else
{
float **fmap = (float **)job->map;
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int m=0; m<ri->nph; ++m)
fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1]; fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
}
} }
static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
int rstride) double *ringtmp, int rstride)
{ {
for (int i=0; i<job->ntrans*job->nmaps; ++i) if (job->flags & SHARP_DP)
for (int m=0; m<ri->nph; ++m) for (int i=0; i<job->ntrans*job->nmaps; ++i)
ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ? {
((double *)(job->map[i]))[ri->ofs+m*ri->stride] : double *restrict p1=&ringtmp[i*rstride+1],
((float *)(job->map[i]))[ri->ofs+m*ri->stride]; *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double));
else
for (int m=0; m<ri->nph; ++m)
p1[m] = p2[m*ri->stride];
}
else
for (int i=0; i<job->ntrans*job->nmaps; ++i)
for (int m=0; m<ri->nph; ++m)
ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
} }
static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -693,7 +747,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
} }
//FIXME: set phase to zero if not SHARP_MAP2ALM? //FIXME: set phase to zero if not SHARP_MAP2ALM?
static void map2phase (sharp_job *job, int mmax, int llim, int ulim) NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type != SHARP_MAP2ALM) return; if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -738,7 +792,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
} }
} }
static void phase2map (sharp_job *job, int mmax, int llim, int ulim) NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{ {
if (job->type == SHARP_MAP2ALM) return; if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m; int pstride = job->s_m;
@ -783,7 +837,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
} }
} }
static void sharp_execute_job (sharp_job *job) NOINLINE static void sharp_execute_job (sharp_job *job)
{ {
double timer=wallTime(); double timer=wallTime();
job->opcnt=0; job->opcnt=0;
@ -800,6 +854,7 @@ static void sharp_execute_job (sharp_job *job)
int nchunks, chunksize; int nchunks, chunksize;
get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks, get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
&chunksize); &chunksize);
//FIXME: needs to be changed to "nm"
alloc_phase (job,mmax+1,chunksize); alloc_phase (job,mmax+1,chunksize);
/* chunk loop */ /* chunk loop */

View file

@ -25,7 +25,7 @@
/*! \file sharp_almhelpers.c /*! \file sharp_almhelpers.c
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2008-2013 Max-Planck-Society * Copyright (C) 2008-2016 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */

View file

@ -25,7 +25,7 @@
/*! \file sharp_almhelpers.h /*! \file sharp_almhelpers.h
* SHARP helper function for the creation of a_lm data structures * SHARP helper function for the creation of a_lm data structures
* *
* Copyright (C) 2008-2011 Max-Planck-Society * Copyright (C) 2008-2016 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */

View file

@ -25,7 +25,7 @@
/* \file sharp_complex_hacks.h /* \file sharp_complex_hacks.h
* support for converting vector types and complex numbers * support for converting vector types and complex numbers
* *
* Copyright (C) 2012,2013 Max-Planck-Society * Copyright (C) 2012-2016 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -51,6 +51,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
complex double * restrict c1, complex double * restrict c2) complex double * restrict c1, complex double * restrict c2)
{ *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; } { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#endif #endif
#if (VLEN==2) #if (VLEN==2)
@ -94,6 +98,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
#endif #endif
} }
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex double * restrict cc)
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
#endif #endif
#if (VLEN==4) #if (VLEN==4)
@ -130,6 +138,23 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
#endif #endif
} }
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
_mm256_storeu_pd((double *)cc,
_mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
#else
union {Tv v; complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
#endif
}
#endif #endif
#if (VLEN==8) #if (VLEN==8)
@ -144,6 +169,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
*c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
} }
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex double * restrict cc)
{ vhsum_cmplx2(a,b,c,d,cc,cc+1); }
#endif #endif
#endif #endif

View file

@ -25,7 +25,7 @@
/*! \file sharp_core_inc.c /*! \file sharp_core_inc.c
* Type-dependent code for the computational core * Type-dependent code for the computational core
* *
* Copyright (C) 2012 Max-Planck-Society * Copyright (C) 2012-2017 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -73,8 +73,8 @@ static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
double maxval) double maxval)
{ {
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
for (int i=0;i<nvec; ++i) for (int i=0;i<nvec; ++i)
{ {
Tm mask = vgt(vabs(val->v[i]),vfmax); Tm mask = vgt(vabs(val->v[i]),vfmax);
@ -94,35 +94,58 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
} }
} }
static void Y(mypow) (Tb val, int npow, Tb * restrict resd, NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
Tb * restrict ress) Tb * restrict resd, Tb * restrict ress)
{ {
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); Tv vminv=vload(powlimit[npow]);
Tm mask = vlt(vabs(val.v[0]),vminv);
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); for (int i=1;i<nvec; ++i)
mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
do if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
{ {
if (npow&1) Tb res=Y(Tbconst)(1.);
do
{ {
if (npow&1)
for (int i=0; i<nvec; ++i)
{
vmuleq(res.v[i],val.v[i]);
vmuleq(val.v[i],val.v[i]);
}
else
for (int i=0; i<nvec; ++i)
vmuleq(val.v[i],val.v[i]);
}
while(npow>>=1);
*resd=res;
*ress=Y(Tbconst)(0.);
}
else
{
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
do
{
if (npow&1)
{
for (int i=0; i<nvec; ++i)
{
vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
}
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vmuleq(res.v[i],val.v[i]); vmuleq(val.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]); vaddeq(scaleint.v[i],scaleint.v[i]);
} }
Y(Tbnormalize)(&res,&scale,sharp_fbighalf); Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
} }
for (int i=0; i<nvec; ++i) while(npow>>=1);
{ *resd=res;
vmuleq(val.v[i],val.v[i]); *ress=scale;
vaddeq(scaleint.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
} }
while(npow>>=1);
*resd=res;
*ress=scale;
} }
static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
@ -179,13 +202,13 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
*corfac=corf.b; *corfac=corf.b;
} }
static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_, Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const sharp_Ylmgen_C * restrict gen) const sharp_Ylmgen_C * restrict gen)
{ {
int l=gen->m; int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale; Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
Y(mypow) (sth,l,&lam_2,&scale); Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(Tbnormalize)(&lam_2,&scale,sharp_ftol); Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
@ -193,12 +216,12 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
while (below_limit) while (below_limit)
{ {
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]); - vload(gen->rf[l].f[1])*lam_1.v[i];
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(gen->rf[l+1].f[1])*lam_2.v[i];
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,sharp_limscale); below_limit = Y(TballLt)(scale,sharp_limscale);
l+=2; l+=2;
@ -213,10 +236,8 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]); Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])), rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
vmul(fx2,rxp->v[i])); rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
vmul(fx2,rxm->v[i]));
} }
} }
@ -240,8 +261,10 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
} }
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps); Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps); Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
Tb rec2p, rec2m, scalep, scalem; Tb rec2p, rec2m, scalep, scalem;
Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.); Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);

View file

@ -25,7 +25,7 @@
/*! \file sharp_core_inc0.c /*! \file sharp_core_inc0.c
* Computational core * Computational core
* *
* Copyright (C) 2012-2013 Max-Planck-Society * Copyright (C) 2012-2018 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -34,7 +34,6 @@
#include <string.h> #include <string.h>
#include "sharp_vecsupport.h" #include "sharp_vecsupport.h"
#include "sharp_complex_hacks.h" #include "sharp_complex_hacks.h"
#include "sharp_ylmgen_c.h"
#include "sharp.h" #include "sharp.h"
#include "sharp_core.h" #include "sharp_core.h"
#include "c_utils.h" #include "c_utils.h"

View file

@ -25,11 +25,11 @@
/*! \file sharp_core_inc2.c /*! \file sharp_core_inc2.c
* Type-dependent code for the computational core * Type-dependent code for the computational core
* *
* Copyright (C) 2012-2013 Max-Planck-Society * Copyright (C) 2012-2017 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax NJ1) int l, int lmax NJ1)
@ -77,29 +77,32 @@ if (njobs>1)
} }
while (l<lmax) while (l<lmax)
{ {
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
- vload(rf[l].f[1])*lam_1.v[i];
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{ {
Tv ar=vload(creal(alm[njobs*l+j])), Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j])); ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar); p1[j].r.v[i] += lam_2.v[i]*ar;
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai); p1[j].i.v[i] += lam_2.v[i]*ai;
}
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
} }
} }
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(rf[l+1].f[1])*lam_2.v[i];
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*(l+1)+j])),
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
p2[j].r.v[i] += lam_1.v[i]*ar;
p2[j].i.v[i] += lam_1.v[i]*ai;
}
}
l+=2; l+=2;
} }
if (l==lmax) if (l==lmax)
@ -109,64 +112,57 @@ if (njobs>1)
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j])); Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar); p1[j].r.v[i] += lam_2.v[i]*ar;
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai); p1[j].i.v[i] += lam_2.v[i]*ai;
} }
} }
} }
} }
static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, NOINLINE static void Z(map2alm_kernel) (const Tb cth,
const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
NJ1) NJ1)
{ {
while (l<lmax) while (l<lmax)
{ {
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
- vload(rf[l].f[1])*lam_1.v[i];
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]); atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]); atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
} }
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
}
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(rf[l+1].f[1])*lam_2.v[i];
for (int j=0; j<njobs; ++j)
for (int i=0; i<nvec; ++i)
{
atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
}
l+=2; l+=2;
} }
if (l==lmax) if (l==lmax)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]); atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]); atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
} }
} }
static void Z(calc_alm2map) (const Tb cth, const Tb sth, NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2 NJ1) Y(Tbri) * restrict p2 NJ1)
{ {
int l,lmax=gen->lmax; int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec; job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) return; if (l>lmax) return;
@ -219,12 +215,12 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
} }
static void Z(calc_map2alm) (const Tb cth, const Tb sth, NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2 NJ1) const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
{ {
int lmax=gen->lmax; int lmax=gen->lmax;
Tb lam_1,lam_2,scale; Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
int l=gen->m; int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec; job->opcnt += (l-gen->m) * 4*VLEN*nvec;
@ -234,40 +230,31 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac; Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale); int full_ieee = Y(TballGe)(scale,sharp_minscale);
while (!full_ieee) while (!full_ieee)
{ {
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_2.v[i],corfac.v[i]); Tv tmp=lam_2.v[i]*corfac.v[i];
vfmaeq(tre,tmp,p1[j].r.v[i]); atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
vfmaeq(tim,tmp,p1[j].i.v[i]); atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) return; if (++l>lmax) return;
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
- vload(rf[l-1].f[1])*lam_1.v[i];
for (int j=0; j<njobs; ++j) for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv tmp=vmul(lam_1.v[i],corfac.v[i]); Tv tmp=lam_1.v[i]*corfac.v[i];
vfmaeq(tre,tmp,p2[j].r.v[i]); atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
vfmaeq(tim,tmp,p2[j].i.v[i]); atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
} }
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) return; if (++l>lmax) return;
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(rf[l-1].f[1])*lam_2.v[i];
if (Y(rescale)(&lam_1,&lam_2,&scale)) if (Y(rescale)(&lam_1,&lam_2,&scale))
{ {
Y(getCorfac)(scale,&corfac,gen->cf); Y(getCorfac)(scale,&corfac,gen->cf);
@ -276,7 +263,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
} }
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
} }
static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
@ -317,8 +304,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1])); acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lw1=vadd(r2p.v[i],r2m.v[i]); Tv lw1=r2p.v[i]+r2m.v[i];
Tv lx2=vsub(r1m.v[i],r1p.v[i]); Tv lx2=r1m.v[i]-r1p.v[i];
vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2); vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2); vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2); vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
@ -326,8 +313,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
} }
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
Tv lx1=vsub(r2m.v[i],r2p.v[i]); Tv lx1=r2m.v[i]-r2p.v[i];
Tv lw2=vadd(r1p.v[i],r1m.v[i]); Tv lw2=r1p.v[i]+r1m.v[i];
vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1); vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1); vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1); vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
@ -359,11 +346,11 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
vfmaeq(acr,py[j].qi.v[i],lx); vfmaeq(acr,py[j].qi.v[i],lx);
vfmseq(aci,py[j].qr.v[i],lx); vfmseq(aci,py[j].qr.v[i],lx);
} }
vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
} }
} }
static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax NJ1) int lmax NJ1)
@ -374,10 +361,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
fx2=vload(fx[l+1].f[2]); fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])), rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
vmul(fx2,rec1p.v[i])); rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
} }
Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
&alm[2*njobs*(l+1)] NJ2); &alm[2*njobs*(l+1)] NJ2);
@ -385,10 +370,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
fx2=vload(fx[l+2].f[2]); fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i) for (int i=0; i<nvec; ++i)
{ {
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])), rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
vmul(fx2,rec2p.v[i])); rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
} }
l+=2; l+=2;
} }
@ -396,7 +379,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2); Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
} }
static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
NJ1) NJ1)
@ -429,7 +412,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2); Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
} }
static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1) Y(Tbqu) * restrict p2 NJ1)
{ {
@ -475,7 +458,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
lmax NJ2); lmax NJ2);
} }
static void Z(calc_map2alm_spin) (Tb cth, Tb sth, NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
const sharp_Ylmgen_C * restrict gen, sharp_job *job, const sharp_Ylmgen_C * restrict gen, sharp_job *job,
const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1) const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
{ {
@ -539,7 +522,7 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
} }
} }
static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax NJ1) int lmax NJ1)
@ -572,7 +555,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2); Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
} }
static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2 NJ1) Y(Tbqu) * restrict p2 NJ1)
{ {
@ -621,7 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Z(inner_loop) (sharp_job *job, const int *ispair, NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
{ {
@ -722,10 +705,30 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
} }
break; break;
} }
default:
{
UTIL_FAIL("must not happen");
break;
}
}
}
NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{
case SHARP_MAP2ALM: case SHARP_MAP2ALM:
{ {
if (job->spin==0) if (job->spin==0)
{ {
Tv atmp[2*njobs*(gen->lmax+1)];
memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
for (int ith=0; ith<ulim-llim; ith+=nval) for (int ith=0; ith<ulim-llim; ith+=nval)
{ {
Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2); Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
@ -751,8 +754,15 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
} }
} }
if (!skip) if (!skip)
Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2); Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
} }
{
int istart=m*njobs, istop=(gen->lmax+1)*njobs;
for(; istart<istop-2; istart+=2)
vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
for(; istart<istop; istart++)
job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
}
} }
else else
{ {
@ -800,4 +810,13 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
} }
} }
static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
{
(job->type==SHARP_MAP2ALM) ?
Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
}
#undef VZERO #undef VZERO

View file

@ -25,13 +25,14 @@
/*! \file sharp_cxx.h /*! \file sharp_cxx.h
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2012-2015 Max-Planck-Society * Copyright (C) 2012-2016 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
#ifndef PLANCK_SHARP_CXX_H #ifndef PLANCK_SHARP_CXX_H
#define PLANCK_SHARP_CXX_H #define PLANCK_SHARP_CXX_H
#include <complex>
#include "sharp_lowlevel.h" #include "sharp_lowlevel.h"
#include "sharp_geomhelpers.h" #include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h" #include "sharp_almhelpers.h"
@ -107,19 +108,30 @@ template<typename T> class sharp_cxxjob: public sharp_base
private: private:
static void *conv (T *ptr) static void *conv (T *ptr)
{ return reinterpret_cast<void *>(ptr); } { return reinterpret_cast<void *>(ptr); }
static void *conv (std::complex<T> *ptr)
{ return reinterpret_cast<void *>(ptr); }
static void *conv (const T *ptr) static void *conv (const T *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); } { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
static void *conv (const std::complex<T> *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
public: public:
void alm2map (const T *alm, T *map, bool add) void alm2map (const T *alm, T *map, bool add) const
{ {
void *aptr=conv(alm), *mptr=conv(map); void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1, sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
flags,0,0); flags,0,0);
} }
void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2, void alm2map (const std::complex<T> *alm, T *map, bool add) const
int spin, bool add) {
void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
flags,0,0);
}
void alm2map_spin (const T *alm1, const T *alm2,
T *map1, T *map2, int spin, bool add) const
{ {
void *aptr[2], *mptr[2]; void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2); aptr[0]=conv(alm1); aptr[1]=conv(alm2);
@ -127,21 +139,65 @@ template<typename T> class sharp_cxxjob: public sharp_base
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0); sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
} }
void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
T *map1, T *map2, int spin, bool add) const
{
void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
}
void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
{ {
void *aptr=conv(alm), *mptr[2]; void *aptr=conv(alm), *mptr[2];
mptr[0]=conv(map1); mptr[1]=conv(map2); mptr[0]=conv(map1); mptr[1]=conv(map2);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0); sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
} }
void map2alm (const T *map, T *alm, bool add) void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
const
{
void *aptr=conv(alm), *mptr[2];
mptr[0]=conv(map1); mptr[1]=conv(map2);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
}
void alm2map_adjoint (const T *map, T *alm, bool add) const
{
void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
}
void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
{
void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
}
void map2alm (const T *map, T *alm, bool add) const
{
void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
}
void map2alm (const T *map, std::complex<T> *alm, bool add) const
{ {
void *aptr=conv(alm), *mptr=conv(map); void *aptr=conv(alm), *mptr=conv(map);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0); sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
} }
void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2, void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
int spin, bool add) int spin, bool add) const
{
void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2);
int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
}
void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
std::complex<T> *alm2, int spin, bool add) const
{ {
void *aptr[2], *mptr[2]; void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2); aptr[0]=conv(alm1); aptr[1]=conv(alm2);

View file

@ -25,9 +25,8 @@
/*! \file sharp_geomhelpers.c /*! \file sharp_geomhelpers.c
* Spherical transform library * Spherical transform library
* *
* Copyright (C) 2006-2012 Max-Planck-Society<br> * Copyright (C) 2006-2018 Max-Planck-Society<br>
* Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl) * \author Martin Reinecke
* \author Martin Reinecke \author Pavel Holoborodko
*/ */
#include <math.h> #include <math.h>
@ -35,7 +34,6 @@
#include "sharp_legendre_roots.h" #include "sharp_legendre_roots.h"
#include "c_utils.h" #include "c_utils.h"
#include "pocketfft/pocketfft.h" #include "pocketfft/pocketfft.h"
#include <stdio.h>
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info) const int *rings, const double *weight, sharp_geom_info **geom_info)

View file

@ -23,7 +23,7 @@
*/ */
/* \file sharp_testsuite.c /* \file sharp_testsuite.c
* *
* Copyright (C) 2012-2013 Max-Planck-Society * Copyright (C) 2012-2013 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -50,9 +50,9 @@ typedef complex double dcmplx;
int ntasks, mytask; int ntasks, mytask;
static double drand (double min, double max, int *state) static double drand (double min, double max, unsigned *state)
{ {
*state = (((*state) * 1103515245) + 12345) & 0x7fffffff; *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu;
return min + (max-min)*(*state)/(0x7fffffff+1.0); return min + (max-min)*(*state)/(0x7fffffff+1.0);
} }
@ -65,7 +65,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
for (mi=0;mi<helper->nm; ++mi) for (mi=0;mi<helper->nm; ++mi)
{ {
int m=helper->mval[mi]; int m=helper->mval[mi];
int state=1234567*cnt+8912*m; // random seed unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed
for (int l=m;l<=helper->lmax; ++l) for (int l=m;l<=helper->lmax; ++l)
{ {
if ((l<spin)&&(m<spin)) if ((l<spin)&&(m<spin))

View file

@ -25,7 +25,7 @@
/* \file sharp_vecsupport.h /* \file sharp_vecsupport.h
* Convenience functions for vector arithmetics * Convenience functions for vector arithmetics
* *
* Copyright (C) 2012,2013 Max-Planck-Society * Copyright (C) 2012-2016 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -72,6 +72,7 @@ typedef int Tm;
#define vge(a,b) ((a)>=(b)) #define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b)) #define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b)) #define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
#define vstoreu(p, a) (*(p)=a) #define vstoreu(p, a) (*(p)=a)
#define vstoreu_s(p, a) (*(p)=a) #define vstoreu_s(p, a) (*(p)=a)
@ -138,6 +139,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vge(a,b) _mm_cmpge_pd(a,b) #define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b) #define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b) #define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b) #define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b); #define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0) #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
@ -183,6 +185,13 @@ typedef __m256d Tm;
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else #else
#if (USE_FMA)
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
#else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c)) #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
@ -191,6 +200,7 @@ typedef __m256d Tm;
#define vfmaseq(a,b,c,d,e) \ #define vfmaseq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#endif #endif
#endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a) #define vload(a) _mm256_set1_pd(a)
#define vload_s(a) _mm256_set1_ps(a) #define vload_s(a) _mm256_set1_ps(a)
@ -201,6 +211,7 @@ typedef __m256d Tm;
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b) #define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b) #define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b) #define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0) #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
@ -242,6 +253,7 @@ typedef __mmask8 Tm;
#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b) #define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
#define vne(a,b) _mm512_cmpneq_pd_mask(a,b) #define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
#define vand_mask(a,b) ((a)&(b)) #define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b) #define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b) #define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0) #define vanyTrue(a) (a!=0)

View file

@ -25,7 +25,7 @@
/* /*
* Helper code for efficient calculation of Y_lm(theta,phi=0) * Helper code for efficient calculation of Y_lm(theta,phi=0)
* *
* Copyright (C) 2005-2014 Max-Planck-Society * Copyright (C) 2005-2016 Max-Planck-Society
* Author: Martin Reinecke * Author: Martin Reinecke
*/ */
@ -59,6 +59,12 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
gen->cf[m]=gen->cf[m+1]*sharp_fsmall; gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m) for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
gen->cf[m]=gen->cf[m-1]*sharp_fbig; gen->cf[m]=gen->cf[m-1]*sharp_fbig;
gen->powlimit=RALLOC(double,m_max+spin+1);
gen->powlimit[0]=0.;
const double ln2 = 0.6931471805599453094172321214581766;
const double expo=-400*ln2;
for (int m=1; m<=m_max+spin; ++m)
gen->powlimit[m]=exp(expo/m);
gen->m = -1; gen->m = -1;
if (spin==0) if (spin==0)
@ -124,6 +130,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen) void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
{ {
DEALLOC(gen->cf); DEALLOC(gen->cf);
DEALLOC(gen->powlimit);
if (gen->s==0) if (gen->s==0)
{ {
DEALLOC(gen->rf); DEALLOC(gen->rf);

View file

@ -25,7 +25,7 @@
/*! \file sharp_ylmgen_c.h /*! \file sharp_ylmgen_c.h
* Code for efficient calculation of Y_lm(phi=0,theta) * Code for efficient calculation of Y_lm(phi=0,theta)
* *
* Copyright (C) 2005-2012 Max-Planck-Society * Copyright (C) 2005-2016 Max-Planck-Society
* \author Martin Reinecke * \author Martin Reinecke
*/ */
@ -49,6 +49,7 @@ typedef struct
/* for public use; immutable during lifetime */ /* for public use; immutable during lifetime */
int lmax, mmax, s; int lmax, mmax, s;
double *cf; double *cf;
double *powlimit;
/* for public use; will typically change after call to Ylmgen_prepare() */ /* for public use; will typically change after call to Ylmgen_prepare() */
int m; int m;