Fixed error message. Imported libsharp

This commit is contained in:
Guilhem Lavaux 2012-11-10 08:59:10 -05:00
parent 3aa898e636
commit bddd26a5ca
65 changed files with 18489 additions and 1 deletions

94
external/sharp/libsharp/libsharp.dox vendored Normal file
View file

@ -0,0 +1,94 @@
/*! \mainpage libsharp documentation
<ul>
<li>\ref introduction "Introduction"
<li><a href="modules.html">Programming interface</a>
</ul>
*/
/*! \page introduction Introduction to libsharp
"SHARP" is an acronym for <i>Performant Spherical Harmonic Transforms</i>.
All user-visible data types and functions in this library start with
the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and
double precision variants, respectively.
<i>libsharp</i>'s main functionality is the conversion between <i>maps</i>
on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>).
A map is defined as a set of <i>rings</i>, which in turn consist of
individual pixels that
<ul>
<li>all have the same colatitude and</li>
<li>are uniformly spaced in azimuthal direction.</li>
</ul>
Consequently, a ring is completely defined by
<ul>
<li>its colatitute (in radians)</li>
<li>the number of pixels it contains</li>
<li>the azimuth (in radians) of the first pixel in the ring</li>
<li>the weight that must be multiplied to every pixel during a map
analysis (typically the solid angle of a pixel in the ring) </li>
<li>the offset of the first ring pixel in the <i>map array</i></li>
<li>the stride between consecutive pixels in the ring.</li>
</ul>
The map array is a one-dimensional array of type <i>float</i> or
<i>double</i>, which contains the values of all map pixels. It is assumed
that the pixels of every ring are stored inside this array in order of
increasing azimuth and with the specified stride. Note however that the rings
themselves can be stored in any order inside the array.
The a_lm array is a one-dimensional array of type <i>complex float</i> or
<i>complex double</i>, which contains all spherical harmonic coefficients
for a full or partial set of m quantum numbers with 0<=m<=mmax and m<=l<=lmax.
There is only one constraint on the internal structure of the array, which is:
<code>Index[a_l+1,m] = Index[a_l,m] + stride</code>
That means that coefficients with identical <i>m</i> but different <i>l</i>
can be interpreted as a one-dimensional array in <i>l</i> with a unique
stride.
Several functions are provided for efficient index computation in this array;
they are documented \ref almgroup "here".
Information about a pixelisation of the sphere is stored in objects of
type sharp_geom_info. It is possible to create such an object for any
supported pixelisation by using the function sharp_make_geometry_info();
however, several easier-to-use functions are \ref geominfogroup "supplied"
for generating often-used pixelisations like ECP grids, Gaussian grids,
and Healpix grids.
Currently, SHARP supports the following kinds of transforms:
<ul>
<li>scalar a_lm to map</li>
<li>scalar map to a_lm</li>
<!-- <li>polarised a_lm to map</li>
<li>polarised map to a_lm</li> !-->
<li>spin a_lm to map</li>
<li>spin map to a_lm</li>
<li>scalar a_lm to maps of first derivatives</li>
</ul>
SHARP supports shared-memory parallelisation via OpenMP; this feature will
be automatically enabled if the compiler supports it.
SHARP will also make use of SSE2 and AVX instructions when compiled for a
platform known to support them.
Support for MPI-parallel transforms is also available; in this mode,
every MPI task must provide a unique subset of the map and a_lm coefficients.
The spherical harmonic transforms can be executed on double-precision and
single-precision maps and a_lm, but for accuracy reasons the computations
will always be performed in double precision. As a consequence,
single-precision transforms will most likely not be faster than their
double-precision counterparts, but they will require significantly less
memory.
Two example and benchmark programs are distributed with SHARP:
<ul>
<li>sharp_test.c checks the accuracy of the (iterative) map analysis
algorithm</li>
<li>sharp_bench.c determines the quickest transform strategy for a given
SHT</li>
</ul>
*/

29
external/sharp/libsharp/planck.make vendored Normal file
View file

@ -0,0 +1,29 @@
PKG:=libsharp
SD:=$(SRCROOT)/$(PKG)
OD:=$(BLDROOT)/$(PKG)
FULL_INCLUDE+= -I$(SD)
HDR_$(PKG):=$(SD)/*.h
LIB_$(PKG):=$(LIBDIR)/libsharp.a
BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench sharp_bench2
LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o sharp_bench2.o
LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
$(LIB_$(PKG)): $(LIBOBJ)
$(ALLOBJ): $(ODEP) | $(OD)_mkdir
BIN:=$(BIN:%=$(BINDIR)/%)
$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP)
all_hdr+=$(HDR_$(PKG))
all_lib+=$(LIB_$(PKG))
all_cbin+=$(BIN)

669
external/sharp/libsharp/sharp.c vendored Normal file
View file

@ -0,0 +1,669 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp.c
* Spherical transform library
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#include <math.h>
#include "ls_fft.h"
#include "sharp_ylmgen_c.h"
#include "sharp_internal.h"
#include "c_utils.h"
#include "sharp_core.h"
#include "sharp_vecutil.h"
#include "walltime_c.h"
#include "sharp_almhelpers.h"
#include "sharp_geomhelpers.h"
typedef complex double dcmplx;
typedef complex float fcmplx;
static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
{
static const int chunksize_min=500, nchunks_max=10;
*chunksize = IMAX(chunksize_min,(ndata+nchunks_max-1)/nchunks_max);
*chunksize = ((*chunksize+nmult-1)/nmult)*nmult;
*nchunks = (ndata+*chunksize-1) / *chunksize;
}
typedef struct
{
double s;
int i;
} idxhelper;
static int idx_compare (const void *xa, const void *xb)
{
const idxhelper *a=xa, *b=xb;
return (a->s > b->s) ? -1 : (a->s < b->s) ? 1 : 0;
}
typedef struct
{
double phi0_;
dcmplx *shiftarr, *work;
int s_shift, s_work;
real_plan plan;
int norot;
} ringhelper;
static void ringhelper_init (ringhelper *self)
{
static ringhelper rh_null = { 0, NULL, NULL, 0, 0, NULL, 0 };
*self = rh_null;
}
static void ringhelper_destroy (ringhelper *self)
{
if (self->plan) kill_real_plan(self->plan);
DEALLOC(self->shiftarr);
DEALLOC(self->work);
ringhelper_init(self);
}
static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{
self->norot = (fabs(phi0)<1e-14);
if (!(self->norot))
if ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12)))
{
RESIZE (self->shiftarr,dcmplx,mmax+1);
self->s_shift = mmax+1;
self->phi0_ = phi0;
for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
}
if (!self->plan) self->plan=make_real_plan(nph);
if (nph!=(int)self->plan->length)
{
kill_real_plan(self->plan);
self->plan=make_real_plan(nph);
}
GROW(self->work,dcmplx,self->s_work,nph);
}
static int ringinfo_compare (const void *xa, const void *xb)
{
const sharp_ringinfo *a=xa, *b=xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
}
static int ringpair_compare (const void *xa, const void *xb)
{
const sharp_ringpair *a=xa, *b=xb;
if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 :
((a->r1.phi0 > b->r1.phi0) ? 1 :
(a->r1.cth>b->r1.cth ? -1 : 1));
return (a->r1.nph<b->r1.nph) ? -1 : 1;
}
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
const ptrdiff_t *mstart, sharp_alm_info **alm_info)
{
sharp_alm_info *info = RALLOC(sharp_alm_info,1);
info->lmax = lmax;
info->nm = nm;
info->mval = RALLOC(int,nm);
info->mvstart = RALLOC(ptrdiff_t,nm);
info->stride = stride;
for (int mi=0; mi<nm; ++mi)
{
info->mval[mi] = mval[mi];
info->mvstart[mi] = mstart[mi];
}
*alm_info = info;
}
void sharp_make_alm_info (int lmax, int mmax, int stride,
const ptrdiff_t *mstart, sharp_alm_info **alm_info)
{
int *mval=RALLOC(int,mmax+1);
for (int i=0; i<=mmax; ++i)
mval[i]=i;
sharp_make_general_alm_info (lmax, mmax+1, stride, mval, mstart, alm_info);
DEALLOC(mval);
}
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi)
{ return self->mvstart[mi]+self->stride*l; }
void sharp_destroy_alm_info (sharp_alm_info *info)
{
DEALLOC (info->mval);
DEALLOC (info->mvstart);
DEALLOC (info);
}
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta,
const double *weight, sharp_geom_info **geom_info)
{
sharp_geom_info *info = RALLOC(sharp_geom_info,1);
sharp_ringinfo *infos = RALLOC(sharp_ringinfo,nrings);
int pos=0;
info->pair=RALLOC(sharp_ringpair,nrings);
info->npairs=0;
*geom_info = info;
for (int m=0; m<nrings; ++m)
{
infos[m].theta = theta[m];
infos[m].cth = cos(theta[m]);
infos[m].sth = sin(theta[m]);
infos[m].weight = weight[m];
infos[m].phi0 = phi0[m];
infos[m].ofs = ofs[m];
infos[m].stride = stride[m];
infos[m].nph = nph[m];
}
qsort(infos,nrings,sizeof(sharp_ringinfo),ringinfo_compare);
while (pos<nrings)
{
info->pair[info->npairs].r1=infos[pos];
if ((pos<nrings-1) && FAPPROX(infos[pos].cth,-infos[pos+1].cth,1e-12))
{
if (infos[pos].cth>0) // make sure northern ring is in r1
info->pair[info->npairs].r2=infos[pos+1];
else
{
info->pair[info->npairs].r1=infos[pos+1];
info->pair[info->npairs].r2=infos[pos];
}
++pos;
}
else
info->pair[info->npairs].r2.nph=-1;
++pos;
++info->npairs;
}
DEALLOC(infos);
qsort(info->pair,info->npairs,sizeof(sharp_ringpair),ringpair_compare);
}
void sharp_destroy_geom_info (sharp_geom_info *geom_info)
{
DEALLOC (geom_info->pair);
DEALLOC (geom_info);
}
static int sharp_get_mmax (int *mval, int nm)
{
int *mcheck=RALLOC(int,nm);
SET_ARRAY(mcheck,0,nm,0);
for (int i=0; i<nm; ++i)
{
int m_cur=mval[i];
UTIL_ASSERT((m_cur>=0) && (m_cur<nm), "m out of range");
UTIL_ASSERT(mcheck[m_cur]==0, "duplicate m value");
mcheck[m_cur]=1;
}
DEALLOC(mcheck);
return nm-1; // FIXME: this looks wrong
}
static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, void *data, int mmax, const dcmplx *phase,
int pstride, sharp_fde fde)
{
int nph = info->nph;
int stride = info->stride;
ringhelper_update (self, nph, mmax, info->phi0);
self->work[0]=phase[0];
SET_ARRAY(self->work,1,nph,0.);
#if 0
if (self->norot)
for (int m=1; m<=mmax; ++m)
{
int idx1 = m%nph;
int idx2 = nph-1-((m-1)%nph);
self->work[idx1]+=phase[m*pstride];
self->work[idx2]+=conj(phase[m*pstride]);
}
else
for (int m=1; m<=mmax; ++m)
{
int idx1 = m%nph;
int idx2 = nph-1-((m-1)%nph);
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
self->work[idx1]+=tmp;
self->work[idx2]+=conj(tmp);
}
#else
int idx1=1, idx2=nph-1;
for (int m=1; m<=mmax; ++m)
{
dcmplx tmp = phase[m*pstride];
if(!self->norot) tmp*=self->shiftarr[m];
self->work[idx1]+=tmp;
self->work[idx2]+=conj(tmp);
if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1;
}
#endif
real_plan_backward_c (self->plan, (double *)(self->work));
if (fde==DOUBLE)
for (int m=0; m<nph; ++m)
((double *)data)[m*stride+info->ofs] += creal(self->work[m]);
else
for (int m=0; m<nph; ++m)
((float *)data)[m*stride+info->ofs] += (float)creal(self->work[m]);
}
static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, const void *data, int mmax, dcmplx *phase,
int pstride, sharp_fde fde)
{
int nph = info->nph;
#if 1
int maxidx = mmax; /* Enable this for traditional Healpix compatibility */
#else
int maxidx = IMIN(nph-1,mmax);
#endif
ringhelper_update (self, nph, mmax, -info->phi0);
if (fde==DOUBLE)
for (int m=0; m<nph; ++m)
self->work[m] = ((double *)data)[info->ofs+m*info->stride]*info->weight;
else
for (int m=0; m<nph; ++m)
self->work[m] = ((float *)data)[info->ofs+m*info->stride]*info->weight;
real_plan_forward_c (self->plan, (double *)self->work);
if (self->norot)
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = self->work[m%nph];
else
for (int m=0; m<=maxidx; ++m)
phase[m*pstride]=self->work[m%nph]*self->shiftarr[m];
for (int m=maxidx+1;m<=mmax; ++m)
phase[m*pstride]=0.;
}
static void ringhelper_pair2phase (ringhelper *self, int mmax,
const sharp_ringpair *pair, const void *data, dcmplx *phase1, dcmplx *phase2,
int pstride, sharp_fde fde)
{
ringhelper_ring2phase (self, &(pair->r1), data, mmax, phase1, pstride, fde);
if (pair->r2.nph>0)
ringhelper_ring2phase (self, &(pair->r2), data, mmax, phase2, pstride, fde);
}
static void ringhelper_phase2pair (ringhelper *self, int mmax,
const dcmplx *phase1, const dcmplx *phase2, int pstride,
const sharp_ringpair *pair, void *data, sharp_fde fde)
{
ringhelper_phase2ring (self, &(pair->r1), data, mmax, phase1, pstride, fde);
if (pair->r2.nph>0)
ringhelper_phase2ring (self, &(pair->r2), data, mmax, phase2, pstride, fde);
}
static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
sharp_fde fde)
{
for (int j=0;j<ginfo->npairs;++j)
{
if (fde==DOUBLE)
{
for (int i=0;i<ginfo->pair[j].r1.nph;++i)
((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=value;
for (int i=0;i<ginfo->pair[j].r2.nph;++i)
((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=value;
}
else
{
for (int i=0;i<ginfo->pair[j].r1.nph;++i)
((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
=(float)value;
for (int i=0;i<ginfo->pair[j].r2.nph;++i)
((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
=(float)value;
}
}
}
static void fill_alm (const sharp_alm_info *ainfo, void *alm, dcmplx value,
sharp_fde fde)
{
if (fde==DOUBLE)
for (int mi=0;mi<ainfo->nm;++mi)
for (int l=ainfo->mval[mi];l<=ainfo->lmax;++l)
((dcmplx *)alm)[sharp_alm_index(ainfo,l,mi)] = value;
else
for (int mi=0;mi<ainfo->nm;++mi)
for (int l=ainfo->mval[mi];l<=ainfo->lmax;++l)
((fcmplx *)alm)[sharp_alm_index(ainfo,l,mi)] = (fcmplx)value;
}
static void init_output (sharp_job *job)
{
if (job->add_output) return;
if (job->type == SHARP_MAP2ALM)
for (int i=0; i<job->ntrans*job->nalm; ++i)
fill_alm (job->ainfo,job->alm[i],0.,job->fde);
else
for (int i=0; i<job->ntrans*job->nmaps; ++i)
fill_map (job->ginfo,job->map[i],0.,job->fde);
}
static void alloc_phase (sharp_job *job, int nm, int ntheta)
{ job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); }
static void dealloc_phase (sharp_job *job)
{ DEALLOC(job->phase); }
//FIXME: set phase to zero if not SHARP_MAP2ALM?
static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type != SHARP_MAP2ALM) return;
int pstride = 2*job->ntrans*job->nmaps;
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{
int dim2 = pstride*(ith-llim)*(mmax+1);
for (int i=0; i<job->ntrans*job->nmaps; ++i)
ringhelper_pair2phase(&helper,mmax,&job->ginfo->pair[ith], job->map[i],
&job->phase[dim2+2*i], &job->phase[dim2+2*i+1], pstride, job->fde);
}
ringhelper_destroy(&helper);
} /* end of parallel region */
}
static void alloc_almtmp (sharp_job *job, int lmax)
{ job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); }
static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); }
static void alm2almtmp (sharp_job *job, int lmax, int mi)
{
if (job->type!=SHARP_MAP2ALM)
for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)
{
ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
double fct = job->norm_l[l];
for (int i=0; i<job->ntrans*job->nalm; ++i)
if (job->fde==DOUBLE)
job->almtmp[job->ntrans*job->nalm*l+i]
= ((dcmplx *)job->alm[i])[aidx]*fct;
else
job->almtmp[job->ntrans*job->nalm*l+i]
= ((fcmplx *)job->alm[i])[aidx]*fct;
}
else
SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi],
job->ntrans*job->nalm*(lmax+1),0.);
}
static void almtmp2alm (sharp_job *job, int lmax, int mi)
{
if (job->type != SHARP_MAP2ALM) return;
for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)
{
ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
for (int i=0;i<job->ntrans*job->nalm;++i)
if (job->fde==DOUBLE)
((dcmplx *)job->alm[i])[aidx] +=
job->almtmp[job->ntrans*job->nalm*l+i]*job->norm_l[l];
else
((fcmplx *)job->alm[i])[aidx] +=
(fcmplx)(job->almtmp[job->ntrans*job->nalm*l+i]*job->norm_l[l]);
}
}
static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type == SHARP_MAP2ALM) return;
int pstride = 2*job->ntrans*job->nmaps;
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
{
int dim2 = pstride*(ith-llim)*(mmax+1);
for (int i=0; i<job->ntrans*job->nmaps; ++i)
ringhelper_phase2pair(&helper,mmax,&job->phase[dim2+2*i],
&job->phase[dim2+2*i+1],pstride,&job->ginfo->pair[ith],job->map[i],
job->fde);
}
ringhelper_destroy(&helper);
} /* end of parallel region */
}
static void sharp_execute_job (sharp_job *job)
{
double timer=wallTime();
job->opcnt=0;
int lmax = job->ainfo->lmax,
mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
job->norm_l = (job->type==SHARP_ALM2MAP_DERIV1) ?
sharp_Ylmgen_get_d1norm (lmax) :
sharp_Ylmgen_get_norm (lmax, job->spin);
/* clear output arrays if requested */
init_output (job);
int nchunks, chunksize;
get_chunk_info(job->ginfo->npairs,job->nv*VLEN,&nchunks,&chunksize);
alloc_phase (job,mmax+1,chunksize);
/* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk)
{
int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs);
int *ispair = RALLOC(int,ulim-llim);
double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim);
idxhelper *stmp = RALLOC(idxhelper,ulim-llim);
for (int i=0; i<ulim-llim; ++i)
{
ispair[i] = job->ginfo->pair[i+llim].r2.nph>0;
cth[i] = job->ginfo->pair[i+llim].r1.cth;
sth[i] = job->ginfo->pair[i+llim].r1.sth;
stmp[i].s=sth[i];
stmp[i].i=i;
}
qsort (stmp,ulim-llim,sizeof(idxhelper),idx_compare);
int *idx = RALLOC(int,ulim-llim);
for (int i=0; i<ulim-llim; ++i)
idx[i]=stmp[i].i;
DEALLOC(stmp);
/* map->phase where necessary */
map2phase (job, mmax, llim, ulim);
#pragma omp parallel
{
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, idx);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
/* phase->map where necessary */
phase2map (job, mmax, llim, ulim);
DEALLOC(ispair);
DEALLOC(cth);
DEALLOC(sth);
DEALLOC(idx);
} /* end of chunk loop */
DEALLOC(job->norm_l);
dealloc_phase (job);
job->time=wallTime()-timer;
}
static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
int spin, int add_output, void *alm, void *map,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
int dp, int nv)
{
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
if (type==SHARP_ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
job->type = type;
job->spin = spin;
job->norm_l = NULL;
job->add_output = add_output;
job->nmaps = (type==SHARP_ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1);
job->nalm = (type==SHARP_ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 2 : 1);
job->ginfo = geom_info;
job->ainfo = alm_info;
job->nv = (nv==0) ? sharp_nv_oracle (type, spin, ntrans) : nv;
job->time = 0.;
job->opcnt = 0;
job->ntrans = ntrans;
job->alm=alm;
job->map=map;
job->fde=dp ? DOUBLE : FLOAT;
}
void sharp_execute (sharp_jobtype type, int spin, int add_output, void *alm,
void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
int ntrans, int dp, int nv, double *time, unsigned long long *opcnt)
{
sharp_job job;
sharp_build_job_common (&job, type, spin, add_output, alm, map, geom_info,
alm_info, ntrans, dp, nv);
sharp_execute_job (&job);
if (time!=NULL) *time = job.time;
if (opcnt!=NULL) *opcnt = job.opcnt;
}
int sharp_get_nv_max (void)
{ return 6; }
static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
{
int lmax=127;
int mmax=(lmax+1)/2;
int nrings=(lmax+1)/4;
int ppring=1;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_geom_info *tinfo;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
SET_ARRAY(map[0],0,npix*ncomp,0.);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
SET_ARRAY(alm[0],0,nalms*ncomp,0.);
double time=1e30;
int nvbest=-1;
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
{
double time_acc=0.;
double jtime;
int ntries=0;
do
{
sharp_execute(type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,&jtime,
NULL);
if (jtime<time) { time=jtime; nvbest=nv; }
time_acc+=jtime;
++ntries;
}
while ((time_acc<0.02)&&(ntries<2));
}
DEALLOC2D(map);
DEALLOC2D(alm);
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
return nvbest;
}
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
{
static const int maxtr = 6;
static int nv_opt[6][2][3] = {
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}} };
if (type==SHARP_ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
ntrans=IMIN(ntrans,maxtr);
if (nv_opt[ntrans-1][spin!=0][type]==0)
nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
return nv_opt[ntrans-1][spin!=0][type];
}
#ifdef USE_MPI
#include "sharp_mpi.c"
#endif

43
external/sharp/libsharp/sharp.h vendored Normal file
View file

@ -0,0 +1,43 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp.h
* Interface for the spherical transform library.
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_H
#define PLANCK_SHARP_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include "sharp_lowlevel.h"
#endif

267
external/sharp/libsharp/sharp_acctest.c vendored Normal file
View file

@ -0,0 +1,267 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_acctest.c
Systematic accuracy test for libsharp.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
static double drand (double min, double max)
{ return min + (max-min)*rand()/(RAND_MAX+1.0); }
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
{
for (int mi=0;mi<helper->nm; ++mi)
{
int m=helper->mval[mi];
for (int l=m;l<=helper->lmax; ++l)
{
if ((l<spin)&&(m<spin))
alm[sharp_alm_index(helper,l,mi)] = 0.;
else
{
double rv = drand(-1,1);
double iv = (m==0) ? 0 : drand(-1,1);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
}
}
}
}
static void measure_errors (dcmplx **alm, dcmplx **alm2,
ptrdiff_t nalms, int ncomp)
{
for (int i=0; i<ncomp; ++i)
{
double sum=0, sum2=0, maxdiff=0;
for (ptrdiff_t m=0; m<nalms; ++m)
{
double x=creal(alm[i][m])-creal(alm2[i][m]),
y=cimag(alm[i][m])-cimag(alm2[i][m]);
sum+=x*x+y*y;
sum2+=creal(alm[i][m])*creal(alm[i][m])+cimag(alm[i][m])*cimag(alm[i][m]);
if (fabs(x)>maxdiff) maxdiff=fabs(x);
if (fabs(y)>maxdiff) maxdiff=fabs(y);
}
sum=sqrt(sum/nalms);
sum2=sqrt(sum2/nalms);
UTIL_ASSERT((maxdiff<1e-10)&&(sum/sum2<1e-10),"error");
}
}
static void check_sign_scale(void)
{
int lmax=50;
int mmax=lmax;
sharp_geom_info *tinfo;
int nrings=lmax+1;
int ppring=2*lmax+2;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
/* flip theta to emulate the "old" Gaussian grid geometry */
for (int i=0; i<tinfo->npairs; ++i)
{
const double pi=3.141592653589793238462643383279502884197;
tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth;
tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth;
tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta;
tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta;
}
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
for (int ntrans=1; ntrans<10; ++ntrans)
{
double **map;
ALLOC2D(map,double,2*ntrans,npix);
dcmplx **alm;
ALLOC2D(alm,dcmplx,2*ntrans,nalms);
for (int i=0; i<2*ntrans; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
sharp_execute(SHARP_ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[it][0 ], 3.588246976618616912e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ], 2.750897760535633285e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2], 3.137704477368562905e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-8.405730859837063917e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-2.398026536095463346e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-1.398186224727334448e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.456676000884031197e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.516249174408820863e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-3.173406200299964119e+00,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
"error");
}
sharp_execute(SHARP_ALM2MAP_DERIV1,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,
0,NULL,NULL);
for (int it=0; it<ntrans; ++it)
{
UTIL_ASSERT(FAPPROX(map[2*it ][0 ],-6.859393905369091105e-01,1e-11),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix/2],-2.103947835973212364e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it ][npix-1],-1.092463246472086439e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][0 ],-1.411433220713928165e+02,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
"error");
UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
"error");
}
DEALLOC2D(map);
DEALLOC2D(alm);
}
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
}
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int nv)
{
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
srand(4);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
for (int i=0; i<ncomp; ++i)
random_alm(alm[i],alms,spin);
dcmplx **alm2;
ALLOC2D(alm2,dcmplx,ncomp,nalms);
sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,
NULL,NULL);
sharp_execute(SHARP_MAP2ALM,spin,0,&alm2[0],&map[0],tinfo,alms,ntrans,1,nv,
NULL,NULL);
measure_errors(alm,alm2,nalms,ncomp);
DEALLOC2D(map);
DEALLOC2D(alm);
DEALLOC2D(alm2);
sharp_destroy_alm_info(alms);
}
int main(void)
{
#ifdef USE_MPI
MPI_Init(NULL,NULL);
#endif
sharp_module_startup("sharp_acctest",1,1,"",1);
int lmax=127;
printf("Checking signs and scales.\n");
check_sign_scale();
printf("Passed.\n\n");
printf("Testing map analysis accuracy.\n");
sharp_geom_info *tinfo;
int nrings=lmax+1;
int ppring=2*lmax+2;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
for (int nv=1; nv<=6; ++nv)
for (int ntrans=1; ntrans<=6; ++ntrans)
{
check_accuracy(tinfo,lmax,lmax,npix,0,ntrans,nv);
check_accuracy(tinfo,lmax,lmax,npix,1,ntrans,nv);
check_accuracy(tinfo,lmax,lmax,npix,2,ntrans,nv);
check_accuracy(tinfo,lmax,lmax,npix,3,ntrans,nv);
check_accuracy(tinfo,lmax,lmax,npix,30,ntrans,nv);
}
sharp_destroy_geom_info(tinfo);
printf("Passed.\n\n");
#ifdef USE_MPI
MPI_Finalize();
#endif
return 0;
}

View file

@ -0,0 +1,68 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_almhelpers.c
* Spherical transform library
*
* Copyright (C) 2008-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#include "sharp_almhelpers.h"
#include "c_utils.h"
void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info)
{
sharp_alm_info *info = RALLOC(sharp_alm_info,1);
info->lmax = lmax;
info->nm = mmax+1;
info->mval = RALLOC(int,mmax+1);
info->mvstart = RALLOC(ptrdiff_t,mmax+1);
info->stride = stride;
int tval = 2*lmax+1;
for (ptrdiff_t m=0; m<=mmax; ++m)
{
info->mval[m] = m;
info->mvstart[m] = stride*((m*(tval-m))>>1);
}
*alm_info = info;
}
void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info)
{
sharp_alm_info *info = RALLOC(sharp_alm_info,1);
info->lmax = lmax;
info->nm = mmax+1;
info->mval = RALLOC(int,mmax+1);
info->mvstart = RALLOC(ptrdiff_t,mmax+1);
info->stride = stride;
for (ptrdiff_t m=0; m<=mmax; ++m)
{
info->mval[m] = m;
info->mvstart[m] = stride*m*(lmax+1);
}
*alm_info = info;
}

View file

@ -0,0 +1,57 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_almhelpers.h
* SHARP helper function for the creation of a_lm data structures
*
* Copyright (C) 2008-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_ALMHELPERS_H
#define PLANCK_SHARP_ALMHELPERS_H
#include "sharp_lowlevel.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Initialises an a_lm data structure according to the scheme used by
Healpix_cxx.
\ingroup almgroup */
void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info);
/*! Initialises an a_lm data structure according to the scheme used by
Fortran Healpix
\ingroup almgroup */
void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,98 @@
/*
* This file is part of libc_utils.
*
* libc_utils is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libc_utils is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libc_utils; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_announce.c
* Banner for module startup
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_MPI
#include <mpi.h>
#endif
#include "sharp_announce.h"
#include "sharp_vecutil.h"
static void OpenMP_status(void)
{
#ifndef _OPENMP
printf("OpenMP: not supported by this binary\n");
#else
int threads = omp_get_max_threads();
if (threads>1)
printf("OpenMP active: max. %d threads.\n",threads);
else
printf("OpenMP active, but running with 1 thread only.\n");
#endif
}
static void MPI_status(void)
{
#ifndef USE_MPI
printf("MPI: not supported by this binary\n");
#else
int tasks;
MPI_Comm_size(MPI_COMM_WORLD,&tasks);
if (tasks>1)
printf("MPI active with %d tasks.\n",tasks);
else
printf("MPI active, but running with 1 task only.\n");
#endif
}
static void vecmath_status(void)
{ printf("Supported vector length: %d\n",VLEN); }
void sharp_announce (const char *name)
{
size_t m, nlen=strlen(name);
printf("\n+-");
for (m=0; m<nlen; ++m) printf("-");
printf("-+\n");
printf("| %s |\n", name);
printf("+-");
for (m=0; m<nlen; ++m) printf("-");
printf("-+\n\n");
vecmath_status();
OpenMP_status();
MPI_status();
printf("\n");
}
void sharp_module_startup (const char *name, int argc, int argc_expected,
const char *argv_expected, int verbose)
{
if (verbose) sharp_announce (name);
if (argc==argc_expected) return;
if (verbose) fprintf(stderr, "Usage: %s %s\n", name, argv_expected);
exit(1);
}

View file

@ -0,0 +1,39 @@
/*
* This file is part of libc_utils.
*
* libc_utils is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libc_utils is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libc_utils; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_announce.h
* Banner for module startup
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef SHARP_ANNOUNCE_H
#define SHARP_ANNOUNCE_H
void sharp_announce (const char *name);
void sharp_module_startup (const char *name, int argc, int argc_expected,
const char *argv_expected, int verbose);
#endif

149
external/sharp/libsharp/sharp_bench.c vendored Normal file
View file

@ -0,0 +1,149 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_bench.c
Copyright (C) 2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
static void bench_sht (int spin, int nv, sharp_jobtype type,
int ntrans, double *time, unsigned long long *opcnt)
{
int lmax=2047;
int mmax=128;
int nrings=512;
int ppring=1024;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_geom_info *tinfo;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
SET_ARRAY(map[0],0,npix*ncomp,0.);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
SET_ARRAY(alm[0],0,nalms*ncomp,0.);
int nruns=0;
*time=1e30;
*opcnt=1000000000000000;
do
{
double jtime;
unsigned long long jopcnt;
sharp_execute(type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,&jtime,
&jopcnt);
if (jopcnt<*opcnt) *opcnt=jopcnt;
if (jtime<*time) *time=jtime;
}
while (++nruns < 4);
DEALLOC2D(map);
DEALLOC2D(alm);
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
}
int main(void)
{
#ifdef USE_MPI
MPI_Init(NULL,NULL);
#endif
sharp_module_startup("sharp_bench",1,1,"",1);
printf("Benchmarking SHTs.\n\n");
FILE *fp=fopen("sharp_oracle.inc","w");
UTIL_ASSERT(fp, "failed to open oracle file for writing");
fprintf(fp,"static const int maxtr = 6;\n");
fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
const char *shtname[]={"map2alm","alm2map","a2mder1"};
for (int ntr=1; ntr<=6; ++ntr)
{
fprintf(fp,"{");
for (int spin=0; spin<=2; spin+=2)
{
fprintf(fp,"{");
for (sharp_jobtype type=SHARP_MAP2ALM; type<=SHARP_ALM2MAP_DERIV1; ++type)
{
if ((type==SHARP_ALM2MAP_DERIV1) && (spin==0))
fprintf(fp,"-1");
else
{
int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
unsigned long long opmin=1000000000000000, op;
double tmin=1e30;
double *time=RALLOC(double,sharp_get_nv_max()+1);
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
{
bench_sht (spin,nv,type,ntr,&time[nv],&op);
if (op<opmin) opmin=op;
if (time[nv]<tmin)
{ tmin=time[nv]; nvbest=nv; }
}
printf("nt: %d %s spin: %d nv: %d time: %6.3f perf: %6.3f"
" dev[%d]: %6.2f%%\n",ntr,shtname[type],
spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
(time[nvoracle]-tmin)/tmin*100.);
DEALLOC(time);
fprintf(fp,"%d",nvbest);
}
if (type!=SHARP_ALM2MAP_DERIV1) fprintf(fp,",");
}
fprintf(fp,(spin==0)?"},":"}");
printf("\n");
}
fprintf(fp,(ntr<6)?"},\n":"}\n");
}
fprintf(fp,"};\n");
fclose(fp);
#ifdef USE_MPI
MPI_Finalize();
#endif
return 0;
}

223
external/sharp/libsharp/sharp_bench2.c vendored Normal file
View file

@ -0,0 +1,223 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_bench2.c
Copyright (C) 2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#if (defined(_OPENMP) && defined(USE_MPI))
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <mpi.h>
#include "sharp_mpi.h"
#include "sharp.h"
#include "sharp_vecutil.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
#include "memusage.h"
typedef complex double dcmplx;
int ntasks, mytask;
static unsigned long long totalops (unsigned long long val)
{
unsigned long long tmp;
MPI_Allreduce (&val, &tmp,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
return tmp;
}
static double maxTime (double val)
{
double tmp;
MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
return tmp;
}
static double totalMem (double val)
{
double tmp;
MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
return tmp;
}
static void reduce_alm_info(sharp_alm_info *ainfo)
{
int nmnew=0;
ptrdiff_t ofs = 0;
for (int i=mytask; i<ainfo->nm; i+=ntasks,++nmnew)
{
ainfo->mval[nmnew]=ainfo->mval[i];
ainfo->mvstart[nmnew]=ofs-ainfo->mval[nmnew];
ofs+=ainfo->lmax-ainfo->mval[nmnew]+1;
}
ainfo->nm=nmnew;
}
static void reduce_geom_info(sharp_geom_info *ginfo)
{
int npairsnew=0;
ptrdiff_t ofs = 0;
for (int i=mytask; i<ginfo->npairs; i+=ntasks,++npairsnew)
{
ginfo->pair[npairsnew]=ginfo->pair[i];
ginfo->pair[npairsnew].r1.ofs=ofs;
ofs+=ginfo->pair[npairsnew].r1.nph;
ginfo->pair[npairsnew].r2.ofs=ofs;
if (ginfo->pair[npairsnew].r2.nph>0) ofs+=ginfo->pair[npairsnew].r2.nph;
}
ginfo->npairs=npairsnew;
}
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
{
ptrdiff_t res=0;
for (int i=0; i<ainfo->nm; ++i)
res += ainfo->lmax-ainfo->mval[i]+1;
return res;
}
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
{
ptrdiff_t res=0;
for (int i=0; i<ginfo->npairs; ++i)
{
res += ginfo->pair[i].r1.nph;
if (ginfo->pair[i].r2.nph>0) res += ginfo->pair[i].r2.nph;
}
return res;
}
int main(int argc, char **argv)
{
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
int master=(mytask==0);
sharp_module_startup("sharp_bench2",argc,7,
"<healpix|ecp|gauss> <lmax> <nside|nphi> <a2m/m2a> <spin> <ntrans>",0);
int lmax=atoi(argv[2]);
sharp_jobtype jtype = (strcmp(argv[4],"a2m")==0) ?
SHARP_ALM2MAP : SHARP_MAP2ALM;
int spin=atoi(argv[5]);
int ntrans=atoi(argv[6]);
sharp_geom_info *tinfo;
ptrdiff_t npix=0;
int geom2=0;
if (strcmp(argv[1],"gauss")==0)
{
int nrings=geom2=lmax+1;
int ppring=atoi(argv[3]);
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
}
else if (strcmp(argv[1],"ecp")==0)
{
int nrings=geom2=2*lmax+2;
int ppring=atoi(argv[3]);
sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
}
else if (strcmp(argv[1],"healpix")==0)
{
int nside=atoi(argv[3]);
if (nside<1) nside=1;
geom2=4*nside-1;
sharp_make_healpix_geom_info (nside, 1, &tinfo);
}
else
UTIL_FAIL("unknown grid geometry");
reduce_geom_info(tinfo);
npix=get_npix(tinfo);
int mmax=lmax;
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
reduce_alm_info(alms);
ptrdiff_t nalms=get_nalms(alms);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
for (int n=0; n<ncomp; ++n)
{
for (int i=0; i<npix; ++i) map[n][i]=1;
for (int i=0; i<nalms; ++i) alm[n][i]=1;
}
double time=1e20;
unsigned long long opcnt=0;
for (int ntries=0; (ntries<2)||(ntries*time<5); ++ntries)
{
double ltime;
unsigned long long lopcnt;
sharp_execute_mpi(MPI_COMM_WORLD,jtype,spin,0,&alm[0],&map[0],
tinfo,alms,ntrans,1,0,&ltime,&lopcnt);
ltime=maxTime(ltime);
if (ltime<time) { time=ltime; opcnt=totalops(lopcnt); }
}
DEALLOC2D(map);
DEALLOC2D(alm);
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
double mHWM=totalMem(VmHWM());
int nomp=omp_get_max_threads();
if (master)
printf("%-12s %-7s %-3s %2d %d %2d %3d %5d %5d %1d %.2e %7.2f %9.2f\n",
getenv("HOST"),argv[1],argv[4],spin,VLEN,nomp,ntasks,lmax,geom2,ntrans,
time,opcnt/(time*1e9),mHWM/(1<<20));
MPI_Finalize();
return 0;
}
#else
#include "c_utils.h"
int main(void)
{ UTIL_FAIL("Need OpenMP and MPI"); return 1; }
#endif

View file

@ -0,0 +1,135 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/* \file sharp_complex_hacks.h
* support for converting vector types and complex numbers
*
* Copyright (C) 2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#ifndef SHARP_COMPLEX_HACKS_H
#define SHARP_COMPLEX_HACKS_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <math.h>
#include <complex.h>
#include "sharp_vecsupport.h"
#define UNSAFE_CODE
#if (VLEN==1)
static inline complex double vhsum_cmplx(Tv a, Tv b)
{ return a+_Complex_I*b; }
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
complex double * restrict c1, complex double * restrict c2)
{ *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
#endif
#if (VLEN==2)
static inline complex double vhsum_cmplx (Tv a, Tv b)
{
#if defined(__SSE3__)
Tv tmp = _mm_hadd_pd(a,b);
#else
Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
#endif
union {Tv v; complex double c; } u;
u.v=tmp; return u.c;
}
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
Tv d, complex double * restrict c1, complex double * restrict c2)
{
#ifdef UNSAFE_CODE
#if defined(__SSE3__)
vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
#else
vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
#endif
#else
union {Tv v; complex double c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
#endif
*c1+=u1.c; *c2+=u2.c;
#endif
}
#endif
#if (VLEN==4)
static inline complex double vhsum_cmplx (Tv a, Tv b)
{
Tv tmp=_mm256_hadd_pd(a,b);
Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
tmp=_mm256_add_pd(tmp,tmp2);
#ifdef UNSAFE_CODE
complex double ret;
*((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
return ret;
#else
union {Tv v; complex double c[2]; } u;
u.v=tmp; return u.c[0];
#endif
}
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
complex double * restrict c1, complex double * restrict c2)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
*((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
*((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
#else
union {Tv v; complex double c[2]; } u;
u.v=tmp1;
*c1+=u.c[0]; *c2+=u.c[1];
#endif
}
#endif
#endif

239
external/sharp/libsharp/sharp_core.c vendored Normal file
View file

@ -0,0 +1,239 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core.c
* Computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#include <complex.h>
#include <math.h>
#include <string.h>
#include "sharp_vecsupport.h"
#include "sharp_complex_hacks.h"
#include "sharp_ylmgen_c.h"
#include "sharp.h"
#include "sharp_core.h"
#include "c_utils.h"
typedef complex double dcmplx;
#define MAXJOB_SPECIAL 2
#define XCONCAT2(a,b) a##_##b
#define CONCAT2(a,b) XCONCAT2(a,b)
#define XCONCAT3(a,b,c) a##_##b##_##c
#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
#define nvec 1
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 2
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 3
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 4
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 5
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 6
#include "sharp_inchelper1.inc.c"
#undef nvec
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx)
{
int njobs=job->ntrans;
if (njobs<=MAXJOB_SPECIAL)
{
switch (njobs*16+job->nv)
{
#if (MAXJOB_SPECIAL>=1)
case 0x11:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x12:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x13:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x14:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x15:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x16:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=2)
case 0x21:
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x22:
CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x23:
CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x24:
CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x25:
CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x26:
CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=3)
case 0x31:
CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x32:
CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x33:
CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x34:
CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x35:
CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x36:
CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=4)
case 0x41:
CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x42:
CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x43:
CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x44:
CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x45:
CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x46:
CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=5)
case 0x51:
CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x52:
CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x53:
CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x54:
CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x55:
CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x56:
CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=6)
case 0x61:
CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x62:
CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x63:
CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x64:
CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x65:
CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x66:
CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
}
}
#if (MAXJOB_SPECIAL<6)
else
{
switch (job->nv)
{
case 1:
CONCAT2(inner_loop,1)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 2:
CONCAT2(inner_loop,2)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 3:
CONCAT2(inner_loop,3)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 4:
CONCAT2(inner_loop,4)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 5:
CONCAT2(inner_loop,5)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 6:
CONCAT2(inner_loop,6)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
}
}
#endif
UTIL_FAIL("Incorrect vector parameters");
}

50
external/sharp/libsharp/sharp_core.h vendored Normal file
View file

@ -0,0 +1,50 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core.h
* Interface for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_CORE_H
#define PLANCK_SHARP_CORE_H
#include "sharp_internal.h"
#include "sharp_ylmgen_c.h"
#ifdef __cplusplus
extern "C" {
#endif
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *idx);
#ifdef __cplusplus
}
#endif
#endif

288
external/sharp/libsharp/sharp_core_inc.c vendored Normal file
View file

@ -0,0 +1,288 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
typedef struct
{ Tv v[nvec]; } Tb;
typedef union
{ Tb b; double s[VLEN*nvec]; } Y(Tbu);
typedef struct
{ Tb r, i; } Y(Tbri);
typedef struct
{ Tb qr, qi, ur, ui; } Y(Tbqu);
typedef struct
{ double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
typedef struct
{ double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
typedef union
{ Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
typedef union
{ Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
static inline Tb Y(Tbconst)(double val)
{
Tv v=vload(val);
Tb res;
for (int i=0; i<nvec; ++i) res.v[i]=v;
return res;
}
static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
{ Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
static inline Tb Y(Tbprod)(Tb a, Tb b)
{ Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
{ for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
static inline void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
double maxval)
{
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
for (int i=0;i<nvec; ++i)
{
Tv mask = vgt(vabs(val->v[i]),vfmax);
while (vanyTrue(mask))
{
vmuleq(val->v[i],vblend(mask,vfsmall,vone));
vaddeq(scale->v[i],vblend(mask,vone,vzero));
mask = vgt(vabs(val->v[i]),vfmax);
}
mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
while (vanyTrue(mask))
{
vmuleq(val->v[i],vblend(mask,vfbig,vone));
vsubeq(scale->v[i],vblend(mask,vone,vzero));
mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
}
}
}
static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
Tb * restrict ress)
{
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
do
{
if (npow&1)
{
for (int i=0; i<nvec; ++i)
{
vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
}
for (int i=0; i<nvec; ++i)
{
vmuleq(val.v[i],val.v[i]);
vaddeq(scaleint.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
}
while(npow>>=1);
*resd=res;
*ress=scale;
}
static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
Tb * restrict scale)
{
int did_scale=0;
for (int i=0;i<nvec; ++i)
{
Tv mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
if (vanyTrue(mask))
{
did_scale=1;
Tv fact = vblend(mask,vload(sharp_fsmall),vone);
vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
vaddeq(scale->v[i],vblend(mask,vone,vzero));
}
}
return did_scale;
}
static inline int Y(TballLt)(Tb a,double b)
{
Tv vb=vload(b);
Tv res=vlt(a.v[0],vb);
for (int i=1; i<nvec; ++i)
res=vand(res,vlt(a.v[i],vb));
return vallTrue(res);
}
static inline int Y(TballGt)(Tb a,double b)
{
Tv vb=vload(b);
Tv res=vgt(a.v[0],vb);
for (int i=1; i<nvec; ++i)
res=vand(res,vgt(a.v[i],vb));
return vallTrue(res);
}
static inline int Y(TballGe)(Tb a,double b)
{
Tv vb=vload(b);
Tv res=vge(a.v[0],vb);
for (int i=1; i<nvec; ++i)
res=vand(res,vge(a.v[i],vb));
return vallTrue(res);
}
static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
const double * restrict cf)
{
Y(Tbu) sc, corf;
sc.b=scale;
for (int i=0; i<VLEN*nvec; ++i)
corf.s[i] = (sc.s[i]<sharp_minscale) ?
0. : cf[(int)(sc.s[i])-sharp_minscale];
*corfac=corf.b;
}
static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const sharp_Ylmgen_C * restrict gen)
{
int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
Y(mypow) (sth,l,&lam_2,&scale);
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
int below_limit = Y(TballLt)(scale,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,sharp_limscale);
l+=2;
}
*l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
}
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
Tb * restrict ryp, Tb * restrict rym, const Tb cth,
const sharp_ylmgen_dbl3 fx)
{
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
for (int i=0; i<nvec; ++i)
{
rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
vmul(fx2,rxp->v[i]));
rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
vmul(fx2,rxm->v[i]));
}
}
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb cth2, sth2;
for (int i=0; i<nvec; ++i)
{
cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
}
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
Tb rec2p, rec2m, scalep, scalem;
Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
Tv prefac=vload(gen->prefac[gen->m]),
prescale=vload(gen->fscale[gen->m]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i]=vmul(prefac,ccp.v[i]);
scalep.v[i]=vadd(prescale,ccps.v[i]);
rec2m.v[i]=vmul(prefac,csp.v[i]);
scalem.v[i]=vadd(prescale,csps.v[i]);
}
Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
scalep.v[i]=vadd(scalep.v[i],ssps.v[i]);
rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
if (gen->preMinus_p)
rec2p.v[i]=vneg(rec2p.v[i]);
if (gen->preMinus_m)
rec2m.v[i]=vneg(rec2m.v[i]);
if (gen->s&1)
rec2p.v[i]=vneg(rec2p.v[i]);
}
Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);
int l=gen->mhi;
int below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
l+=2;
}
*l_=l;
*rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
*rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
}

View file

@ -0,0 +1,810 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc2.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
typedef struct
{ Y(Tbri) j[njobs]; } Z(Tbrij);
typedef union
{ Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij);
typedef struct
{ Y(Tbqu) j[njobs]; } Z(Tbquj);
typedef union
{ Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj);
static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax)
{
#if (njobs>1)
while (l<lmax-2)
{
Tb lam_3, lam_4;
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar2=vload(creal(alm[njobs*l+j])),
ai2=vload(cimag(alm[njobs*l+j])),
ar4=vload(creal(alm[njobs*(l+2)+j])),
ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p1->j[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
}
Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])),
ar1=vload(creal(alm[njobs*(l+3)+j])),
ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p2->j[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
}
}
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4;
}
#endif
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
}
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2->j[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai);
}
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
}
}
}
}
static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
{
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr1,lam_2.v[i],p1->j[j].r.v[i]);
vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]);
}
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr2,lam_1.v[i],p2->j[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]);
}
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tre,lam_2.v[i],p1->j[j].r.v[i]);
vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
}
}
static void Z(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, int *done)
{
int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(p1->j[j].r.v[i],tmp,ar);
vfmaeq(p1->j[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(p2->j[j].r.v[i],tmp,ar);
vfmaeq(p2->j[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
if (l>lmax) { *done=1; return; }
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
}
static void Z(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, int *done)
{
int lmax=gen->lmax;
Tb lam_1,lam_2,scale;
int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p1->j[j].r.v[i]);
vfmaeq(tim,tmp,p1->j[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p2->j[j].r.v[i]);
vfmaeq(tim,tmp,p2->j[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
}
static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
{
for (int j=0; j<njobs; ++j)
{
Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px->j[j].qr.v[i],agr,lw);
vfmaeq(px->j[j].qi.v[i],agi,lw);
vfmaeq(px->j[j].ur.v[i],acr,lw);
vfmaeq(px->j[j].ui.v[i],aci,lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmseq(py->j[j].qr.v[i],aci,lx);
vfmaeq(py->j[j].qi.v[i],acr,lx);
vfmaeq(py->j[j].ur.v[i],agi,lx);
vfmseq(py->j[j].ui.v[i],agr,lx);
}
}
}
static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
const dcmplx * restrict alm1, const dcmplx * restrict alm2)
{
for (int j=0; j<njobs; ++j)
{
Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
for (int i=0; i<nvec; ++i)
{
Tv lw1=vadd(r2p.v[i],r2m.v[i]);
Tv lx2=vsub(r1m.v[i],r1p.v[i]);
vfmaseq(p1->j[j].qr.v[i],agr1,lw1,aci2,lx2);
vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2);
vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2);
vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2);
}
for (int i=0; i<nvec; ++i)
{
Tv lx1=vsub(r2m.v[i],r2p.v[i]);
Tv lw2=vadd(r1p.v[i],r1m.v[i]);
vfmaseq(p2->j[j].qr.v[i],agr2,lw2,aci1,lx1);
vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1);
vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1);
vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1);
}
}
}
static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
const Z(Tbquj) * restrict py, const Tb * restrict rxp,
const Tb * restrict rxm, dcmplx * restrict alm)
{
for (int j=0; j<njobs; ++j)
{
Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp->v[i],rxm->v[i]);
vfmaeq(agr,px->j[j].qr.v[i],lw);
vfmaeq(agi,px->j[j].qi.v[i],lw);
vfmaeq(acr,px->j[j].ur.v[i],lw);
vfmaeq(aci,px->j[j].ui.v[i],lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm->v[i],rxp->v[i]);
vfmseq(agr,py->j[j].ui.v[i],lx);
vfmaeq(agi,py->j[j].ur.v[i],lx);
vfmaeq(acr,py->j[j].qi.v[i],lx);
vfmseq(aci,py->j[j].qr.v[i],lx);
}
vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
}
}
static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
#if (njobs>1)
Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
&alm[2*njobs*(l+1)]);
#else
Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]);
#endif
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
}
static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)]);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
}
static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax)
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Z(saddstep)(p1, p2,
Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Z(saddstep)(p2, p1,
Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
if (l>lmax)
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(alm2map_spin_kernel) (cth,p1,p2,
rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
}
static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
}
static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm,
const dcmplx * restrict alm)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px->j[j].qr.v[i],ar,lw);
vfmaeq(px->j[j].qi.v[i],ai,lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmaeq(py->j[j].ur.v[i],ai,lx);
vfmseq(py->j[j].ui.v[i],ar,lx);
}
}
}
static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l]);
Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)]);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
}
static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax)
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
&alm[njobs*l]);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
&alm[njobs*l]);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
if (l>lmax)
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax);
}
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{
case SHARP_ALM2MAP:
case SHARP_ALM2MAP_DERIV1:
{
if (job->spin==0)
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Z(Tburij) p1,p2; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I,
r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I;
job->phase[phas_idx] = r1+r2;
if (ispair[itot])
job->phase[phas_idx+1] = r1-r2;
}
}
}
}
}
else
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Z(Tbuquj) p1,p2; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
}
(job->type==SHARP_ALM2MAP) ?
Z(calc_alm2map_spin ) (cth.b,gen,job,&p1.b,&p2.b,&done) :
Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I,
q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I,
u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I,
u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[itot])
{
dcmplx *phQ = &(job->phase[phas_idx+1]),
*phU = &(job->phase[phas_idx+3]);
*phQ = q1-q2;
*phU = u1-u2;
if ((gen->mhi-gen->m+gen->s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
}
}
}
}
}
}
break;
}
case SHARP_MAP2ALM:
{
if (job->spin==0)
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Z(Tburij) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2);
p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2);
}
}
}
Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
}
}
else
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Z(Tbuquj) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q);
p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U);
p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q);
p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U);
}
}
}
Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
}
}
break;
}
}
}
#undef VZERO

View file

@ -0,0 +1,800 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc3.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax, int njobs)
{
while (l<lmax-2)
{
Tb lam_3, lam_4;
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar2=vload(creal(alm[njobs*l+j])),
ai2=vload(cimag(alm[njobs*l+j])),
ar4=vload(creal(alm[njobs*(l+2)+j])),
ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
}
Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])),
ar1=vload(creal(alm[njobs*(l+3)+j])),
ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
}
}
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4;
}
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
}
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
}
}
}
static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
}
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
}
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
}
}
static void Y(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, int njobs, int *done)
{
int l,lmax=gen->lmax;
Tb lam_1,lam_2,scale;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(p1[j].r.v[i],tmp,ar);
vfmaeq(p1[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(p2[j].r.v[i],tmp,ar);
vfmaeq(p2[j].i.v[i],tmp,ai);
}
}
if (++l>lmax) break;
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
if (l>lmax) { *done=1; return; }
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
static void Y(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, int njobs, int *done)
{
int lmax=gen->lmax;
Tb lam_1,lam_2,scale;
int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
while (!full_ieee)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p1[j].r.v[i]);
vfmaeq(tim,tmp,p1[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
vfmaeq(tre,tmp,p2[j].r.v[i]);
vfmaeq(tim,tmp,p2[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
if (++l>lmax) { *done=1; return; }
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
}
}
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
{
for (int j=0; j<njobs; ++j)
{
Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px[j].qr.v[i],agr,lw);
vfmaeq(px[j].qi.v[i],agi,lw);
vfmaeq(px[j].ur.v[i],acr,lw);
vfmaeq(px[j].ui.v[i],aci,lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmseq(py[j].qr.v[i],aci,lx);
vfmaeq(py[j].qi.v[i],acr,lx);
vfmaeq(py[j].ur.v[i],agi,lx);
vfmseq(py[j].ui.v[i],agr,lx);
}
}
}
static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
const dcmplx * restrict alm1, const dcmplx * restrict alm2, int njobs)
{
for (int j=0; j<njobs; ++j)
{
Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
for (int i=0; i<nvec; ++i)
{
Tv lw1=vadd(r2p.v[i],r2m.v[i]);
Tv lx2=vsub(r1m.v[i],r1p.v[i]);
vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
}
for (int i=0; i<nvec; ++i)
{
Tv lx1=vsub(r2m.v[i],r2p.v[i]);
Tv lw2=vadd(r1p.v[i],r1m.v[i]);
vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
}
}
}
static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
const Y(Tbqu) * restrict py, const Tb * restrict rxp,
const Tb * restrict rxm, dcmplx * restrict alm, int njobs)
{
for (int j=0; j<njobs; ++j)
{
Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp->v[i],rxm->v[i]);
vfmaeq(agr,px[j].qr.v[i],lw);
vfmaeq(agi,px[j].qi.v[i],lw);
vfmaeq(acr,px[j].ur.v[i],lw);
vfmaeq(aci,px[j].ui.v[i],lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm->v[i],rxp->v[i]);
vfmseq(agr,py[j].ui.v[i],lx);
vfmaeq(agi,py[j].ur.v[i],lx);
vfmaeq(acr,py[j].qi.v[i],lx);
vfmseq(aci,py[j].qr.v[i],lx);
}
vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
}
}
static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
&alm[2*njobs*(l+1)], njobs);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l], njobs);
}
static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
int njobs)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l],njobs);
Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)],njobs);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
}
static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax)
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
&alm[2*njobs*l],njobs);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
&alm[2*njobs*l], njobs);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
if (l>lmax)
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax, njobs);
}
static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
int njobs, int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) { *done=1; return; }
job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs);
if (++l>lmax) { *done=1; return; }
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
}
static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
for (int i=0; i<nvec; ++i)
{
Tv lw=vadd(rxp.v[i],rxm.v[i]);
vfmaeq(px[j].qr.v[i],ar,lw);
vfmaeq(px[j].qi.v[i],ai,lw);
}
for (int i=0; i<nvec; ++i)
{
Tv lx=vsub(rxm.v[i],rxp.v[i]);
vfmaeq(py[j].ur.v[i],ai,lx);
vfmseq(py[j].ui.v[i],ar,lx);
}
}
}
static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax, int njobs)
{
while (l<lmax)
{
Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
fx2=vload(fx[l+1].f[2]);
for (int i=0; i<nvec; ++i)
{
rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
vmul(fx2,rec1p.v[i]));
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l],njobs);
Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)],njobs);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
vmul(fx2,rec2p.v[i]));
rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
vmul(fx2,rec2m.v[i]));
}
l+=2;
}
if (l==lmax)
Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
}
static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
int *done)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax)
{ *done=1; return; }
job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
&alm[njobs*l],njobs);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
&alm[njobs*l], njobs);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
}
}
if (l>lmax)
{ *done=1; return; }
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax, njobs);
}
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
static void Y(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
sharp_Ylmgen_prepare (gen, m);
switch (job->type)
{
case SHARP_ALM2MAP:
case SHARP_ALM2MAP_DERIV1:
{
if (job->spin==0)
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
job->phase[phas_idx] = r1+r2;
if (ispair[itot])
job->phase[phas_idx+1] = r1-r2;
}
}
}
}
}
else
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
}
(job->type==SHARP_ALM2MAP) ?
Y(calc_alm2map_spin )
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) :
Y(calc_alm2map_deriv1)
(cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[itot])
{
dcmplx *phQ = &(job->phase[phas_idx+1]),
*phU = &(job->phase[phas_idx+3]);
*phQ = q1-q2;
*phU = u1-u2;
if ((gen->mhi-gen->m+gen->s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
}
}
}
}
}
}
break;
}
case SHARP_MAP2ALM:
{
if (job->spin==0)
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
}
}
}
Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
else
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
}
}
}
Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
break;
}
}
}
#undef VZERO

140
external/sharp/libsharp/sharp_cxx.h vendored Normal file
View file

@ -0,0 +1,140 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_cxx.h
* Spherical transform library
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_CXX_H
#define PLANCK_SHARP_CXX_H
#include "sharp_lowlevel.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "xcomplex.h"
class sharp_base
{
protected:
sharp_alm_info *ainfo;
sharp_geom_info *ginfo;
public:
sharp_base()
: ainfo(0), ginfo(0) {}
~sharp_base()
{
sharp_destroy_geom_info(ginfo);
sharp_destroy_alm_info(ainfo);
}
void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta,
const double *weight)
{
sharp_make_geom_info (nrings, nph, ofs, stride, phi0, theta, weight,
&ginfo);
}
void set_ECP_geometry (int nrings, int nphi)
{ sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &ginfo); }
void set_Gauss_geometry (int nrings, int nphi)
{ sharp_make_gauss_geom_info (nrings, nphi, 1, nphi, &ginfo); }
void set_Healpix_geometry (int nside)
{ sharp_make_healpix_geom_info (nside, 1, &ginfo); }
void set_weighted_Healpix_geometry (int nside, const double *weight)
{ sharp_make_weighted_healpix_geom_info (nside, 1, weight, &ginfo); }
void set_triangular_alm_info (int lmax, int mmax)
{ sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo); }
};
template<typename T> struct cxxjobhelper__ {};
template<> struct cxxjobhelper__<double>
{ enum {val=1}; };
template<> struct cxxjobhelper__<float>
{ enum {val=0}; };
template<typename T> class sharp_cxxjob: public sharp_base
{
private:
static void *conv (xcomplex<T> *ptr)
{ return reinterpret_cast<void *>(ptr); }
static void *conv (const xcomplex<T> *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
static void *conv (T *ptr)
{ return reinterpret_cast<void *>(ptr); }
static void *conv (const T *ptr)
{ return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
public:
void alm2map (const xcomplex<T> *alm, T *map, bool add)
{
void *aptr=conv(alm), *mptr=conv(map);
sharp_execute (SHARP_ALM2MAP, 0, add, &aptr, &mptr, ginfo, ainfo, 1,
cxxjobhelper__<T>::val,0,0,0);
}
void alm2map_spin (const xcomplex<T> *alm1, const xcomplex<T> *alm2,
T *map1, T *map2, int spin, bool add)
{
void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_ALM2MAP, spin, add, aptr, mptr, ginfo, ainfo, 1,
cxxjobhelper__<T>::val,0,0,0);
}
void alm2map_der1 (const xcomplex<T> *alm, T *map1, T *map2, bool add)
{
void *aptr=conv(alm), *mptr[2];
mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_ALM2MAP_DERIV1, 1, add,&aptr, mptr, ginfo, ainfo,
1, cxxjobhelper__<T>::val,0,0,0);
}
void map2alm (const T *map, xcomplex<T> *alm, bool add)
{
void *aptr=conv(alm), *mptr=conv(map);
sharp_execute (SHARP_MAP2ALM, 0, add, &aptr, &mptr, ginfo, ainfo, 1,
cxxjobhelper__<T>::val,0,0,0);
}
void map2alm_spin (const T *map1, const T *map2, xcomplex<T> *alm1,
xcomplex<T> *alm2, int spin, bool add)
{
void *aptr[2], *mptr[2];
aptr[0]=conv(alm1); aptr[1]=conv(alm2);
mptr[0]=conv(map1); mptr[1]=conv(map2);
sharp_execute (SHARP_MAP2ALM, spin, add, aptr, mptr, ginfo, ainfo, 1,
cxxjobhelper__<T>::val,0,0,0);
}
};
#endif

View file

@ -0,0 +1,222 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_geomhelpers.c
* Spherical transform library
*
* Copyright (C) 2006-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#include <math.h>
#include "sharp_geomhelpers.h"
#include "c_utils.h"
void sharp_make_healpix_geom_info (int nside, int stride,
sharp_geom_info **geom_info)
{
double *weight=RALLOC(double,2*nside);
SET_ARRAY(weight,0,2*nside,1);
sharp_make_weighted_healpix_geom_info (nside, stride, weight, geom_info);
DEALLOC(weight);
}
void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
ptrdiff_t npix=(ptrdiff_t)nside*nside*12;
ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1);
int nrings=4*nside-1;
double *theta=RALLOC(double,nrings);
double *weight_=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
for (int m=0; m<nrings; ++m)
{
int ring=m+1;
ptrdiff_t northring = (ring>2*nside) ? 4*nside-ring : ring;
stride_[m] = stride;
if (northring < nside)
{
theta[m] = 2*asin(northring/(sqrt(6.)*nside));
nph[m] = 4*northring;
phi0[m] = pi/nph[m];
ofs[m] = 2*northring*(northring-1)*stride;
}
else
{
double fact1 = (8.*nside)/npix;
double costheta = (2*nside-northring)*fact1;
theta[m] = acos(costheta);
nph[m] = 4*nside;
if ((northring-nside) & 1)
phi0[m] = 0;
else
phi0[m] = pi/nph[m];
ofs[m] = (ncap + (northring-nside)*nph[m])*stride;
}
if (northring != ring) /* southern hemisphere */
{
theta[m] = pi-theta[m];
ofs[m] = (npix - nph[m])*stride - ofs[m];
}
weight_[m]=4.*pi/npix*weight[northring-1];
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_,
geom_info);
DEALLOC(theta);
DEALLOC(weight_);
DEALLOC(nph);
DEALLOC(phi0);
DEALLOC(ofs);
DEALLOC(stride_);
}
static void gauleg (double x1, double x2, double *x, double *w, int n)
{
const double pi = 3.141592653589793238462643383279502884197;
const double eps = 3.0E-14;
int m = (n+1)/2;
double xm = 0.5*(x2+x1);
double xl = 0.5*(x2-x1);
for(int i=1; i<=m; ++i)
{
double z = cos(pi*(i-0.25)/(n+0.5));
double pp;
int dobreak=0;
while(1)
{
double p1 = 1.0, p2 = 0.0;
double z1 = z;
int j;
for(j=1; j<=n; ++j)
{
double p3 = p2;
p2 = p1;
p1 = ((2*j-1)*z*p2-(j-1)*p3)/j;
}
pp = n*(z*p1-p2)/(z*z-1);
z = z1 - p1/pp;
if (dobreak) break;
if (fabs(z-z1) <= eps) dobreak=1;
}
x[i-1] = xm - xl*z;
x[n-i] = xm + xl*z;
w[i-1] = w[n-i] = 2*xl/((1-z*z)*pp*pp);
}
}
static void makeweights (int bw, double *weights)
{
const double pi = 3.141592653589793238462643383279502884197;
const double fudge = pi/(4*bw);
for (int j=0; j<2*bw; ++j)
{
double tmpsum = 0;
for (int k=0; k<bw; ++k)
tmpsum += 1./(2*k+1) * sin((2*j+1)*(2*k+1)*fudge);
tmpsum *= sin((2*j+1)*fudge);
tmpsum *= 2./bw;
weights[j] = tmpsum;
/* weights[j + 2*bw] = tmpsum * sin((2*j+1)*fudge); */
}
}
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
gauleg(-1,1,theta,weight,nrings);
for (int m=0; m<nrings; ++m)
{
theta[m] = acos(-theta[m]);
nph[m]=nphi;
phi0[m]=0;
ofs[m]=(ptrdiff_t)m*stride_lat;
stride_[m]=stride_lon;
weight[m]*=2*pi/nphi;
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0);
DEALLOC(ofs);
DEALLOC(stride_);
}
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
const double pi=3.141592653589793238462643383279502884197;
double *theta=RALLOC(double,nrings);
double *weight=RALLOC(double,nrings);
int *nph=RALLOC(int,nrings);
double *phi0_=RALLOC(double,nrings);
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
UTIL_ASSERT((nrings&1)==0,
"Even number of rings needed for equidistant grid!");
makeweights(nrings/2,weight);
for (int m=0; m<nrings; ++m)
{
theta[m] = (m+0.5)*pi/nrings;
nph[m]=nphi;
phi0_[m]=phi0;
ofs[m]=(ptrdiff_t)m*stride_lat;
stride_[m]=stride_lon;
weight[m]*=2*pi/nphi;
}
sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
geom_info);
DEALLOC(theta);
DEALLOC(weight);
DEALLOC(nph);
DEALLOC(phi0_);
DEALLOC(ofs);
DEALLOC(stride_);
}

View file

@ -0,0 +1,82 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_geomhelpers.h
* SHARP helper function for the creation of grid geometries
*
* Copyright (C) 2006-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_GEOMHELPERS_H
#define PLANCK_SHARP_GEOMHELPERS_H
#include "sharp_lowlevel.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Creates a geometry information describing a HEALPix map with an
Nside parameter \a nside.
\ingroup geominfogroup */
void sharp_make_healpix_geom_info (int nside, int stride,
sharp_geom_info **geom_info);
/*! Creates a geometry information describing a HEALPix map with an
Nside parameter \a nside. \a weight contains the relative ring
weights and must have \a 2*nside entries.
\ingroup geominfogroup */
void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info);
/*! Creates a geometry information describing a Gaussian map with \a nrings
iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
pixel in each ring is 0. The index difference between two adjacent pixels
in an iso-latitude ring is \a stride_lon, the index difference between the
two start pixels in consecutive iso-latitude rings is \a stride_lat.
\ingroup geominfogroup */
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
int stride_lat, sharp_geom_info **geom_info);
/*! Creates a geometry information describing an ECP map with \a nrings
iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
pixel in each ring is \a phi0 (in radians). The index difference between
two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
\note The spacing of pixel centers is equidistant in colatitude and
longitude.
\note \a nrings must be an even number.
\note The sphere is pixelized in a way that the colatitude of the first ring
is \a 0.5*(pi/nrings). There are no pixel centers at the poles.
\ingroup geominfogroup */
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,57 @@
#define Tb CONCAT2(Tb,nvec)
#define Y(arg) CONCAT2(arg,nvec)
#include "sharp_core_inc.c"
#if (MAXJOB_SPECIAL<6)
#include "sharp_core_inc3.c"
#endif
#if (MAXJOB_SPECIAL>=1)
#define njobs 1
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=2)
#define njobs 2
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=3)
#define njobs 3
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=4)
#define njobs 4
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=5)
#define njobs 5
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=6)
#define njobs 6
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#undef Y
#undef Tb

View file

@ -0,0 +1,66 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_internal.h
* Internally used functionality for the spherical transform library.
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_INTERNAL_H
#define PLANCK_SHARP_INTERNAL_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include "sharp.h"
typedef enum { FLOAT, DOUBLE } sharp_fde;
typedef struct
{
sharp_jobtype type;
int spin;
int add_output;
int nmaps, nalm;
sharp_fde fde;
void **map;
void **alm;
complex double *phase;
double *norm_l;
complex double *almtmp;
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
int nv;
double time;
int ntrans;
unsigned long long opcnt;
} sharp_job;
int sharp_get_nv_max (void);
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
#endif

188
external/sharp/libsharp/sharp_lowlevel.h vendored Normal file
View file

@ -0,0 +1,188 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_lowlevel.h
* Low-level, portable interface for the spherical transform library.
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_LOWLEVEL_H
#define PLANCK_SHARP_LOWLEVEL_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/*! \internal
Helper type containing information about a single ring. */
typedef struct
{
double theta, phi0, weight, cth, sth;
ptrdiff_t ofs;
int nph, stride;
} sharp_ringinfo;
/*! \internal
Helper type containing information about a pair of rings with colatitudes
symmetric around the equator. */
typedef struct
{
sharp_ringinfo r1,r2;
} sharp_ringpair;
/*! \internal
Type holding all required information about a map geometry. */
typedef struct
{
sharp_ringpair *pair;
int npairs;
} sharp_geom_info;
/*! \defgroup almgroup Helpers for dealing with a_lm */
/*! \{ */
/*! \internal
Helper type for index calculation in a_lm arrays. */
typedef struct
{
/*! Maximum \a l index of the array */
int lmax;
/*! Number of different \a m values in this object */
int nm;
/*! Array with \a nm entries containing the individual m values */
int *mval;
/*! Array with \a nm entries containing the (hypothetical) indices of
the coefficients with quantum numbers 0,\a mval[i] */
ptrdiff_t *mvstart;
/*! Stride between a_lm and a_(l+1),m */
ptrdiff_t stride;
} sharp_alm_info;
/*! Creates an Alm data structure information from the following parameters:
\param lmax maximum \a l quantum number (>=0)
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
\param stride the stride between consecutive a_lm entries
\param mstart the index of the (hypothetical) coefficient with the
quantum numbers 0,\a m. Must have \a mmax+1 entries.
\param alm_info will hold a pointer to the newly created data structure
*/
void sharp_make_alm_info (int lmax, int mmax, int stride,
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
/*! Creates an Alm data structure information from the following parameters:
\param lmax maximum \a l quantum number (>=0)
\param nm number of different \a m (<=\a lmax+1)
\param stride the stride between consecutive a_lm entries
\param mval array with \a nm entries containing the individual m values
\param mvstart array with \a nm entries containing the (hypothetical)
indices of the coefficients with the quantum numbers 0,\a mval[i]
\param alm_info will hold a pointer to the newly created data structure
*/
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
const ptrdiff_t *mvstart, sharp_alm_info **alm_info);
/*! Returns the index of the coefficient with quantum numbers \a l,
\a mval[mi]. */
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
/*! Deallocates the a_lm info object. */
void sharp_destroy_alm_info (sharp_alm_info *info);
/*! \} */
/*! \defgroup geominfogroup Functions for dealing with geometry information */
/*! \{ */
/*! Creates a geometry information from a set of ring descriptions.
All arrays passed to this function must have \a nrings elements.
\param nrings the number of rings in the map
\param nph the number of pixels in each ring
\param ofs the index of the first pixel in each ring in the map array
\param stride the stride between consecutive pixels
\param phi0 the azimuth (in radians) of the first pixel in each ring
\param theta the colatitude (in radians) of each ring
\param weight the pixel weight to be used for the ring
\param geom_info will hold a pointer to the newly created data structure
*/
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta,
const double *weight, sharp_geom_info **geom_info);
/*! Deallocates the geometry information in \a info. */
void sharp_destroy_geom_info (sharp_geom_info *info);
/*! \} */
/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
/*! \{ */
/*! Enumeration of SHARP job types. */
typedef enum { SHARP_MAP2ALM, /*!< analysis */
SHARP_ALM2MAP, /*!< synthesis */
SHARP_ALM2MAP_DERIV1 /*!< synthesis of first derivatives */
} sharp_jobtype;
/*! Performs a libsharp SHT job. The interface deliberately does not use
the C99 "complex" data type, in order to be callable from C.
\param type the type of SHT
\param spin the spin of the quantities to be transformed
\param add_output if 0, the output arrays will be overwritten,
else the result will be added to the output arrays.
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
depends on the \a dp parameter.
\param map contains pointers to the maps. If \a spin==0,
map[0] points to the map of the first SHT, map[1] to that of the second
etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
map[2] and map[3] to those of the second, etc. The exact data type of \a map
depends on the \a dp parameter.
\param geom_info A \c sharp_geom_info object compatible with the provided
\a map arrays.
\param alm_info A \c sharp_alm_info object compatible with the provided
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
exactly once.
\param ntrans the number of simultaneous SHTs
\param dp if 0, the \a alm is expected to have the type "complex float **"
and \a map is expected to have the type "float **"; otherwise the expected
types are "complex double **" and "double **", respectively.
\param nv Internally used SHT parameter. Set to 0 unless you know what you are
doing.
\param time If not NULL, the wall clock time required for this SHT
(in seconds)will be written here.
\param opcnt If not NULL, a conservative estimate of the total floating point
operation count for this SHT will be written here. */
void sharp_execute (sharp_jobtype type, int spin, int add_output, void *alm,
void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
int ntrans, int dp, int nv, double *time, unsigned long long *opcnt);
/*! \} */
#ifdef __cplusplus
}
#endif
#endif

300
external/sharp/libsharp/sharp_mpi.c vendored Normal file
View file

@ -0,0 +1,300 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_mpi.c
* Functionality only needed for MPI-parallel transforms
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifdef USE_MPI
#include "sharp_mpi.h"
typedef struct
{
int ntasks; /* number of tasks */
int mytask; /* own task number */
MPI_Comm comm; /* communicator to use */
int *nm; /* number of m values on every task */
int *ofs_m; /* accumulated nm */
int nmtotal; /* total number of m values (must be mmax+1) */
int *mval; /* array containing all m values of task 0, task 1 etc. */
int mmax;
int nph;
int *npair; /* number of ring pairs on every task */
int *ofs_pair; /* accumulated npair */
int npairtotal; /* total number of ring pairs */
double *theta; /* theta of first ring of every pair on task 0, task 1 etc. */
int *ispair; /* is this really a pair? */
int *almcount, *almdisp, *mapcount, *mapdisp; /* for all2all communication */
} sharp_mpi_info;
static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
sharp_mpi_info *minfo)
{
minfo->comm = comm;
MPI_Comm_size (comm, &minfo->ntasks);
MPI_Comm_rank (comm, &minfo->mytask);
minfo->nm=RALLOC(int,minfo->ntasks);
MPI_Allgather ((int *)(&job->ainfo->nm),1,MPI_INT,minfo->nm,1,MPI_INT,comm);
minfo->ofs_m=RALLOC(int,minfo->ntasks+1);
minfo->ofs_m[0]=0;
for (int i=1; i<=minfo->ntasks; ++i)
minfo->ofs_m[i] = minfo->ofs_m[i-1]+minfo->nm[i-1];
minfo->nmtotal=minfo->ofs_m[minfo->ntasks];
minfo->mval=RALLOC(int,minfo->nmtotal);
MPI_Allgatherv(job->ainfo->mval, job->ainfo->nm, MPI_INT, minfo->mval,
minfo->nm, minfo->ofs_m, MPI_INT, comm);
minfo->mmax=sharp_get_mmax(minfo->mval,minfo->nmtotal);
minfo->npair=RALLOC(int,minfo->ntasks);
MPI_Allgather ((int *)(&job->ginfo->npairs), 1, MPI_INT, minfo->npair, 1,
MPI_INT, comm);
minfo->ofs_pair=RALLOC(int,minfo->ntasks+1);
minfo->ofs_pair[0]=0;
for (int i=1; i<=minfo->ntasks; ++i)
minfo->ofs_pair[i] = minfo->ofs_pair[i-1]+minfo->npair[i-1];
minfo->npairtotal=minfo->ofs_pair[minfo->ntasks];
double *theta_tmp=RALLOC(double,job->ginfo->npairs);
int *ispair_tmp=RALLOC(int,job->ginfo->npairs);
for (int i=0; i<job->ginfo->npairs; ++i)
{
theta_tmp[i]=job->ginfo->pair[i].r1.theta;
ispair_tmp[i]=job->ginfo->pair[i].r2.nph>0;
}
minfo->theta=RALLOC(double,minfo->npairtotal);
minfo->ispair=RALLOC(int,minfo->npairtotal);
MPI_Allgatherv(theta_tmp, job->ginfo->npairs, MPI_DOUBLE, minfo->theta,
minfo->npair, minfo->ofs_pair, MPI_DOUBLE, comm);
MPI_Allgatherv(ispair_tmp, job->ginfo->npairs, MPI_INT, minfo->ispair,
minfo->npair, minfo->ofs_pair, MPI_INT, comm);
DEALLOC(theta_tmp);
DEALLOC(ispair_tmp);
minfo->nph=2*job->nmaps*job->ntrans;
minfo->almcount=RALLOC(int,minfo->ntasks);
minfo->almdisp=RALLOC(int,minfo->ntasks+1);
minfo->mapcount=RALLOC(int,minfo->ntasks);
minfo->mapdisp=RALLOC(int,minfo->ntasks+1);
minfo->almdisp[0]=minfo->mapdisp[0]=0;
for (int i=0; i<minfo->ntasks; ++i)
{
minfo->almcount[i] = 2*minfo->nph*minfo->nm[minfo->mytask]*minfo->npair[i];
minfo->almdisp[i+1] = minfo->almdisp[i]+minfo->almcount[i];
minfo->mapcount[i] = 2*minfo->nph*minfo->nm[i]*minfo->npair[minfo->mytask];
minfo->mapdisp[i+1] = minfo->mapdisp[i]+minfo->mapcount[i];
}
}
static void sharp_destroy_mpi_info (sharp_mpi_info *minfo)
{
DEALLOC(minfo->nm);
DEALLOC(minfo->ofs_m);
DEALLOC(minfo->mval);
DEALLOC(minfo->npair);
DEALLOC(minfo->ofs_pair);
DEALLOC(minfo->theta);
DEALLOC(minfo->ispair);
DEALLOC(minfo->almcount);
DEALLOC(minfo->almdisp);
DEALLOC(minfo->mapcount);
DEALLOC(minfo->mapdisp);
}
static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph)
{
dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);
MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp,
minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm);
DEALLOC(*ph);
ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal);
for (int task=0; task<minfo->ntasks; ++task)
for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
for (int mi=0; mi<minfo->nm[task]; ++mi)
{
int m = minfo->mval[mi+minfo->ofs_m[task]];
int o1 = minfo->nph*(th*(minfo->mmax+1) + m);
int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]);
for (int i=0; i<minfo->nph; ++i)
(*ph)[o1+i] = phas_tmp[o2+i];
}
DEALLOC(phas_tmp);
}
static void sharp_communicate_map2alm (const sharp_mpi_info *minfo, dcmplx **ph)
{
dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);
for (int task=0; task<minfo->ntasks; ++task)
for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
for (int mi=0; mi<minfo->nm[task]; ++mi)
{
int m = minfo->mval[mi+minfo->ofs_m[task]];
int o1 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]);
int o2 = minfo->nph*(th*(minfo->mmax+1) + m);
for (int i=0; i<minfo->nph; ++i)
phas_tmp[o1+i] = (*ph)[o2+i];
}
DEALLOC(*ph);
ALLOC(*ph,dcmplx,minfo->nph*minfo->nm[minfo->mytask]*minfo->npairtotal);
MPI_Alltoallv (phas_tmp,minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,
*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,minfo->comm);
DEALLOC(phas_tmp);
}
static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
int nmfull, int nthetafull)
{
ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
(ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
}
static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
{
if (job->type != SHARP_MAP2ALM)
sharp_communicate_alm2map (minfo,&job->phase);
}
static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
{
if (job->type == SHARP_MAP2ALM)
sharp_communicate_map2alm (minfo,&job->phase);
}
static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
{
double timer=wallTime();
int ntasks;
MPI_Comm_size(comm, &ntasks);
if (ntasks==1) /* fall back to scalar implementation */
{ sharp_execute_job (job); return; }
int lmax = job->ainfo->lmax;
job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
sharp_mpi_info minfo;
sharp_make_mpi_info(comm, job, &minfo);
/* clear output arrays if requested */
init_output (job);
alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
minfo.npairtotal);
double *cth = RALLOC(double,minfo.npairtotal),
*sth = RALLOC(double,minfo.npairtotal);
idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
{
cth[i] = cos(minfo.theta[i]);
sth[i] = sin(minfo.theta[i]);
stmp[i].s=sth[i];
stmp[i].i=i;
}
qsort (stmp,minfo.npairtotal,sizeof(idxhelper),idx_compare);
int *idx = RALLOC(int,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
idx[i]=stmp[i].i;
DEALLOC(stmp);
/* map->phase where necessary */
map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
map2alm_comm (job, &minfo);
#pragma omp parallel
{
sharp_job ljob = *job;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
/* inner conversion loop */
inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
&generator, mi, idx);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
alm2map_comm (job, &minfo);
/* phase->map where necessary */
phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
DEALLOC(cth);
DEALLOC(sth);
DEALLOC(idx);
DEALLOC(job->norm_l);
dealloc_phase (job);
sharp_destroy_mpi_info(&minfo);
job->time=wallTime()-timer;
}
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
int add_output, void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time,
unsigned long long *opcnt)
{
sharp_job job;
sharp_build_job_common (&job, type, spin, add_output, alm, map, geom_info,
alm_info, ntrans, dp, nv);
sharp_execute_job_mpi (&job, comm);
if (time!=NULL) *time = job.time;
if (opcnt!=NULL) *opcnt = job.opcnt;
}
#endif

85
external/sharp/libsharp/sharp_mpi.h vendored Normal file
View file

@ -0,0 +1,85 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_mpi.h
* Interface for the spherical transform library with MPI support.
*
* Copyright (C) 2011,2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_MPI_H
#define PLANCK_SHARP_MPI_H
#include <mpi.h>
#include "sharp.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Performs an MPI parallel libsharp SHT job. The interface deliberately does
not use the C99 "complex" data type, in order to be callable from C.
\param comm the MPI communicator to be used for this SHT
\param type the type of SHT
\param spin the spin of the quantities to be transformed
\param add_output if 0, the output arrays will be overwritten,
else the result will be added to the output arrays.
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
depends on the \a dp parameter.
\param map contains pointers to the maps. If \a spin==0,
map[0] points to the map of the first SHT, map[1] to that of the second
etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
map[2] and map[3] to those of the second, etc. The exact data type of \a map
depends on the \a dp parameter.
\param geom_info A \c sharp_geom_info object compatible with the provided
\a map arrays. The total map geometry is the union of all \a geom_info
objects over the participating MPI tasks.
\param alm_info A \c sharp_alm_info object compatible with the provided
\a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
exactly once in the union of all \a alm_info objects over the participating
MPI tasks.
\param ntrans the number of simultaneous SHTs
\param dp if 0, the \a alm is expected to have the type "complex float **"
and \a map is expected to have the type "float **"; otherwise the expected
types are "complex double **" and "double **", respectively.
\param nv Internally used SHT parameter. Set to 0 unless you know what you are
doing.
\param time If not NULL, the wall clock time required for this SHT
(in seconds)will be written here.
\param opcnt If not NULL, a conservative estimate of the total floating point
operation count for this SHT will be written here. */
void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
int add_output, void *alm, void *map, const sharp_geom_info *geom_info,
const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time,
unsigned long long *opcnt);
#ifdef __cplusplus
}
#endif
#endif

249
external/sharp/libsharp/sharp_test.c vendored Normal file
View file

@ -0,0 +1,249 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test.c
Accuracy test for libsharp's map analysis.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_announce.h"
#include "sharp_core.h"
#include "memusage.h"
typedef complex double dcmplx;
static double drand (double min, double max)
{ return min + (max-min)*rand()/(RAND_MAX+1.0); }
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
{
for (int mi=0;mi<helper->nm; ++mi)
{
int m=helper->mval[mi];
for (int l=m;l<=helper->lmax; ++l)
{
if ((l<spin)&&(m<spin))
alm[sharp_alm_index(helper,l,mi)] = 0.;
else
{
double rv = drand(-1,1);
double iv = (m==0) ? 0 : drand(-1,1);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
}
}
}
}
static void measure_errors (dcmplx **alm, dcmplx **alm2,
ptrdiff_t nalms, int ncomp)
{
for (int i=0; i<ncomp; ++i)
{
double sum=0, sum2=0, maxdiff=0;
for (ptrdiff_t m=0; m<nalms; ++m)
{
double x=creal(alm[i][m])-creal(alm2[i][m]),
y=cimag(alm[i][m])-cimag(alm2[i][m]);
sum+=x*x+y*y;
sum2+=creal(alm[i][m])*creal(alm[i][m])+cimag(alm[i][m])*cimag(alm[i][m]);
if (fabs(x)>maxdiff) maxdiff=fabs(x);
if (fabs(y)>maxdiff) maxdiff=fabs(y);
}
sum=sqrt(sum/nalms);
sum2=sqrt(sum2/nalms);
printf("component %i: rms %e, maxerr %e\n",i, sum/sum2, maxdiff);
}
}
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter)
{
int ncomp = ntrans*((spin==0) ? 1 : 2);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
double time;
unsigned long long opcnt;
sharp_execute(SHARP_MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,
&time,&opcnt);
printf("wall time for map2alm: %fs\n",time);
printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
measure_errors(alm_orig,alm,nalms,ncomp);
for (int iter=0; iter<niter; ++iter)
{
double **map2;
ALLOC2D(map2,double,ncomp,npix);
printf ("\niteration %i:\n", iter+1);
sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map2[0],tinfo,alms,ntrans,1,0,
&time,&opcnt);
printf("wall time for alm2map: %fs\n",time);
printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
for (int i=0; i<ncomp; ++i)
for (ptrdiff_t m=0; m<npix; ++m)
map2[i][m] = map[i][m]-map2[i][m];
sharp_execute(SHARP_MAP2ALM,spin,1,&alm[0],&map2[0],tinfo,alms,ntrans,1,0,
&time,&opcnt);
printf("wall time for map2alm: %fs\n",time);
printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
DEALLOC2D(map2);
measure_errors(alm_orig,alm,nalms,ncomp);
}
sharp_destroy_alm_info(alms);
}
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
{
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
srand(4);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
for (int i=0; i<ncomp; ++i)
random_alm(alm[i],alms,spin);
dcmplx **alm2;
ALLOC2D(alm2,dcmplx,ncomp,nalms);
double time;
unsigned long long opcnt;
printf ("\niteration 0:\n");
sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,
&time,&opcnt);
printf("wall time for alm2map: %fs\n",time);
printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
map2alm_iter(tinfo,map,alm,alm2,lmax,mmax,npix,nalms,spin,ntrans,niter);
DEALLOC2D(map);
DEALLOC2D(alm);
DEALLOC2D(alm2);
sharp_destroy_alm_info(alms);
}
int main(int argc, char **argv)
{
#ifdef USE_MPI
MPI_Init(NULL,NULL);
#endif
sharp_module_startup("sharp_test",argc,7,
"<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",1);
int lmax=atoi(argv[2]);
int niter=atoi(argv[4]);
int spin=atoi(argv[5]);
int ntrans=atoi(argv[6]);
printf("Testing map analysis accuracy.\n");
printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
sharp_geom_info *tinfo;
if (strcmp(argv[1],"gauss")==0)
{
int nrings=lmax+1;
int ppring=atoi(argv[3]);
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
nrings,ppring,(long)npix);
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else if (strcmp(argv[1],"ecp")==0)
{
int nrings=2*lmax+2;
int ppring=atoi(argv[3]);
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
nrings,ppring,(long)npix);
sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else if (strcmp(argv[1],"healpix")==0)
{
int nside=atoi(argv[3]);
if (nside<1) nside=1;
ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
nside,(long)npix);
sharp_make_healpix_geom_info (nside, 1, &tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else
UTIL_FAIL("unknown grid geometry");
printf("\nMemory high water mark: %.2f MB\n",VmHWM()/(1<<20));
#ifdef USE_MPI
MPI_Finalize();
#endif
return 0;
}

359
external/sharp/libsharp/sharp_test_mpi.c vendored Normal file
View file

@ -0,0 +1,359 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test_mpi.c
Accuracy test for libsharp's map analysis with MPI support.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#ifdef USE_MPI
#include <stdio.h>
#include <string.h>
#include "sharp_mpi.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "walltime_c.h"
#include "sharp_announce.h"
#include "sharp_core.h"
typedef complex double dcmplx;
int ntasks, mytask;
static unsigned long long totalops (unsigned long long val)
{
unsigned long long tmp;
MPI_Allreduce (&val, &tmp,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
return tmp;
}
static double maxTime (double val)
{
double tmp;
MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
return tmp;
}
static double drand (double min, double max)
{ return min + (max-min)*rand()/(RAND_MAX+1.0); }
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
{
ptrdiff_t res=0;
for (int i=0; i<ainfo->nm; ++i)
res += ainfo->lmax-ainfo->mval[i]+1;
return res;
}
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
{
ptrdiff_t res=0;
for (int i=0; i<ginfo->npairs; ++i)
{
res += ginfo->pair[i].r1.nph;
if (ginfo->pair[i].r2.nph>0) res += ginfo->pair[i].r2.nph;
}
return res;
}
static void reduce_alm_info(sharp_alm_info *ainfo)
{
int nmnew=0;
ptrdiff_t ofs = 0;
for (int i=mytask; i<ainfo->nm; i+=ntasks,++nmnew)
{
ainfo->mval[nmnew]=ainfo->mval[i];
ainfo->mvstart[nmnew]=ofs-ainfo->mval[nmnew];
ofs+=ainfo->lmax-ainfo->mval[nmnew]+1;
}
ainfo->nm=nmnew;
}
static void reduce_geom_info(sharp_geom_info *ginfo)
{
int npairsnew=0;
ptrdiff_t ofs = 0;
for (int i=mytask; i<ginfo->npairs; i+=ntasks,++npairsnew)
{
ginfo->pair[npairsnew]=ginfo->pair[i];
ginfo->pair[npairsnew].r1.ofs=ofs;
ofs+=ginfo->pair[npairsnew].r1.nph;
ginfo->pair[npairsnew].r2.ofs=ofs;
if (ginfo->pair[npairsnew].r2.nph>0) ofs+=ginfo->pair[npairsnew].r2.nph;
}
ginfo->npairs=npairsnew;
}
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
{
static int cnt=0;
++cnt;
for (int mi=0;mi<helper->nm; ++mi)
{
int m=helper->mval[mi];
srand(1234567*cnt+8912*m);
for (int l=m;l<=helper->lmax; ++l)
{
if ((l<spin)&&(m<spin))
alm[sharp_alm_index(helper,l,mi)] = 0.;
else
{
double rv = drand(-1,1);
double iv = (m==0) ? 0 : drand(-1,1);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
}
}
}
}
static void measure_errors (dcmplx **alm, dcmplx **alm2,
const sharp_alm_info *ainfo, int ncomp)
{
long nalms=get_nalms(ainfo), nalms_tot;
MPI_Allreduce(&nalms,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);
for (int i=0; i<ncomp; ++i)
{
double sum=0, sum2=0, maxdiff=0, sumtot, sum2tot, maxdifftot;
for (int mi=0; mi<ainfo->nm; ++mi)
{
int m=ainfo->mval[mi];
for (int l=m; l<=ainfo->lmax; ++l)
{
ptrdiff_t idx=sharp_alm_index(ainfo,l,mi);
double x=creal(alm[i][idx])-creal(alm2[i][idx]),
y=cimag(alm[i][idx])-cimag(alm2[i][idx]);
sum+=x*x+y*y;
sum2+=creal(alm[i][idx])*creal(alm[i][idx])
+cimag(alm[i][idx])*cimag(alm[i][idx]);
if (fabs(x)>maxdiff) maxdiff=fabs(x);
if (fabs(y)>maxdiff) maxdiff=fabs(y);
}
}
MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
MPI_Allreduce(&sum2,&sum2tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
sumtot=sqrt(sumtot/nalms_tot);
sum2tot=sqrt(sum2tot/nalms_tot);
if (mytask==0)
printf("component %i: rms %e, maxerr %e\n",i, sumtot/sum2tot, maxdifftot);
}
}
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
ptrdiff_t npix, int spin, int ntrans, int niter)
{
int ncomp = ntrans*((spin==0) ? 1 : 2);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
reduce_alm_info(alms);
double jtime;
unsigned long long jopcnt;
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,0,&alm[0],&map[0],
tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
unsigned long long opcnt=totalops(jopcnt);
double timer=maxTime(jtime);
if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
measure_errors(alm_orig,alm,alms,ncomp);
for (int iter=0; iter<niter; ++iter)
{
double **map2;
ALLOC2D(map2,double,ncomp,npix);
if (mytask==0) printf ("\niteration %i:\n", iter+1);
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm[0],&map2[0],
tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
opcnt=totalops(jopcnt);
timer=maxTime(jtime);
if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
for (int i=0; i<ncomp; ++i)
for (ptrdiff_t m=0; m<npix; ++m)
map2[i][m] = map[i][m]-map2[i][m];
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,1,&alm[0],&map2[0],
tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
opcnt=totalops(jopcnt);
timer=maxTime(jtime);
if (mytask==0) printf("wall time for map2alm: %fs\n",wallTime()-timer);
if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
DEALLOC2D(map2);
measure_errors(alm_orig,alm,alms,ncomp);
}
sharp_destroy_alm_info(alms);
}
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
{
int ncomp = ntrans*((spin==0) ? 1 : 2);
double **map;
ALLOC2D(map,double,ncomp,npix);
double jtime;
unsigned long long jopcnt;
sharp_alm_info *alms;
ptrdiff_t nalms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
reduce_alm_info(alms);
nalms=get_nalms(alms);
dcmplx **alm;
ALLOC2D(alm,dcmplx,ncomp,nalms);
srand(4);
for (int i=0; i<ncomp; ++i)
random_alm(alm[i],alms,spin);
dcmplx **alm2;
ALLOC2D(alm2,dcmplx,ncomp,nalms);
if (mytask==0) printf ("\niteration 0:\n");
sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm[0],&map[0],
tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
unsigned long long opcnt=totalops(jopcnt);
double timer=maxTime(jtime);
if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
map2alm_iter(tinfo, map, alm, alm2, lmax, mmax, npix, spin, ntrans, niter);
DEALLOC2D(map);
DEALLOC2D(alm);
DEALLOC2D(alm2);
sharp_destroy_alm_info(alms);
}
int main(int argc, char **argv)
{
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
sharp_module_startup("sharp_test_mpi",argc,7,
"<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",
mytask==0);
int lmax=atoi(argv[2]);
int niter=atoi(argv[4]);
int spin=atoi(argv[5]);
int ntrans=atoi(argv[6]);
if (mytask==0)
{
printf("Testing map analysis accuracy.\n");
printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
}
sharp_geom_info *tinfo;
if (strcmp(argv[1],"gauss")==0)
{
int nrings=lmax+1;
int ppring=atoi(argv[3]);
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
if (mytask==0)
printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
nrings,ppring,(long)npix);
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
reduce_geom_info(tinfo);
npix=get_npix(tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else if (strcmp(argv[1],"ecp")==0)
{
int nrings=2*lmax+2;
int ppring=atoi(argv[3]);
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
if (mytask==0)
printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
nrings,ppring,(long)npix);
sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
reduce_geom_info(tinfo);
npix=get_npix(tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else if (strcmp(argv[1],"healpix")==0)
{
int nside=atoi(argv[3]);
if (nside<1) nside=1;
ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
if (mytask==0)
printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
nside,(long)npix);
sharp_make_healpix_geom_info (nside, 1, &tinfo);
reduce_geom_info(tinfo);
npix=get_npix(tinfo);
check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
sharp_destroy_geom_info(tinfo);
}
else
UTIL_FAIL("unknown grid geometry");
MPI_Finalize();
return 0;
}
#else
#include "c_utils.h"
int main(void)
{ UTIL_FAIL("MPI support not enabled."); return 1; }
#endif

View file

@ -0,0 +1,174 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/* \file sharp_vecsupport.h
* Convenience functions for vector arithmetics
*
* Copyright (C) 2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#ifndef SHARP_VECSUPPORT_H
#define SHARP_VECSUPPORT_H
#include <math.h>
#include "sharp_vecutil.h"
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
#define vadd(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b))
#define vsub(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b))
#define vmul(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b))
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a))
#define vload(a) (a)
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) (((a)<(b))?1.:0.)
#define vgt(a,b) (((a)>(b))?1.:0.)
#define vge(a,b) (((a)>=(b))?1.:0.)
#define vne(a,b) (((a)!=(b))?1.:0.)
#define vand(a,b) ((((a)*(b))!=0.)?1.:0.)
#define vor(a,b) ((((a)+(b))!=0.)?1.:0.)
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) ((a)!=0.)
#define vallTrue(a) ((a)!=0.)
#define vblend(m,a,b) (((m)!=0.) ? (a) : (b))
#define vzero 0.
#define vone 1.
#endif
#if (VLEN==2)
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
#define vadd(a,b) _mm_add_pd(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vsub(a,b) _mm_sub_pd(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vmul(a,b) _mm_mul_pd(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
#define vload(a) _mm_set1_pd(a)
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand(a,b) _mm_and_pd(a,b)
#define vor(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
#if defined(__SSE4_1__)
#define vblend(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vzero _mm_setzero_pd()
#define vone _mm_set1_pd(1.)
#endif
#if (VLEN==4)
#include <immintrin.h>
#ifdef __FMA4__
#include <x86intrin.h>
#endif
typedef __m256d Tv;
#define vadd(a,b) _mm256_add_pd(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vsub(a,b) _mm256_sub_pd(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vmul(a,b) _mm256_mul_pd(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
#ifdef __FMA4__
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a)
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand(a,b) _mm256_and_pd(a,b)
#define vor(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
#define vblend(m,a,b) _mm256_blendv_pd(b,a,m)
#define vzero _mm256_setzero_pd()
#define vone _mm256_set1_pd(1.)
#endif
#endif

43
external/sharp/libsharp/sharp_vecutil.h vendored Normal file
View file

@ -0,0 +1,43 @@
/*
* This file is part of libc_utils.
*
* libc_utils is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libc_utils is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libc_utils; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_vecutil.h
* Functionality related to vector instruction support
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef SHARP_VECUTIL_H
#define SHARP_VECUTIL_H
#if (defined (__AVX__))
#define VLEN 4
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif
#endif

230
external/sharp/libsharp/sharp_ylmgen_c.c vendored Normal file
View file

@ -0,0 +1,230 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*
* Helper code for efficient calculation of Y_lm(theta,phi=0)
*
* Copyright (C) 2005-2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#include <math.h>
#include <stdlib.h>
#include "sharp_ylmgen_c.h"
#include "c_utils.h"
static inline void normalize (double *val, int *scale, double xfmax)
{
while (fabs(*val)>xfmax) { *val*=sharp_fsmall; ++*scale; }
if (*val!=0.)
while (fabs(*val)<xfmax*sharp_fsmall) { *val*=sharp_fbig; --*scale; }
}
void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
{
const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
gen->lmax = l_max;
gen->mmax = m_max;
UTIL_ASSERT(spin>=0,"incorrect spin");
gen->s = spin;
UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
"bad value for min/maxscale");
gen->cf=RALLOC(double,sharp_maxscale-sharp_minscale+1);
gen->cf[-sharp_minscale]=1.;
for (int m=-sharp_minscale-1; m>=0; --m)
gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
gen->cf[m]=gen->cf[m-1]*sharp_fbig;
gen->m = -1;
if (spin==0)
{
gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
gen->mfac = RALLOC(double,gen->mmax+1);
gen->mfac[0] = inv_sqrt4pi;
for (int m=1; m<=gen->mmax; ++m)
gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
gen->root = RALLOC(double,2*gen->lmax+5);
gen->iroot = RALLOC(double,2*gen->lmax+5);
for (int m=0; m<2*gen->lmax+5; ++m)
{
gen->root[m] = sqrt(m);
gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
}
}
else
{
gen->m=gen->mlo=gen->mhi=-1234567890;
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
for (int m=0; m<gen->lmax+2; ++m)
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
ALLOC(gen->inv,double,gen->lmax+1);
gen->inv[0]=0;
for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
ALLOC(gen->flm1,double,2*gen->lmax+1);
ALLOC(gen->flm2,double,2*gen->lmax+1);
for (int m=0; m<2*gen->lmax+1; ++m)
{
gen->flm1[m] = sqrt(1./(m+1.));
gen->flm2[m] = sqrt(m/(m+1.));
}
ALLOC(gen->prefac,double,gen->mmax+1);
ALLOC(gen->fscale,int,gen->mmax+1);
double *fac = RALLOC(double,2*gen->lmax+1);
int *facscale = RALLOC(int,2*gen->lmax+1);
fac[0]=1; facscale[0]=0;
for (int m=1; m<2*gen->lmax+1; ++m)
{
fac[m]=fac[m-1]*sqrt(m);
facscale[m]=facscale[m-1];
normalize(&fac[m],&facscale[m],sharp_fbighalf);
}
for (int m=0; m<=gen->mmax; ++m)
{
int mlo=gen->s, mhi=m;
if (mhi<mlo) SWAP(mhi,mlo,int);
double tfac=fac[2*mhi]/fac[mhi+mlo];
int tscale=facscale[2*mhi]-facscale[mhi+mlo];
normalize(&tfac,&tscale,sharp_fbighalf);
tfac/=fac[mhi-mlo];
tscale-=facscale[mhi-mlo];
normalize(&tfac,&tscale,sharp_fbighalf);
gen->prefac[m]=tfac;
gen->fscale[m]=tscale;
}
DEALLOC(fac);
DEALLOC(facscale);
}
}
void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
{
DEALLOC(gen->cf);
if (gen->s==0)
{
DEALLOC(gen->rf);
DEALLOC(gen->mfac);
DEALLOC(gen->root);
DEALLOC(gen->iroot);
}
else
{
DEALLOC(gen->fx);
DEALLOC(gen->prefac);
DEALLOC(gen->fscale);
DEALLOC(gen->flm1);
DEALLOC(gen->flm2);
DEALLOC(gen->inv);
}
}
void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
{
if (m==gen->m) return;
UTIL_ASSERT(m>=0,"incorrect m");
gen->m = m;
if (gen->s==0)
{
gen->rf[m].f[0] = gen->root[2*m+3];
gen->rf[m].f[1] = 0.;
for (int l=m+1; l<=gen->lmax; ++l)
{
double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
gen->rf[l].f[0] = tmp*gen->root[2*l+1];
gen->rf[l].f[1] = tmp*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1];
}
}
else
{
int mlo_=m, mhi_=gen->s;
if (mhi_<mlo_) SWAP(mhi_,mlo_,int);
int ms_similar = ((gen->mhi==mhi_) && (gen->mlo==mlo_));
gen->mlo = mlo_; gen->mhi = mhi_;
if (!ms_similar)
{
for (int l=gen->mhi; l<gen->lmax; ++l)
{
double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
*gen->flm1[l+gen->s]*gen->flm1[l-gen->s];
double lt = 2*l+1;
double l1 = l+1;
gen->fx[l+1].f[0]=l1*lt*t;
gen->fx[l+1].f[1]=gen->m*gen->s*gen->inv[l]*gen->inv[l+1];
t = gen->flm2[l+gen->m]*gen->flm2[l-gen->m]
*gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
gen->fx[l+1].f[2]=t*l1*gen->inv[l];
}
}
gen->preMinus_p = gen->preMinus_m = 0;
if (gen->mhi==gen->m)
{
gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s;
gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1);
}
else
{
gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m;
gen->preMinus_m = ((gen->mhi+gen->m)&1);
}
}
}
double *sharp_Ylmgen_get_norm (int lmax, int spin)
{
const double pi = 3.141592653589793238462643383279502884197;
double *res=RALLOC(double,lmax+1);
/* sign convention for H=1 (LensPix paper) */
#if 1
double spinsign = (spin>0) ? -1.0 : 1.0;
#else
double spinsign = 1.0;
#endif
if (spin==0)
{
for (int l=0; l<=lmax; ++l)
res[l]=1.;
return res;
}
spinsign = (spin&1) ? -spinsign : spinsign;
for (int l=0; l<=lmax; ++l)
res[l] = (l<spin) ? 0. : spinsign*0.5*sqrt((2*l+1)/(4*pi));
return res;
}
double *sharp_Ylmgen_get_d1norm (int lmax)
{
const double pi = 3.141592653589793238462643383279502884197;
double *res=RALLOC(double,lmax+1);
for (int l=0; l<=lmax; ++l)
res[l] = (l<1) ? 0. : 0.5*sqrt(l*(l+1.)*(2*l+1.)/(4*pi));
return res;
}

100
external/sharp/libsharp/sharp_ylmgen_c.h vendored Normal file
View file

@ -0,0 +1,100 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_ylmgen_c.h
* Code for efficient calculation of Y_lm(phi=0,theta)
*
* Copyright (C) 2005-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef SHARP_YLMGEN_C_H
#define SHARP_YLMGEN_C_H
#ifdef __cplusplus
extern "C" {
#endif
enum { sharp_minscale=0, sharp_limscale=1, sharp_maxscale=1 };
static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800;
static const double sharp_ftol=0x1p-60;
static const double sharp_fbighalf=0x1p+400;
typedef struct { double f[2]; } sharp_ylmgen_dbl2;
typedef struct { double f[3]; } sharp_ylmgen_dbl3;
typedef struct
{
/* for public use; immutable during lifetime */
int lmax, mmax, s;
double *cf;
/* for public use; will typically change after call to Ylmgen_prepare() */
int m;
/* used if s==0 */
double *mfac;
sharp_ylmgen_dbl2 *rf;
/* used if s!=0 */
int sinPow, cosPow, preMinus_p, preMinus_m;
double *prefac;
int *fscale;
sharp_ylmgen_dbl3 *fx;
/* internal usage only */
/* used if s==0 */
double *root, *iroot;
/* used if s!=0 */
double *flm1, *flm2, *inv;
int mlo, mhi;
} sharp_Ylmgen_C;
/*! Creates a generator which will calculate helper data for Y_lm calculation
up to \a l=l_max and \a m=m_max. */
void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin);
/*! Deallocates a generator previously initialised by Ylmgen_init(). */
void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen);
/*! Prepares the object for the calculation at \a m. */
void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m);
/*! Returns a pointer to an array with \a lmax+1 entries containing
normalisation factors that must be applied to Y_lm values computed for
\a spin. The array must be deallocated (using free()) by the user. */
double *sharp_Ylmgen_get_norm (int lmax, int spin);
/*! Returns a pointer to an array with \a lmax+1 entries containing
normalisation factors that must be applied to Y_lm values computed for
first derivatives. The array must be deallocated (using free()) by the
user. */
double *sharp_Ylmgen_get_d1norm (int lmax);
#ifdef __cplusplus
}
#endif
#endif