initial import

This commit is contained in:
Martin Reinecke 2012-06-27 16:44:31 +02:00
commit 2942428e92
53 changed files with 9133 additions and 0 deletions

131
libsharp/complex_hacks.h Normal file
View file

@ -0,0 +1,131 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file complex_hacks.h
* support for converting vector types and complex numbers
*
* Copyright (C) 2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#ifndef COMPLEX_HACKS_H
#define COMPLEX_HACKS_H
#include <math.h>
#include <complex.h>
#include "vecsupport.h"
#define UNSAFE_CODE
#if (VLEN==1)
/* Scalar (VLEN==1) variant: builds the complex number with real part a and
   imaginary part b. */
static inline complex double vhsum_cmplx(Tv a, Tv b)
{
  complex double result = a+_Complex_I*b;
  return result;
}
/* Scalar (VLEN==1) variant: adds a+i*b to *c1 and c+i*d to *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
{
  *c1 += a+_Complex_I*b;
  *c2 += c+_Complex_I*d;
}
#endif
#if (VLEN==2)
/* Two-lane (VLEN==2) variant: returns the complex number whose real part is
   the sum of the lanes of a and whose imaginary part is the sum of the lanes
   of b. */
static inline complex double vhsum_cmplx (Tv a, Tv b)
{
#if defined(__SSE3__)
/* horizontal add: tmp = (a0+a1, b0+b1) */
Tv tmp = _mm_hadd_pd(a,b);
#else
/* SSE2 fallback: (a1,b0) + (a0,b1) gives the same (a0+a1, b0+b1) */
Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
#endif
/* reinterpret the two doubles of the vector as (real, imaginary) */
union {Tv v; complex double c; } u;
u.v=tmp; return u.c;
}
/* Two-lane (VLEN==2) variant: adds (sum of lanes of a) + i*(sum of lanes of b)
   to *c1 and (sum of lanes of c) + i*(sum of lanes of d) to *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
Tv d, complex double * restrict c1, complex double * restrict c2)
{
#ifdef UNSAFE_CODE
/* The accumulators are accessed directly as __m128d vectors.
   NOTE(review): this presumably requires c1 and c2 to be suitably aligned
   for __m128d accesses and sidesteps the aliasing rules - confirm. */
#if defined(__SSE3__)
vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
#else
/* SSE2 fallback for the horizontal add: (x1,y0) + (x0,y1) */
vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
#endif
#else
/* Portable path: convert the vectors to complex numbers through a union and
   accumulate with ordinary complex arithmetic. */
union {Tv v; complex double c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
#endif
*c1+=u1.c; *c2+=u2.c;
#endif
}
#endif
#if (VLEN==4)
/* Four-lane (VLEN==4, AVX) variant: returns the complex number whose real
   part is the sum of the lanes of a and whose imaginary part is the sum of
   the lanes of b. */
static inline complex double vhsum_cmplx (Tv a, Tv b)
{
/* tmp = (a0+a1, b0+b1, a2+a3, b2+b3) */
Tv tmp=_mm256_hadd_pd(a,b);
/* tmp2 = tmp with its two 128-bit halves swapped */
Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
/* lower half of tmp now holds (sum of a, sum of b) */
tmp=_mm256_add_pd(tmp,tmp2);
#ifdef UNSAFE_CODE
complex double ret;
/* store the lower 128 bits straight into the complex result.
   NOTE(review): relies on ret being accessible as a __m128d - confirm
   alignment/aliasing assumptions. */
*((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
return ret;
#else
/* portable path: read the lower two doubles through a union */
union {Tv v; complex double c[2]; } u;
u.v=tmp; return u.c[0];
#endif
}
/* Four-lane (VLEN==4, AVX) variant: adds (sum of lanes of a) + i*(sum of
   lanes of b) to *c1 and (sum of lanes of c) + i*(sum of lanes of d)
   to *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
complex double * restrict c1, complex double * restrict c2)
{
/* tmp1 = (a0+a1, b0+b1, a2+a3, b2+b3), tmp2 likewise for c and d */
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
/* tmp3 = (upper half of tmp1, upper half of tmp2),
   tmp4 = (lower half of tmp1, lower half of tmp2) */
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
/* tmp1 = (sum of a, sum of b, sum of c, sum of d) */
tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
/* accumulate each 128-bit half directly into the corresponding complex.
   NOTE(review): presumably requires c1 and c2 to be suitably aligned for
   __m128d accesses - confirm. */
*((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
*((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
#else
/* portable path: view the four doubles as two complex numbers */
union {Tv v; complex double c[2]; } u;
u.v=tmp1;
*c1+=u.c[0]; *c2+=u.c[1];
#endif
}
#endif
#endif

94
libsharp/libsharp.dox Normal file
View file

@ -0,0 +1,94 @@
/*! \mainpage libsharp documentation
<ul>
<li>\ref introduction "Introduction"
<li><a href="modules.html">Programming interface</a>
</ul>
*/
/*! \page introduction Introduction to libsharp
"SHARP" is an acronym for <i>Performant Spherical Harmonic Transforms</i>.
All user-visible data types and functions in this library start with
the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and
double precision variants, respectively.
<i>libsharp</i>'s main functionality is the conversion between <i>maps</i>
on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>).
A map is defined as a set of <i>rings</i>, which in turn consist of
individual pixels that
<ul>
<li>all have the same colatitude and</li>
<li>are uniformly spaced in azimuthal direction.</li>
</ul>
Consequently, a ring is completely defined by
<ul>
<li>its colatitude (in radians)</li>
<li>the number of pixels it contains</li>
<li>the azimuth (in radians) of the first pixel in the ring</li>
<li>the weight by which every pixel must be multiplied during a map
analysis (typically the solid angle of a pixel in the ring) </li>
<li>the offset of the first ring pixel in the <i>map array</i></li>
<li>the stride between consecutive pixels in the ring.</li>
</ul>
The map array is a one-dimensional array of type <i>float</i> or
<i>double</i>, which contains the values of all map pixels. It is assumed
that the pixels of every ring are stored inside this array in order of
increasing azimuth and with the specified stride. Note however that the rings
themselves can be stored in any order inside the array.
The a_lm array is a one-dimensional array of type <i>complex float</i> or
<i>complex double</i>, which contains all spherical harmonic coefficients
for a full or partial set of m quantum numbers with 0<=m<=mmax and m<=l<=lmax.
There is only one constraint on the internal structure of the array, which is:
<code>Index[a_l+1,m] = Index[a_l,m] + stride</code>
That means that coefficients with identical <i>m</i> but different <i>l</i>
can be interpreted as a one-dimensional array in <i>l</i> with a unique
stride.
Several functions are provided for efficient index computation in this array;
they are documented \ref almgroup "here".
Information about a pixelisation of the sphere is stored in objects of
type sharp_geom_info. It is possible to create such an object for any
supported pixelisation by using the function sharp_make_geom_info();
however, several easier-to-use functions are \ref geominfogroup "supplied"
for generating often-used pixelisations like ECP grids, Gaussian grids,
and Healpix grids.
Currently, SHARP supports the following kinds of transforms:
<ul>
<li>scalar a_lm to map</li>
<li>scalar map to a_lm</li>
<!-- <li>polarised a_lm to map</li>
<li>polarised map to a_lm</li> -->
<li>spin a_lm to map</li>
<li>spin map to a_lm</li>
<!-- <li>scalar a_lm to maps of first derivatives</li> -->
</ul>
SHARP supports shared-memory parallelisation via OpenMP; this feature will
be automatically enabled if the compiler supports it.
SHARP will also make use of SSE2 and AVX instructions when compiled for a
platform known to support them.
Support for MPI-parallel transforms is also available; in this mode,
every MPI task must provide a unique subset of the map and a_lm coefficients.
The spherical harmonic transforms can be executed on double-precision and
single-precision maps and a_lm, but for accuracy reasons the computations
will always be performed in double precision. As a consequence,
single-precision transforms will most likely not be faster than their
double-precision counterparts, but they will require significantly less
memory.
Two example and benchmark programs are distributed with SHARP:
<ul>
<li>sharp_test.c checks the accuracy of the (iterative) map analysis
algorithm</li>
<li>sharp_bench.c determines the quickest transform strategy for a given
SHT</li>
</ul>
*/

9
libsharp/oracle.inc Normal file
View file

@ -0,0 +1,9 @@
/* Lookup table used by sharp_nv_oracle(), which includes this file inside its
   function body and evaluates nv_opt[min(ntrans,maxtr)-1][spin!=0][type]. */
/* Number of rows in nv_opt; larger ntrans values are clamped to this. */
static const int maxtr = 6;
/* First index: number of simultaneous transforms minus one.
   Second index: 0 for spin==0, 1 for spin!=0.
   Third index: the sharp_jobtype value (MAP2ALM, ALM2MAP, ALM2MAP_DERIV1);
   the -1 entries belong to ALM2MAP_DERIV1, which sharp_nv_oracle() rejects. */
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,-1}},
{{4,2,-1},{2,1,-1}},
{{5,2,-1},{5,2,-1}},
{{5,2,-1},{5,2,-1}},
{{5,2,-1},{5,2,-1}},
{{5,2,-1},{5,2,-1}}
};

29
libsharp/planck.make Normal file
View file

@ -0,0 +1,29 @@
# Build description fragment for the libsharp package.
# NOTE(review): SRCROOT, BLDROOT, LIBDIR, BINDIR, FULL_INCLUDE, the
# HDR_*/LIB_* variables of libfftpack and c_utils, the $(OD)_mkdir target and
# the all_* lists are not defined here - presumably they come from the
# including top-level makefile; confirm.
PKG:=libsharp
# source and object directories of this package
SD:=$(SRCROOT)/$(PKG)
OD:=$(BLDROOT)/$(PKG)
FULL_INCLUDE+= -I$(SD)
HDR_$(PKG):=$(SD)/*.h
LIB_$(PKG):=$(LIBDIR)/libsharp.a
# executables built from this package
BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench
# objects that go into the library, and all objects including the programs
LIBOBJ:=ylmgen_c.o sharp.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o
# prefix the object names with the object directory
LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
# every object depends on the headers of this package, libfftpack and c_utils
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
# extra dependencies on the source fragments listed as prerequisites here
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc
# libraries every executable depends on
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
$(LIB_$(PKG)): $(LIBOBJ)
$(ALLOBJ): $(ODEP) | $(OD)_mkdir
# prefix the executable names with the binary directory
BIN:=$(BIN:%=$(BINDIR)/%)
$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP)
# register headers, library and executables in the global lists
all_hdr+=$(HDR_$(PKG))
all_lib+=$(LIB_$(PKG))
all_cbin+=$(BIN)

596
libsharp/sharp.c Normal file
View file

@ -0,0 +1,596 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp.c
* Spherical transform library
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#include <math.h>
#include "ls_fft.h"
#include "ylmgen_c.h"
#include "sharp.h"
#include "c_utils.h"
#include "sharp_core.h"
#include "vec_utils.h"
#include "walltime_c.h"
typedef complex double dcmplx;
typedef complex float fcmplx;
/* Determines how ndata work items are split into chunks.
   The chunk size is at least 500 and large enough that at most 10 chunks are
   needed; it is then rounded up to the next multiple of nmult.
   On return *chunksize holds the chunk size and *nchunks the number of
   chunks required to cover ndata items. */
static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
{
  enum { min_chunk_size=500, max_chunks=10 };

  int size = IMAX(min_chunk_size,(ndata+max_chunks-1)/max_chunks);
  /* round up to a multiple of nmult */
  size = ((size+nmult-1)/nmult)*nmult;

  *chunksize = size;
  *nchunks = (ndata+size-1)/size;
}
/* Sort helper: a key value together with the original position of the
   entry it belongs to. */
typedef struct
{
  double s; /* sort key */
  int i;    /* index of the entry before sorting */
} idxhelper;

/* qsort() comparison function for idxhelper entries; orders them by
   decreasing key s. */
static int idx_compare (const void *xa, const void *xb)
{
  const idxhelper *lhs=xa;
  const idxhelper *rhs=xb;

  if (lhs->s > rhs->s)
    return -1;
  if (lhs->s < rhs->s)
    return 1;
  return 0;
}
/* Per-thread scratch state for converting between ring pixels and phases.
   Buffers and the FFT plan are cached between rings and only rebuilt by
   ringhelper_update() when the ring parameters change. */
typedef struct
{
/* phi0 value for which shiftarr was last computed */
double phi0_;
/* shiftarr[m] = cos(m*phi0_) + i*sin(m*phi0_); work is the FFT buffer */
dcmplx *shiftarr, *work;
/* s_shift: number of entries in shiftarr; s_work: size bookkeeping for work
   (maintained by the GROW macro - presumably the allocated length; confirm) */
int s_shift, s_work;
/* real FFT plan for the current ring length, or NULL if none exists yet */
real_plan plan;
/* nonzero if |phi0| < 1e-14, i.e. no phase shift needs to be applied */
int norot;
} ringhelper;
/* Puts *self into the empty state: no buffers, no FFT plan, all sizes and
   flags zero. Does not release anything self may have owned before. */
static void ringhelper_init (ringhelper *self)
{
  self->phi0_ = 0;
  self->shiftarr = NULL;
  self->work = NULL;
  self->s_shift = 0;
  self->s_work = 0;
  self->plan = NULL;
  self->norot = 0;
}
/* Releases the FFT plan (if any) and both buffers held by *self, then resets
   it to the empty state so it can be reused. */
static void ringhelper_destroy (ringhelper *self)
{
  if (self->plan!=NULL)
    kill_real_plan(self->plan);

  DEALLOC(self->work);
  DEALLOC(self->shiftarr);

  ringhelper_init(self);
}
/* Prepares *self for a ring with nph pixels, maximum m value mmax and
   azimuthal offset phi0:
   - sets the norot flag if phi0 is (almost) zero;
   - otherwise recomputes the phase shift table when mmax or phi0 differ
     from the cached values;
   - (re)creates the FFT plan if none exists or its length is not nph;
   - makes sure the work buffer can hold nph entries. */
static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{
  self->norot = (fabs(phi0)<1e-14);

  if ((!self->norot)
      && ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12))))
  {
    RESIZE (self->shiftarr,dcmplx,mmax+1);
    self->s_shift = mmax+1;
    self->phi0_ = phi0;
    for (int m=0; m<=mmax; ++m)
    {
      double angle = m*phi0;
      self->shiftarr[m] = cos(angle) + _Complex_I*sin(angle);
    }
  }

  if (self->plan==NULL)
    self->plan = make_real_plan(nph);
  if ((int)self->plan->length!=nph)
  {
    kill_real_plan(self->plan);
    self->plan = make_real_plan(nph);
  }

  GROW(self->work,dcmplx,self->s_work,nph);
}
/* qsort() comparison function for sharp_ringinfo entries; orders them by
   increasing sth. */
static int ringinfo_compare (const void *xa, const void *xb)
{
  const sharp_ringinfo *lhs=xa;
  const sharp_ringinfo *rhs=xb;

  if (lhs->sth < rhs->sth)
    return -1;
  if (lhs->sth > rhs->sth)
    return 1;
  return 0;
}
static int ringpair_compare (const void *xa, const void *xb)
{
const sharp_ringpair *a=xa, *b=xb;
if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 : (a->r1.phi0 > b->r1.phi0) ? 1 : 0;
return (a->r1.nph<b->r1.nph) ? -1 : 1;
}
/* Allocates a new sharp_alm_info, fills it with lmax, nm and stride and with
   private copies of the nm entries of mval and mstart, and stores the
   pointer to it in *alm_info. The object must be released with
   sharp_destroy_alm_info(). */
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
{
  sharp_alm_info *result = RALLOC(sharp_alm_info,1);

  result->lmax = lmax;
  result->nm = nm;
  result->mval = RALLOC(int,nm);
  result->mvstart = RALLOC(ptrdiff_t,nm);
  result->stride = stride;

  for (int k=0; k<nm; ++k)
  {
    result->mval[k] = mval[k];
    result->mvstart[k] = mstart[k];
  }

  *alm_info = result;
}
/* Convenience wrapper around sharp_make_general_alm_info() for the case that
   all m values 0..mmax are present in increasing order. */
void sharp_make_alm_info (int lmax, int mmax, int stride,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
{
  int nm = mmax+1;
  int *all_m = RALLOC(int,nm);

  for (int m=0; m<nm; ++m)
    all_m[m] = m;

  /* the callee copies the array, so the temporary can be freed afterwards */
  sharp_make_general_alm_info (lmax, nm, stride, all_m, mstart, alm_info);
  DEALLOC(all_m);
}
/* Returns the array index of the coefficient with quantum number l belonging
   to the mi-th m value: the start index of that m plus l times the stride. */
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi)
{
  ptrdiff_t offset = self->stride*l;
  return self->mvstart[mi]+offset;
}
/* Releases the m-value and start-index arrays owned by info and then the
   info object itself. */
void sharp_destroy_alm_info (sharp_alm_info *info)
{
  /* member arrays first, the containing object last */
  DEALLOC (info->mvstart);
  DEALLOC (info->mval);

  DEALLOC (info);
}
/* Builds a sharp_geom_info from nrings ring descriptions and stores the
   pointer to it in *geom_info.
   The rings are sorted by sin(theta); two neighbouring rings in that order
   whose cos(theta) values are (approximately) opposite are combined into one
   pair. A ring without such a partner forms a pair whose second ring has
   nph set to -1. Finally the pairs are sorted with ringpair_compare(). */
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
  const int *stride, const double *phi0, const double *theta,
  const double *weight, sharp_geom_info **geom_info)
{
  sharp_geom_info *geom = RALLOC(sharp_geom_info,1);
  sharp_ringinfo *rings = RALLOC(sharp_ringinfo,nrings);

  /* nrings is an upper limit for the number of pairs */
  geom->pair = RALLOC(sharp_ringpair,nrings);
  geom->npairs = 0;
  *geom_info = geom;

  for (int ir=0; ir<nrings; ++ir)
  {
    sharp_ringinfo *ring = &rings[ir];
    ring->theta = theta[ir];
    ring->cth = cos(theta[ir]);
    ring->sth = sin(theta[ir]);
    ring->weight = weight[ir];
    ring->phi0 = phi0[ir];
    ring->ofs = ofs[ir];
    ring->stride = stride[ir];
    ring->nph = nph[ir];
  }
  qsort(rings,nrings,sizeof(sharp_ringinfo),ringinfo_compare);

  for (int pos=0; pos<nrings; ++pos)
  {
    sharp_ringpair *cur = &geom->pair[geom->npairs++];
    cur->r1 = rings[pos];
    if ((pos<nrings-1) && FAPPROX(rings[pos].cth,-rings[pos+1].cth,1e-12))
      cur->r2 = rings[++pos]; /* the partner ring is consumed as well */
    else
      cur->r2.nph = -1;       /* marks "no second ring" */
  }
  DEALLOC(rings);

  qsort(geom->pair,geom->npairs,sizeof(sharp_ringpair),ringpair_compare);
}
/* Releases the ring pair array owned by geom_info and then the object
   itself. */
void sharp_destroy_geom_info (sharp_geom_info *geom_info)
{
  /* member array first, the containing object last */
  DEALLOC (geom_info->pair);

  DEALLOC (geom_info);
}
/* Checks that the nm entries of mval are a permutation of 0..nm-1 (every
   value in range and none occurring twice; violations trigger an assertion
   failure) and returns nm-1.
   Under this restriction nm-1 is the largest m value. If gaps in the m
   values were ever to be allowed, the maximum would have to be searched
   for instead. */
static int sharp_get_mmax (int *mval, int nm)
{
  int *seen = RALLOC(int,nm);
  SET_ARRAY(seen,0,nm,0);

  for (int k=0; k<nm; ++k)
  {
    int m = mval[k];
    UTIL_ASSERT((m>=0) && (m<nm), "m out of range");
    UTIL_ASSERT(seen[m]==0, "duplicate m value");
    seen[m] = 1;
  }

  DEALLOC(seen);
  return nm-1;
}
/* Converts the phases of one ring into pixel values and adds them to the map.
   self     scratch state (buffers and FFT plan), updated for this ring
   info     description of the ring (nph, stride, ofs, phi0)
   data     map array of doubles or floats, as selected by fde
   mmax     maximum m value; phases for m=0..mmax are read
   phase    phase for m is read from phase[m*pstride]
   pstride  distance between phases of consecutive m
   fde      DOUBLE if data points to doubles, otherwise floats
   The phases (multiplied by the phi0 shift factors unless norot is set) are
   folded into a buffer of nph entries, transformed with
   real_plan_backward_c() and the real parts are added to the ring pixels. */
static void ringhelper_phase2ring (ringhelper *self,
  const sharp_ringinfo *info, void *data, int mmax, const dcmplx *phase,
  int pstride, sharp_fde fde)
{
  int nph = info->nph;
  int stride = info->stride;

  ringhelper_update (self, nph, mmax, info->phi0);
  self->work[0]=phase[0];
  SET_ARRAY(self->work,1,nph,0.);

  /* Inside the loop idx1 equals m%nph and idx2 equals nph-1-((m-1)%nph);
     both are updated incrementally instead of being recomputed.
     For nph==1 the start value of idx1 must therefore be 0 (=1%1);
     starting at 1 would access work[1], which is outside the nph entries
     prepared for this ring. */
  int idx1=(nph>1) ? 1 : 0, idx2=nph-1;
  for (int m=1; m<=mmax; ++m)
  {
    dcmplx tmp = phase[m*pstride];
    if(!self->norot) tmp*=self->shiftarr[m];
    self->work[idx1]+=tmp;
    self->work[idx2]+=conj(tmp);
    if (++idx1>=nph) idx1=0;
    if (--idx2<0) idx2=nph-1;
  }

  real_plan_backward_c (self->plan, (double *)(self->work));

  /* results are accumulated, not assigned */
  if (fde==DOUBLE)
    for (int m=0; m<nph; ++m)
      ((double *)data)[m*stride+info->ofs] += creal(self->work[m]);
  else
    for (int m=0; m<nph; ++m)
      ((float *)data)[m*stride+info->ofs] += (float)creal(self->work[m]);
}
/* Converts the pixels of one ring into phases.
   The ring pixels are read from data (doubles or floats, as selected by
   fde), multiplied by the ring weight, transformed with
   real_plan_forward_c() and stored in phase[m*pstride] for m=0..maxidx,
   multiplied by the shift factor for -phi0 unless norot is set. Entry m is
   taken from position m%nph of the transformed buffer. Phases for
   m=maxidx+1..mmax are set to zero. */
static void ringhelper_ring2phase (ringhelper *self,
  const sharp_ringinfo *info, const void *data, int mmax, dcmplx *phase,
  int pstride, sharp_fde fde)
{
  int nph = info->nph;
#if 1
  int maxidx = mmax; /* Enable this for traditional Healpix compatibility */
#else
  int maxidx = IMIN(nph-1,mmax);
#endif

  ringhelper_update (self, nph, mmax, -info->phi0);

  /* copy the weighted ring pixels into the work buffer */
  if (fde==DOUBLE)
  {
    const double *map = data;
    for (int ip=0; ip<nph; ++ip)
      self->work[ip] = map[info->ofs+ip*info->stride]*info->weight;
  }
  else
  {
    const float *map = data;
    for (int ip=0; ip<nph; ++ip)
      self->work[ip] = map[info->ofs+ip*info->stride]*info->weight;
  }

  real_plan_forward_c (self->plan, (double *)self->work);

  for (int m=0; m<=maxidx; ++m)
  {
    dcmplx val = self->work[m%nph];
    if (!self->norot)
      val *= self->shiftarr[m];
    phase[m*pstride] = val;
  }
  for (int m=maxidx+1; m<=mmax; ++m)
    phase[m*pstride] = 0.;
}
/* Computes the phases for both rings of a pair: the first ring goes to
   phase1; the second ring goes to phase2, but only if it exists (nph>0). */
static void ringhelper_pair2phase (ringhelper *self, int mmax,
  const sharp_ringpair *pair, const void *data, dcmplx *phase1, dcmplx *phase2,
  int pstride, sharp_fde fde)
{
  const sharp_ringinfo *first = &pair->r1;
  const sharp_ringinfo *second = &pair->r2;

  ringhelper_ring2phase (self, first, data, mmax, phase1, pstride, fde);
  if (second->nph>0)
  {
    ringhelper_ring2phase (self, second, data, mmax, phase2, pstride, fde);
  }
}
/* Adds the map contributions for both rings of a pair: phase1 is used for
   the first ring; phase2 is used for the second ring, but only if it exists
   (nph>0). */
static void ringhelper_phase2pair (ringhelper *self, int mmax,
  const dcmplx *phase1, const dcmplx *phase2, int pstride,
  const sharp_ringpair *pair, void *data, sharp_fde fde)
{
  const sharp_ringinfo *first = &pair->r1;
  const sharp_ringinfo *second = &pair->r2;

  ringhelper_phase2ring (self, first, data, mmax, phase1, pstride, fde);
  if (second->nph>0)
  {
    ringhelper_phase2ring (self, second, data, mmax, phase2, pstride, fde);
  }
}
/* Sets every pixel of every ring described by ginfo to value. map is
   treated as an array of doubles if fde is DOUBLE and as an array of floats
   otherwise. A missing second ring has nph<=0 and is skipped automatically
   because its pixel loop does not execute. */
static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
  sharp_fde fde)
{
  for (int ip=0; ip<ginfo->npairs; ++ip)
  {
    const sharp_ringinfo *rings[2] =
      { &ginfo->pair[ip].r1, &ginfo->pair[ip].r2 };

    for (int ir=0; ir<2; ++ir)
    {
      const sharp_ringinfo *ring = rings[ir];
      /* ofs and stride are only read for rings that have pixels */
      for (int ipix=0; ipix<ring->nph; ++ipix)
      {
        ptrdiff_t pos = ring->ofs+ipix*ring->stride;
        if (fde==DOUBLE)
          ((double *)map)[pos] = value;
        else
          ((float *)map)[pos] = (float)value;
      }
    }
  }
}
/* Sets all coefficients described by ainfo (for every stored m, all l from
   m up to lmax) to value. alm is treated as an array of double-precision
   complex numbers if fde is DOUBLE and of single-precision ones otherwise. */
static void fill_alm (const sharp_alm_info *ainfo, void *alm, dcmplx value,
  sharp_fde fde)
{
  for (int mi=0; mi<ainfo->nm; ++mi)
  {
    for (int l=ainfo->mval[mi]; l<=ainfo->lmax; ++l)
    {
      ptrdiff_t pos = sharp_alm_index(ainfo,l,mi);
      if (fde==DOUBLE)
        ((dcmplx *)alm)[pos] = value;
      else
        ((fcmplx *)alm)[pos] = (fcmplx)value;
    }
  }
}
/* Zeroes the output arrays of the job unless add_output is set: the a_lm
   arrays for a MAP2ALM job, the maps for all other job types. */
static void init_output (sharp_job *job)
{
  if (job->add_output)
    return;

  if (job->type==MAP2ALM)
  {
    int nout = job->ntrans*job->nalm;
    for (int k=0; k<nout; ++k)
      fill_alm (job->ainfo,job->alm[k],0.,job->fde);
  }
  else
  {
    int nout = job->ntrans*job->nmaps;
    for (int k=0; k<nout; ++k)
      fill_map (job->ginfo,job->map[k],0.,job->fde);
  }
}
/* Allocates job->phase with room for 2*ntrans*nmaps phases for each of the
   nm m values and ntheta ring pairs. */
static void alloc_phase (sharp_job *job, int nm, int ntheta)
{
  job->phase = RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
}
/* Releases the phase array of the job. */
static void dealloc_phase (sharp_job *job)
{
  DEALLOC(job->phase);
}
//FIXME: set phase to zero if not MAP2ALM?
/* For MAP2ALM jobs: computes the phases of the ring pairs llim..ulim-1 from
   all input maps and stores them in job->phase. Does nothing for other job
   types. The ring pairs are distributed over the OpenMP threads; every
   thread uses its own ringhelper. */
static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{
  if (job->type != MAP2ALM) return;

  int ncomp = job->ntrans*job->nmaps;
  int pstride = 2*ncomp;

#pragma omp parallel
{
  ringhelper helper;
  ringhelper_init(&helper);
#pragma omp for schedule(dynamic,1)
  for (int ith=llim; ith<ulim; ++ith)
  {
    /* start of the phases belonging to this ring pair */
    int base = pstride*(ith-llim)*(mmax+1);
    const sharp_ringpair *pair = &job->ginfo->pair[ith];
    for (int k=0; k<ncomp; ++k)
      ringhelper_pair2phase(&helper,mmax,pair,job->map[k],
        &job->phase[base+2*k],&job->phase[base+2*k+1],pstride,job->fde);
  }
  ringhelper_destroy(&helper);
} /* end of parallel region */
}
/* Allocates job->almtmp with ntrans*nalm entries for each l in 0..lmax. */
static void alloc_almtmp (sharp_job *job, int lmax)
{
  job->almtmp = RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1));
}
/* Releases the temporary a_lm array of the job. */
static void dealloc_almtmp (sharp_job *job)
{
  DEALLOC(job->almtmp);
}
/* Prepares job->almtmp for the mi-th m value.
   For MAP2ALM jobs the entries for l=m..lmax are set to zero.
   For all other job types the input a_lm for l=m..lmax are copied into
   almtmp (entry ntrans*nalm*l+i for component i), multiplied by a factor
   depending on l: norm_l[l] for ALM2MAP jobs,
   -|norm_l[l]|*sqrt(l*(l+1)) otherwise. */
static void alm2almtmp (sharp_job *job, int lmax, int mi)
{
  int ncomp = job->ntrans*job->nalm;
  int mstart = job->ainfo->mval[mi];

  if (job->type==MAP2ALM)
  {
    SET_ARRAY(job->almtmp,ncomp*mstart,ncomp*(lmax+1),0.);
    return;
  }

  for (int l=mstart; l<=lmax; ++l)
  {
    ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
    double fct = (job->type==ALM2MAP) ? job->norm_l[l] :
                 -fabs(job->norm_l[l])*sqrt(l*(l+1.));
    dcmplx *dst = &job->almtmp[ncomp*l];

    for (int k=0; k<ncomp; ++k)
    {
      if (job->fde==DOUBLE)
        dst[k] = ((dcmplx *)job->alm[k])[aidx]*fct;
      else
        dst[k] = ((fcmplx *)job->alm[k])[aidx]*fct;
    }
  }
}
/* For MAP2ALM jobs: adds the temporary coefficients of the mi-th m value,
   multiplied by norm_l[l], to the output a_lm arrays for l=m..lmax.
   Does nothing for other job types. */
static void almtmp2alm (sharp_job *job, int lmax, int mi)
{
  if (job->type != MAP2ALM) return;

  int ncomp = job->ntrans*job->nalm;

  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)
  {
    ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
    const dcmplx *src = &job->almtmp[ncomp*l];

    for (int k=0; k<ncomp; ++k)
    {
      if (job->fde==DOUBLE)
        ((dcmplx *)job->alm[k])[aidx] += src[k]*job->norm_l[l];
      else
        ((fcmplx *)job->alm[k])[aidx] += (fcmplx)(src[k]*job->norm_l[l]);
    }
  }
}
/* For all job types except MAP2ALM: converts the phases of the ring pairs
   llim..ulim-1 stored in job->phase into pixel values and adds them to the
   output maps. Does nothing for MAP2ALM jobs. The ring pairs are distributed
   over the OpenMP threads; every thread uses its own ringhelper. */
static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{
  if (job->type == MAP2ALM) return;

  int ncomp = job->ntrans*job->nmaps;
  int pstride = 2*ncomp;

#pragma omp parallel
{
  ringhelper helper;
  ringhelper_init(&helper);
#pragma omp for schedule(dynamic,1)
  for (int ith=llim; ith<ulim; ++ith)
  {
    /* start of the phases belonging to this ring pair */
    int base = pstride*(ith-llim)*(mmax+1);
    const sharp_ringpair *pair = &job->ginfo->pair[ith];
    for (int k=0; k<ncomp; ++k)
      ringhelper_phase2pair(&helper,mmax,&job->phase[base+2*k],
        &job->phase[base+2*k+1],pstride,pair,job->map[k],job->fde);
  }
  ringhelper_destroy(&helper);
} /* end of parallel region */
}
/* Executes the transform described by job.
   The ring pairs of the geometry are processed in chunks. For every chunk
   the map is converted to phases (MAP2ALM only), the m values are processed
   in parallel by inner_loop(), and the phases are converted back to map
   pixels (all other job types). On return job->time holds the difference of
   the wallTime() values taken at exit and entry, and job->opcnt the sum of
   the opcnt values accumulated by all threads. */
void sharp_execute_job (sharp_job *job)
{
double timer=wallTime();
job->opcnt=0;
int lmax = job->ainfo->lmax,
mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
job->norm_l = Ylmgen_get_norm (lmax, job->spin);
/* clear output arrays if requested */
init_output (job);
int nchunks, chunksize;
/* chunk size is a multiple of nv*VLEN */
get_chunk_info(job->ginfo->npairs,job->nv*VLEN,&nchunks,&chunksize);
/* the phase array is sized for one chunk and reused for every chunk */
alloc_phase (job,mmax+1,chunksize);
/* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk)
{
int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs);
/* per-chunk tables: does the pair have a second ring, cos(theta) and
   sin(theta) of the first ring */
int *ispair = RALLOC(int,ulim-llim);
double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim);
idxhelper *stmp = RALLOC(idxhelper,ulim-llim);
for (int i=0; i<ulim-llim; ++i)
{
ispair[i] = job->ginfo->pair[i+llim].r2.nph>0;
cth[i] = job->ginfo->pair[i+llim].r1.cth;
sth[i] = job->ginfo->pair[i+llim].r1.sth;
stmp[i].s=sth[i];
stmp[i].i=i;
}
/* idx: chunk-local pair indices ordered by decreasing sin(theta) */
qsort (stmp,ulim-llim,sizeof(idxhelper),idx_compare);
int *idx = RALLOC(int,ulim-llim);
for (int i=0; i<ulim-llim; ++i)
idx[i]=stmp[i].i;
DEALLOC(stmp);
/* map->phase where necessary */
map2phase (job, mmax, llim, ulim);
#pragma omp parallel
{
/* every thread works on its own copy of the job structure with a private
   almtmp buffer, operation counter and Ylm generator; the pointers to maps,
   a_lm and phases are shared with the original job */
sharp_job ljob = *job;
ljob.opcnt=0;
Ylmgen_C generator;
Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, idx);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
/* merge the per-thread operation count into the caller's job */
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
/* phase->map where necessary */
phase2map (job, mmax, llim, ulim);
DEALLOC(ispair);
DEALLOC(cth);
DEALLOC(sth);
DEALLOC(idx);
} /* end of chunk loop */
DEALLOC(job->norm_l);
dealloc_phase (job);
job->time=wallTime()-timer;
}
/* Fills in all members of *job that do not depend on the precision of the
   user data. Assertion failures are raised unless ntrans>0, 0<=spin<=30 and
   type is MAP2ALM or ALM2MAP. The number of maps and a_lm sets per transform
   is 2 for spin>0 and 1 for spin 0; nv is obtained from sharp_nv_oracle().
   The members map, alm and fde are left for the caller to set. */
static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin,
  int add_output, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int ntrans)
{
  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
  UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
  UTIL_ASSERT((type==MAP2ALM)||(type==ALM2MAP), "unsupported SHT type");

  /* number of components per transform for the regular job types */
  int ncomp = (spin>0) ? 2 : 1;

  job->type = type;
  job->spin = spin;
  job->ntrans = ntrans;
  job->add_output = add_output;
  job->ginfo = geom_info;
  job->ainfo = alm_info;

  job->nmaps = (type==ALM2MAP_DERIV1) ? 2 : ncomp;
  job->nalm = (type==ALM2MAP_DERIV1) ? 1 : ncomp;
  job->nv = sharp_nv_oracle (type, spin, ntrans);

  job->norm_l = NULL;
  job->time = 0.;
  job->opcnt = 0;
}
/* Sets up *job for a transform on double-precision maps and a_lm. Only the
   pointers alm and map are stored; the data are not copied. */
void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin,
  int add_output, dcmplx **alm, double **map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int ntrans)
{
  sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info,
    ntrans);

  job->fde = DOUBLE;
  job->map = (void **)map;
  job->alm = (void **)alm;
}
/* Sets up *job for a transform on single-precision maps and a_lm. Only the
   pointers alm and map are stored; the data are not copied. */
void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
  int add_output, fcmplx **alm, float **map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int ntrans)
{
  sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info,
    ntrans);

  job->fde = FLOAT;
  job->map = (void **)map;
  job->alm = (void **)alm;
}
/* Returns the constant 6.
   NOTE(review): presumably the largest nv value supported by the
   computational kernels - confirm against sharp_core. */
int sharp_get_nv_max (void)
{
  return 6;
}
/* Returns the nv value stored in the table of oracle.inc for the given job
   type, spin (only spin==0 versus spin!=0 matters) and number of
   simultaneous transforms. ntrans values above maxtr use the last table row.
   Assertion failures are raised for the unsupported type ALM2MAP_DERIV1 and
   for ntrans<=0, which would otherwise produce a negative table index. */
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
{
  UTIL_ASSERT(type!=ALM2MAP_DERIV1,"transform type not yet supported");
  UTIL_ASSERT(ntrans>0,"bad number of simultaneous transforms");
#include "oracle.inc"
  return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
}
#include "sharp_mpi.c"

213
libsharp/sharp.h Normal file
View file

@ -0,0 +1,213 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp.h
* Interface for the spherical transform library.
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_H
#define PLANCK_SHARP_H
#include <stddef.h>
#include <complex.h>
#ifdef __cplusplus
extern "C" {
#endif
/*! \internal
Helper type containing information about a single ring. */
typedef struct
{
/* theta: colatitude of the ring in radians; phi0: azimuth of the first
   pixel in radians; weight: pixel weight used for the ring;
   cth, sth: cos(theta) and sin(theta), computed by sharp_make_geom_info() */
double theta, phi0, weight, cth, sth;
/* index of the first ring pixel in the map array */
ptrdiff_t ofs;
/* nph: number of pixels in the ring; stride: distance between consecutive
   ring pixels in the map array */
int nph, stride;
} sharp_ringinfo;
/*! \internal
Helper type containing information about a pair of rings with colatitudes
symmetric around the equator. */
typedef struct
{
/* r1 is always valid. If the ring has no partner, sharp_make_geom_info()
   sets r2.nph to -1 and leaves the other members of r2 unset; the library
   tests r2.nph>0 before using r2. */
sharp_ringinfo r1,r2;
} sharp_ringpair;
/*! \internal
Type holding all required information about a map geometry. */
typedef struct
{
/* array of ring pairs, allocated by sharp_make_geom_info() and released by
   sharp_destroy_geom_info() */
sharp_ringpair *pair;
/* number of valid entries in pair */
int npairs;
} sharp_geom_info;
/*! \defgroup almgroup Helpers for dealing with a_lm */
/*! \{ */
/*! \internal
Helper type for index calculation in a_lm arrays. */
typedef struct
{
/*! Maximum \a l index of the array */
int lmax;
/*! Number of different \a m values in this object */
int nm;
/*! Array with \a nm entries containing the individual m values.
Allocated by sharp_make_general_alm_info() and released by
sharp_destroy_alm_info(). */
int *mval;
/*! Array with \a nm entries containing the (hypothetical) indices of
the coefficients with quantum numbers 0,\a mval[i].
Allocated by sharp_make_general_alm_info() and released by
sharp_destroy_alm_info(). */
ptrdiff_t *mvstart;
/*! Stride between a_lm and a_(l+1),m; sharp_alm_index() returns
\a mvstart[mi] + \a stride * \a l */
ptrdiff_t stride;
} sharp_alm_info;
/*! Creates an a_lm information data structure from the following parameters:
\param lmax maximum \a l quantum number (>=0)
\param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
\param stride the stride between consecutive a_lm entries
\param mstart the index of the (hypothetical) coefficient with the
quantum numbers 0,\a m. Must have \a mmax+1 entries.
\param alm_info will hold a pointer to the newly created data structure
*/
void sharp_make_alm_info (int lmax, int mmax, int stride,
const ptrdiff_t *mstart, sharp_alm_info **alm_info);
/*! Creates an a_lm information data structure from the following parameters:
\param lmax maximum \a l quantum number (>=0)
\param nm number of different \a m (<=\a lmax+1)
\param stride the stride between consecutive a_lm entries
\param mval array with \a nm entries containing the individual m values
\param mvstart array with \a nm entries containing the (hypothetical)
indices of the coefficients with the quantum numbers 0,\a mval[i]
\param alm_info will hold a pointer to the newly created data structure
*/
void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
const ptrdiff_t *mvstart, sharp_alm_info **alm_info);
/*! Returns the index of the coefficient with quantum numbers \a l,
\a mval[mi]. */
ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
/*! Deallocates the a_lm info object. */
void sharp_destroy_alm_info (sharp_alm_info *info);
/*! \} */
/*! \defgroup geominfogroup Functions for dealing with geometry information */
/*! \{ */
/*! Creates a geometry information object from a set of ring descriptions.
All arrays passed to this function must have \a nrings elements.
\param nrings the number of rings in the map
\param nph the number of pixels in each ring
\param ofs the index of the first pixel in each ring in the map array
\param stride the stride between consecutive pixels
\param phi0 the azimuth (in radians) of the first pixel in each ring
\param theta the colatitude (in radians) of each ring
\param weight the pixel weight to be used for the ring
\param geom_info will hold a pointer to the newly created data structure
*/
void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
const int *stride, const double *phi0, const double *theta,
const double *weight, sharp_geom_info **geom_info);
/*! Deallocates the geometry information in \a info. */
void sharp_destroy_geom_info (sharp_geom_info *info);
/*! \} */
/*! \defgroup jobgroup Functionality for defining and executing SHTs */
/*! \{ */
/*! Enumeration of SHARP job types. */
typedef enum { MAP2ALM, /*!< analysis */
ALM2MAP, /*!< synthesis */
ALM2MAP_DERIV1 /*!< currently unused */
} sharp_jobtype;
/*! Selects whether the maps and a_lm supplied to a job are stored in
    single (FLOAT) or double (DOUBLE) precision. */
typedef enum { FLOAT, DOUBLE } sharp_fde;
/*! \internal
Type holding all required information about an SHT job. */
typedef struct
{
/* kind of transform */
sharp_jobtype type;
/* spin of the transformed quantities */
int spin;
/* if 0 the output arrays are zeroed before the transform, otherwise the
   results are added to their current contents */
int add_output;
/* number of maps and of a_lm sets per individual transform */
int nmaps, nalm;
/* precision of the data that map and alm point to */
sharp_fde fde;
/* user-supplied arrays of pointers to the maps and a_lm sets; the data are
   not copied */
void **map;
void **alm;
/* phase array, allocated and released inside sharp_execute_job() */
complex double *phase;
/* l-dependent normalisation factors, obtained from Ylmgen_get_norm() and
   released inside sharp_execute_job() */
double *norm_l;
/* temporary a_lm array; each thread allocates its own inside
   sharp_execute_job() */
complex double *almtmp;
/* geometry and a_lm layout; not owned by the job */
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
/* value returned by sharp_nv_oracle() for this job; the chunk size is a
   multiple of nv*VLEN */
int nv;
/* set by sharp_execute_job() to the elapsed time measured with wallTime()
   (presumably wall-clock seconds - confirm) */
double time;
/* number of simultaneous transforms */
int ntrans;
/* counter summed over all threads by sharp_execute_job()
   (presumably the number of operations performed - confirm) */
unsigned long long opcnt;
} sharp_job;
/*! Initializes \a job with the appropriate parameters to perform the required
SHT.
\param type the type of SHT (currently ALM2MAP and MAP2ALM)
\param spin the spin of the quantities to be transformed
\param add_output if 0, the output arrays will be overwritten,
else the result will be added to the output arrays.
\param ntrans the number of simultaneous SHTs
\param alm contains pointers to the a_lm coefficients. If \a spin==0,
alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
alm[2] and alm[3] to those of the second, etc.
\param map contains pointers to the maps. If \a spin==0,
map[0] points to the map of the first SHT, map[1] to that of the second
etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
map[2] and map[3] to those of the second, etc.
\note \a map and \a alm must not be de-allocated until after the last call of
sharp_execute_job()! This is because the library does not copy the input
data, but only stores the pointers to the supplied maps and a_lm. */
void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin,
int add_output, complex double **alm, double **map,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans);
/*! Single-precision variant of sharpd_build_job(): \a alm and \a map point to
    single-precision data; all other parameters have the same meaning. */
void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
int add_output, complex float **alm, float **map,
const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans);
/*! Execute the SHT job previously constructed by sharpd_build_job() or
sharps_build_job(). */
void sharp_execute_job (sharp_job *job);
/*! \} */
/*! Internal */
int sharp_get_nv_max (void);
/*! Internal */
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
#ifdef __cplusplus
}
#endif
#endif

217
libsharp/sharp_acctest.c Normal file
View file

@ -0,0 +1,217 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_acctest.c
Systematic accuracy test for libsharp.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_core.h"
typedef complex double dcmplx;
/*! Returns a pseudo-random number obtained from a single call to rand(),
    mapped linearly onto the interval starting at \a min with width
    \a max - \a min. */
static double drand (double min, double max)
  {
  const double width = max-min;
  return min + width*rand()/(RAND_MAX+1.0);
  }
/*! Fills \a alm with random coefficients for every (l,m) described by
    \a helper. Real and imaginary parts are drawn with drand(-1,1); the
    imaginary part is zero for m==0, and coefficients with l<spin are set
    to zero. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
  {
  for (int im=0; im<helper->nm; ++im)
    {
    const int m=helper->mval[im];
    for (int l=m; l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
        {
        alm[sharp_alm_index(helper,l,im)] = 0.;
        continue;
        }
      /* the real part is drawn first; the imaginary part only consumes a
         random number when m!=0 */
      const double re = drand(-1,1);
      double imag = 0;
      if (m!=0)
        imag = drand(-1,1);
      alm[sharp_alm_index(helper,l,im)] = re+_Complex_I*imag;
      }
    }
  }
/*! Compares the \a ncomp coefficient sets in \a alm2 against the reference
    sets in \a alm (each holding \a nalms entries). For every component the
    largest absolute difference of a real or imaginary part must be below
    1e-10, and so must the ratio of the RMS difference to the RMS of the
    reference; otherwise UTIL_ASSERT fires. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  ptrdiff_t nalms, int ncomp)
  {
  for (int comp=0; comp<ncomp; ++comp)
    {
    const dcmplx *ref=alm[comp], *res=alm2[comp];
    double errsq=0, refsq=0, worst=0;
    for (ptrdiff_t k=0; k<nalms; ++k)
      {
      const double re=creal(ref[k]), im=cimag(ref[k]);
      const double dre=re-creal(res[k]),
                   dim=im-cimag(res[k]);
      errsq+=dre*dre+dim*dim;
      refsq+=re*re+im*im;
      const double adre=fabs(dre), adim=fabs(dim);
      if (adre>worst) worst=adre;
      if (adim>worst) worst=adim;
      }
    const double rms_err=sqrt(errsq/nalms);
    const double rms_ref=sqrt(refsq/nalms);
    UTIL_ASSERT((worst<1e-10)&&(rms_err/rms_ref<1e-10),"error");
    }
  }
/*! Regression test for the sign and normalisation conventions of ALM2MAP.
    All a_lm are set to 1+i (lmax=mmax=50), ALM2MAP is run for spin 0, 1 and 2
    on a geometry built by sharp_make_gauss_geom_info(), and the first, middle
    and last pixel of every output map are compared against hard-coded
    reference values using FAPPROX with a tolerance argument of 1e-12. */
static void check_sign_scale(void)
{
int lmax=50;
int mmax=lmax;
sharp_geom_info *tinfo;
int nrings=lmax+1;
int ppring=2*lmax+2;
ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
sharp_alm_info *alms;
sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
/* number of (l,m) pairs with 0<=m<=mmax and m<=l<=lmax */
ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
/* two maps and two a_lm sets are allocated, as needed by the spin>0 runs;
   the spin-0 run only uses the first of each */
double **map;
ALLOC2D(map,double,2,npix);
dcmplx **alm;
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
sharp_job job;
/* spin 0 */
sharpd_build_job(&job,ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,1);
sharp_execute_job(&job);
UTIL_ASSERT(FAPPROX(map[0][0 ], 3.588246976618616912e+00,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),"error");
/* spin 1: both output maps are checked */
sharpd_build_job(&job,ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,1);
sharp_execute_job(&job);
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][0 ],-2.398026536095463346e+00,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),"error");
/* spin 2: both output maps are checked */
sharpd_build_job(&job,ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,1);
sharp_execute_job(&job);
UTIL_ASSERT(FAPPROX(map[0][0 ],-1.398186224727334448e+00,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][0 ],-3.173406200299964119e+00,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),"error");
UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),"error");
DEALLOC2D(map);
DEALLOC2D(alm);
sharp_destroy_alm_info(alms);
sharp_destroy_geom_info(tinfo);
}
/*! Round-trip accuracy test: random a_lm (fixed seed 4) are transformed to
    maps with ALM2MAP and back with MAP2ALM, and the recovered coefficients
    are compared with the originals by measure_errors().
    \param tinfo geometry used for both transforms
    \param lmax maximum l of the a_lm
    \param mmax maximum m of the a_lm
    \param npix number of pixels allocated per map
    \param spin spin of the transform; spin 0 uses one map/a_lm set per
      transform, any other spin uses two
    \param ntrans number of simultaneous transforms
    \param nv value written to sharp_job::nv before each execution */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int nv)
  {
  const int sets_per_trans = (spin==0) ? 1 : 2;
  const int ncomp = ntrans*sets_per_trans;
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);

  double **map;
  ALLOC2D(map,double,ncomp,npix);
  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  /* reference coefficients; re-seeding makes every call reproducible */
  srand(4);
  dcmplx **alm_ref;
  ALLOC2D(alm_ref,dcmplx,ncomp,nalms);
  for (int c=0; c<ncomp; ++c)
    random_alm(alm_ref[c],ainfo,spin);
  dcmplx **alm_back;
  ALLOC2D(alm_back,dcmplx,ncomp,nalms);

  sharp_job job;
  /* synthesis: alm_ref -> map */
  sharpd_build_job(&job,ALM2MAP,spin,0,alm_ref,map,tinfo,ainfo,ntrans);
  job.nv=nv;
  sharp_execute_job(&job);
  /* analysis: map -> alm_back */
  sharpd_build_job(&job,MAP2ALM,spin,0,alm_back,map,tinfo,ainfo,ntrans);
  job.nv=nv;
  sharp_execute_job(&job);

  measure_errors(alm_ref,alm_back,nalms,ncomp);

  DEALLOC2D(map);
  DEALLOC2D(alm_ref);
  DEALLOC2D(alm_back);
  sharp_destroy_alm_info(ainfo);
  }
/*! Test driver: runs check_sign_scale() first, then check_accuracy() for
    every combination of nv=1..6, ntrans=1..6 and spin in {0,1,2,3,30} on a
    geometry with lmax=127 built by sharp_make_gauss_geom_info(). */
int main(void)
  {
#ifdef USE_MPI
  MPI_Init(NULL,NULL);
#endif
  module_startup_c("sharp_acctest",1,1,"",1);

  printf("Checking signs and scales.\n");
  check_sign_scale();
  printf("Passed.\n\n");

  printf("Testing map analysis accuracy.\n");
  static const int spins[] = { 0, 1, 2, 3, 30 };
  const int nspins = (int)(sizeof(spins)/sizeof(spins[0]));
  const int lmax=127;
  const int nrings=lmax+1, ppring=2*lmax+2;
  const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  sharp_geom_info *tinfo;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
  for (int nv=1; nv<=6; ++nv)
    for (int ntrans=1; ntrans<=6; ++ntrans)
      for (int s=0; s<nspins; ++s)
        check_accuracy(tinfo,lmax,lmax,npix,spins[s],ntrans,nv);
  sharp_destroy_geom_info(tinfo);
  printf("Passed.\n\n");

#ifdef USE_MPI
  MPI_Finalize();
#endif
  return 0;
  }

View file

@ -0,0 +1,68 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_almhelpers.c
* Spherical transform library
*
* Copyright (C) 2008-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#include "sharp_almhelpers.h"
#include "c_utils.h"
/* Allocates a sharp_alm_info holding the m values 0..mmax and stores its
   address in *alm_info. The start offset of column m is
   stride*(m*(2*lmax+1-m)/2). */
void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
  sharp_alm_info **alm_info)
  {
  const int nm = mmax+1;
  const int tval = 2*lmax+1;

  sharp_alm_info *res = RALLOC(sharp_alm_info,1);
  res->lmax = lmax;
  res->nm = nm;
  res->stride = stride;
  res->mval = RALLOC(int,nm);
  res->mvstart = RALLOC(ptrdiff_t,nm);

  /* m is ptrdiff_t so that the offset arithmetic is done in that type */
  for (ptrdiff_t m=0; m<nm; ++m)
    {
    res->mval[m] = m;
    res->mvstart[m] = stride*((m*(tval-m))>>1);
    }

  *alm_info = res;
  }
/* Allocates a sharp_alm_info holding the m values 0..mmax and stores its
   address in *alm_info. The start offset of column m is stride*m*(lmax+1). */
void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
  sharp_alm_info **alm_info)
  {
  const int nm = mmax+1;

  sharp_alm_info *res = RALLOC(sharp_alm_info,1);
  res->lmax = lmax;
  res->nm = nm;
  res->stride = stride;
  res->mval = RALLOC(int,nm);
  res->mvstart = RALLOC(ptrdiff_t,nm);

  /* m is ptrdiff_t so that the offset arithmetic is done in that type */
  for (ptrdiff_t m=0; m<nm; ++m)
    {
    res->mval[m] = m;
    res->mvstart[m] = stride*m*(lmax+1);
    }

  *alm_info = res;
  }

View file

@ -0,0 +1,57 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_almhelpers.h
* SHARP helper function for the creation of a_lm data structures
*
* Copyright (C) 2008-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_ALMHELPERS_H
#define PLANCK_SHARP_ALMHELPERS_H
#include "sharp.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Initialises an a_lm data structure according to the scheme used by
Healpix_cxx.
\param lmax maximum l, stored in the new structure
\param mmax maximum m; the structure describes the m values 0..mmax
\param stride stored in the new structure and used as a factor in the start
offset of every m
\param alm_info receives a pointer to the newly allocated structure (the
test programs release it with sharp_destroy_alm_info())
\ingroup almgroup */
void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info);
/*! Initialises an a_lm data structure according to the scheme used by
Fortran Healpix
\param lmax maximum l, stored in the new structure
\param mmax maximum m; the structure describes the m values 0..mmax
\param stride stored in the new structure and used as a factor in the start
offset of every m
\param alm_info receives a pointer to the newly allocated structure
\ingroup almgroup */
void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
sharp_alm_info **alm_info);
#ifdef __cplusplus
}
#endif
#endif

143
libsharp/sharp_bench.c Normal file
View file

@ -0,0 +1,143 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_bench.c
Copyright (C) 2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_core.h"
typedef complex double dcmplx;
/*! Times one SHT configuration on zero-filled data with lmax=2047, mmax=128
    and a 512 x 1024 geometry from sharp_make_gauss_geom_info(). The job is
    executed four times; the smallest values of sharp_job::time and
    sharp_job::opcnt seen are returned.
    \param spin spin of the transform (two maps/a_lm sets per transform unless
      \a spin is 0)
    \param nv value written to sharp_job::nv
    \param type job type passed to sharpd_build_job()
    \param ntrans number of simultaneous transforms
    \param time receives the minimum run time
    \param opcnt receives the minimum operation count */
static void bench_sht (int spin, int nv, sharp_jobtype type,
  int ntrans, double *time, unsigned long long *opcnt)
  {
  const int lmax=2047, mmax=128;
  const int nrings=512, ppring=1024;
  const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int ncomp = ntrans*((spin==0) ? 1 : 2);

  sharp_geom_info *tinfo;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);

  double **map;
  ALLOC2D(map,double,ncomp,npix);
  SET_ARRAY(map[0],0,npix*ncomp,0.);

  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  SET_ARRAY(alm[0],0,nalms*ncomp,0.);

  sharp_job job;
  sharpd_build_job(&job,type,spin,0,alm,map,tinfo,ainfo,ntrans);
  job.nv=nv;

  /* start from values larger than any measurement, then keep the minimum */
  *time=1e30;
  *opcnt=1000000000000000;
  for (int run=0; run<4; ++run)
    {
    sharpd_build_job(&job,type,spin,0,alm,map,tinfo,ainfo,ntrans);
    job.nv=nv;
    sharp_execute_job(&job);
    if (job.opcnt<*opcnt) *opcnt=job.opcnt;
    if (job.time<*time) *time=job.time;
    }

  DEALLOC2D(map);
  DEALLOC2D(alm);
  sharp_destroy_alm_info(ainfo);
  sharp_destroy_geom_info(tinfo);
  }
/*! Benchmark driver. For every ntrans in 1..6, spin in {0,2} and job type
    from MAP2ALM to ALM2MAP it times all nv values 1..sharp_get_nv_max() with
    bench_sht(), prints the fastest setting to stdout and writes the table of
    fastest nv values to the file "oracle.inc" as the C array
    nv_opt[6][2][3]. */
int main(void)
{
#ifdef USE_MPI
MPI_Init(NULL,NULL);
#endif
module_startup_c("sharp_bench",1,1,"",1);
printf("Benchmarking SHTs.\n\n");
FILE *fp=fopen("oracle.inc","w");
UTIL_ASSERT(fp, "failed to open oracle file for writing");
fprintf(fp,"static const int maxtr = 6;\n");
fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
for (int ntr=1; ntr<=6; ++ntr)
{
fprintf(fp,"{");
for (int spin=0; spin<=2; spin+=2)
{
fprintf(fp,"{");
/* the loop relies on MAP2ALM and ALM2MAP being the first and last values
   of the range to iterate over */
for (sharp_jobtype type=MAP2ALM; type<=ALM2MAP; ++type)
{
int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
unsigned long long opmin=1000000000000000, op;
double tmin=1e30;
/* one slot per nv value; index 0 is unused */
double *time=RALLOC(double,sharp_get_nv_max()+1);
for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
{
bench_sht (spin,nv,type,ntr,&time[nv],&op);
if (op<opmin) opmin=op;
if (time[nv]<tmin)
{ tmin=time[nv]; nvbest=nv; }
}
/* "dev" is the relative slowdown (in percent) of the nv value returned by
   sharp_nv_oracle() with respect to the fastest measured nv;
   NOTE(review): time[nvoracle] assumes 1<=nvoracle<=sharp_get_nv_max() */
printf("nt: %d %s spin: %d nv: %d time: %6.3f perf: %6.3f"
" dev[%d]: %6.2f%%\n",ntr,(type==ALM2MAP)?"alm2map":"map2alm",
spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
(time[nvoracle]-tmin)/tmin*100.);
DEALLOC(time);
/* each innermost initializer becomes {<MAP2ALM nv>,<ALM2MAP nv>,-1} */
fprintf(fp,"%d",nvbest);
fprintf(fp,(type==MAP2ALM)?",":",-1");
}
fprintf(fp,(spin==0)?"},":"}");
printf("\n");
}
fprintf(fp,(ntr<6)?"},\n":"}\n");
}
fprintf(fp,"};\n");
fclose(fp);
#ifdef USE_MPI
MPI_Finalize();
#endif
return 0;
}

238
libsharp/sharp_core.c Normal file
View file

@ -0,0 +1,238 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core.c
* Computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#include <complex.h>
#include <math.h>
#include <string.h>
#include "vecsupport.h"
#include "complex_hacks.h"
#include "ylmgen_c.h"
#include "sharp.h"
#include "sharp_core.h"
#include "c_utils.h"
typedef complex double dcmplx;
/* Largest number of simultaneous transforms for which inner_loop() dispatches
   to a variant that is specialised on the transform count as well as on nv. */
#define MAXJOB_SPECIAL 2
/* Token-pasting helpers: CONCAT2(a,b) yields a_b and CONCAT3(a,b,c) yields
   a_b_c. The extra level of indirection lets macro arguments be expanded
   before they are pasted. */
#define XCONCAT2(a,b) a##_##b
#define CONCAT2(a,b) XCONCAT2(a,b)
#define XCONCAT3(a,b,c) a##_##b##_##c
#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
/* The helper file is included once for each nvec in 1..6.
   NOTE(review): presumably every inclusion defines the inner_loop_<nvec> and
   inner_loop_<nvec>_<njobs> functions called by inner_loop() below; confirm
   in sharp_inchelper1.inc.c. */
#define nvec 1
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 2
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 3
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 4
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 5
#include "sharp_inchelper1.inc.c"
#undef nvec
#define nvec 6
#include "sharp_inchelper1.inc.c"
#undef nvec
/*! Dispatcher for the computational core. Selects the inner-loop variant
    matching \a job->nv (1..6) and \a job->ntrans and forwards all arguments
    to it unchanged.
    - If job->ntrans <= MAXJOB_SPECIAL, the variant inner_loop_<nv>_<ntrans>
      is called.
    - Otherwise (only compiled if MAXJOB_SPECIAL<6) the variant
      inner_loop_<nv> is called, with job->ntrans as additional argument.
    If no variant matches, UTIL_FAIL is invoked. */
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx)
{
int njobs=job->ntrans;
if (njobs<=MAXJOB_SPECIAL)
{
/* key: high hex digit = number of transforms, low hex digit = nv */
switch (njobs*16+job->nv)
{
#if (MAXJOB_SPECIAL>=1)
case 0x11:
CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x12:
CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x13:
CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x14:
CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x15:
CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x16:
CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=2)
case 0x21:
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x22:
CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x23:
CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x24:
CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x25:
CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x26:
CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=3)
case 0x31:
CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x32:
CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x33:
CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x34:
CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x35:
CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x36:
CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=4)
case 0x41:
CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x42:
CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x43:
CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x44:
CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x45:
CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x46:
CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=5)
case 0x51:
CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x52:
CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x53:
CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x54:
CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x55:
CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x56:
CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
#if (MAXJOB_SPECIAL>=6)
case 0x61:
CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x62:
CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x63:
CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x64:
CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x65:
CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
case 0x66:
CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
return;
#endif
}
}
#if (MAXJOB_SPECIAL<6)
else
{
/* generic variants: the number of transforms is passed at run time */
switch (job->nv)
{
case 1:
CONCAT2(inner_loop,1)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 2:
CONCAT2(inner_loop,2)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 3:
CONCAT2(inner_loop,3)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 4:
CONCAT2(inner_loop,4)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 5:
CONCAT2(inner_loop,5)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
case 6:
CONCAT2(inner_loop,6)
(job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
return;
}
}
#endif
/* reached only if no case above matched (every match returns) */
UTIL_FAIL("Incorrect vector parameters");
}

49
libsharp/sharp_core.h Normal file
View file

@ -0,0 +1,49 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core.h
* Interface for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_CORE_H
#define PLANCK_SHARP_CORE_H
#include "sharp.h"
#include "ylmgen_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Entry point of the computational core: calls the inner-loop variant that
    matches \a job->nv and \a job->ntrans, forwarding all other arguments
    unchanged, and fails via UTIL_FAIL if there is none.
    NOTE(review): \a cth / \a sth are presumably cosines and sines of the ring
    colatitudes and \a llim / \a ulim a ring index range; confirm with the
    callers. */
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx);
#ifdef __cplusplus
}
#endif
#endif

268
libsharp/sharp_core_inc.c Normal file
View file

@ -0,0 +1,268 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
/* NOTE(review): this file is meant to be included with \c nvec defined; Y()
   presumably decorates a name so that every inclusion gets distinct
   identifiers. Confirm in the including helper file. */
/*! A bundle of \c nvec vectors of type \c Tv. */
typedef struct
{ Tv v[nvec]; } Tb;
/*! Overlay of a Tb with an array of \c VLEN*nvec doubles, used to access the
    individual vector elements. */
typedef union
{ Tb b; double s[VLEN*nvec]; } Y(Tbu);
/*! Pair of bundles named \c r and \c i (real and imaginary part). */
typedef struct
{ Tb r, i; } Y(Tbri);
/*! Four bundles named \c qr, \c qi, \c ur and \c ui. */
typedef struct
{ Tb qr, qi, ur, ui; } Y(Tbqu);
/*! Scalar-array layout with the same member names as Tbri. */
typedef struct
{ double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
/*! Scalar-array layout with the same member names as Tbqu. */
typedef struct
{ double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
/*! Overlay of the vector (Tbri) and scalar (Tsri) layouts. */
typedef union
{ Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
/*! Overlay of the vector (Tbqu) and scalar (Tsqu) layouts. */
typedef union
{ Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
/*! Returns a bundle in which every vector is vload(\a val). */
static inline Tb Y(Tbconst)(double val)
  {
  const Tv filler=vload(val);
  Tb bundle;
  for (int k=0; k<nvec; ++k)
    {
    bundle.v[k]=filler;
    }
  return bundle;
  }
/*! Multiplies every vector of \a a in place by vload(\a b). */
static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
  {
  Tv factor=vload(b);
  for (int k=0; k<nvec; ++k)
    {
    vmuleq(a->v[k],factor);
    }
  }
/*! Returns the bundle whose k-th vector is vmul(a.v[k], b.v[k]). */
static inline Tb Y(Tbprod)(Tb a, Tb b)
  {
  Tb prod;
  for (int k=0; k<nvec; ++k)
    {
    prod.v[k]=vmul(a.v[k],b.v[k]);
    }
  return prod;
  }
/*! Multiplies every vector of \a a in place by the matching vector of \a b. */
static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
  {
  for (int k=0; k<nvec; ++k)
    {
    vmuleq(a->v[k],b.v[k]);
    }
  }
/*! Raises \a val to the integer power \a npow by square-and-multiply, with
    rescaling to keep intermediate values from becoming too small.
    Whenever the magnitude of an element of the running result (or of the
    repeatedly squared base) drops below \c fsmall, that element is multiplied
    by \c fbig and its scale counter is decremented by one.
    \param val the base values
    \param npow the exponent. NOTE(review): presumably required to be >=0; the
      loop ends once right-shifting \a npow yields zero
    \param resd receives the rescaled result
    \param ress receives the scale counters belonging to \a resd
    NOTE(review): the comments below assume vblend(mask,a,b) yields \c a where
    \c mask is set and \c b elsewhere; confirm in vecsupport.h. */
static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
Tb * restrict ress)
{
/* scale: counters of res; scaleint: counters of the squared base val */
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
do
{
if (npow&1)
{
/* current bit set: multiply the result by the current power of the base */
for (int i=0; i<nvec; ++i)
{
vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]);
Tv mask=vlt(vabs(res.v[i]),vload(fsmall));
vmuleq(res.v[i],vblend(mask,vload(fbig),vone));
vsubeq(scale.v[i],vblend(mask,vone,vzero));
}
}
/* square the base; its scale counter doubles accordingly */
for (int i=0; i<nvec; ++i)
{
vmuleq(val.v[i],val.v[i]);
vaddeq(scaleint.v[i],scaleint.v[i]);
Tv mask = vlt(vabs(val.v[i]),vload(fsmall));
vmuleq(val.v[i],vblend(mask,vload(fbig),vone));
vsubeq(scaleint.v[i],vblend(mask,vone,vzero));
}
}
while(npow>>=1);
*resd=res;
*ress=scale;
}
/*! Rescales a pair of recurrence values if they have grown too large.
    For every vector in which at least one element of \a lam2 has magnitude
    above 1, the affected elements of \a lam1 and \a lam2 are multiplied by
    \c fsmall and the matching elements of \a scale are incremented by one.
    \return 1 if any rescaling was done, else 0.
    NOTE(review): assumes vblend(mask,a,b) yields \c a where \c mask is set
    and \c b elsewhere; confirm in vecsupport.h. */
static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
Tb * restrict scale)
{
int did_scale=0;
for (int i=0;i<nvec; ++i)
{
Tv mask = vgt(vabs(lam2->v[i]),vone);
if (vanyTrue(mask))
{
did_scale=1;
Tv fact = vblend(mask,vload(fsmall),vone);
vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
vaddeq(scale->v[i],vblend(mask,vone,vzero));
}
}
return did_scale;
}
/*! Adjusts \a val and \a scale until no element of \a val has magnitude
    above 1 and no nonzero element has magnitude below \c fsmall.
    Elements that are too large are repeatedly multiplied by \c fsmall (scale
    incremented); nonzero elements that are too small are repeatedly
    multiplied by \c fbig (scale decremented). Elements equal to zero are left
    untouched by the second loop.
    NOTE(review): assumes vblend(mask,a,b) yields \c a where \c mask is set
    and \c b elsewhere; confirm in vecsupport.h. */
static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale)
{
const Tv vfsmall=vload(fsmall), vfbig=vload(fbig);
for (int i=0;i<nvec; ++i)
{
/* shrink elements with |val|>1 */
Tv mask = vgt(vabs(val->v[i]),vone);
while (vanyTrue(mask))
{
vmuleq(val->v[i],vblend(mask,vfsmall,vone));
vaddeq(scale->v[i],vblend(mask,vone,vzero));
mask = vgt(vabs(val->v[i]),vone);
}
/* enlarge nonzero elements with |val|<fsmall; exact zeros are excluded
   because they could never leave this loop */
mask = vlt(vabs(val->v[i]),vfsmall);
mask = vand(mask,vne(val->v[i],vzero));
while (vanyTrue(mask))
{
vmuleq(val->v[i],vblend(mask,vfbig,vone));
vsubeq(scale->v[i],vblend(mask,vone,vzero));
mask = vlt(vabs(val->v[i]),vfsmall);
mask = vand(mask,vne(val->v[i],vzero));
}
}
}
/*! Combines the comparisons vlt(a.v[k], vload(b)) of all vectors in \a a with
    vand and returns vallTrue() of the combined mask. */
static inline int Y(TballLt)(Tb a,double b)
  {
  Tv limit=vload(b);
  Tv below=vlt(a.v[0],limit);
  for (int k=1; k<nvec; ++k)
    {
    below=vand(below,vlt(a.v[k],limit));
    }
  return vallTrue(below);
  }
/*! Combines the comparisons vgt(a.v[k], vload(b)) of all vectors in \a a with
    vand and returns vallTrue() of the combined mask. */
static inline int Y(TballGt)(Tb a,double b)
  {
  Tv limit=vload(b);
  Tv above=vgt(a.v[0],limit);
  for (int k=1; k<nvec; ++k)
    {
    above=vand(above,vgt(a.v[k],limit));
    }
  return vallTrue(above);
  }
/*! Translates scale counters into correction factors, element by element:
    a counter below \c minscale gives 0, any other counter \c s gives
    cf[(int)s-minscale].
    \param scale the scale counters
    \param corfac receives the correction factors
    \param cf lookup table indexed by (counter - minscale) */
static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  const double * restrict cf)
  {
  Y(Tbu) counters, factors;
  counters.b=scale;
  for (int k=0; k<VLEN*nvec; ++k)
    {
    if (counters.s[k]<minscale)
      factors.s[k] = 0.;
    else
      factors.s[k] = cf[(int)(counters.s[k])-minscale];
    }
  *corfac=factors.b;
  }
/*! Starts the recurrence at l=gen->m and advances it, two steps at a time,
    until the scale counters are no longer all below \c limscale.
    The start value is sth^m (computed by mypow()) times gen->mfac[m], negated
    for odd m, and then normalized.
    \param sth, cth input bundles. NOTE(review): presumably sine and cosine of
      the colatitude; confirm with the callers
    \param l_ receives the l value reached. If the recurrence would pass
      gen->lmax first, it receives gen->lmax+1 and the other outputs are NOT
      written
    \param lam_1_, lam_2_ receive the two most recent recurrence values
    \param scale_ receives the scale counters of those values
    \param gen provides m, lmax, mfac and the recurrence coefficients rf */
static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const Ylmgen_C * restrict gen)
{
int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
Y(mypow) (sth,l,&lam_2,&scale);
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(normalize)(&lam_2,&scale);
int below_limit = Y(TballLt)(scale,limscale);
while (below_limit)
{
/* nothing reaches the limit up to lmax: signal this with l=lmax+1 */
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
/* two recurrence steps: new = cth*newest*f[0] - previous*f[1] */
Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
/* the limit only needs re-checking when the scale counters changed */
if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,limscale);
l+=2;
}
*l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
}
/*! One step of the three-term recurrence for the "p" and "m" sequences:
      rxp = (cth - fx.f[1]) * fx.f[0] * ryp  -  fx.f[2] * rxp
      rxm = (cth + fx.f[1]) * fx.f[0] * rym  -  fx.f[2] * rxm
    \a rxp and \a rxm are overwritten; \a ryp and \a rym are only read. */
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
Tb * restrict ryp, Tb * restrict rym, const Tb cth, const ylmgen_dbl3 fx)
{
Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
for (int i=0; i<nvec; ++i)
{
rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
vmul(fx2,rxp->v[i]));
rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
vmul(fx2,rxm->v[i]));
}
}
/*! Spin counterpart of iter_to_ieee(): sets up the start values of the "p"
    and "m" recurrences and advances both, two steps at a time, starting at
    l=gen->mhi, until their scale counters are no longer all below
    \c limscale.
    \param cth input bundle. NOTE(review): presumably the cosine of the
      colatitude; confirm with the callers
    \param l_ receives the l value reached. If the recurrence would pass
      gen->lmax first, it receives gen->lmax+1 and the other outputs are NOT
      written
    \param rec1p_, rec2p_, scalep_ receive the two most recent values and the
      scale counters of the "p" sequence
    \param rec1m_, rec2m_, scalem_ the same for the "m" sequence
    \param gen provides fx, cosPow, sinPow, prefac, fscale, preMinus_p,
      preMinus_m, s, m, mhi and lmax */
static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen)
{
const ylmgen_dbl3 * restrict fx = gen->fx;
Tb cth2, sth2;
/* cth2 = sqrt((1+cth)/2), sth2 = sqrt((1-cth)/2), both clamped from below
   at 1e-15 */
for (int i=0; i<nvec; ++i)
{
cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
}
/* powers of cth2 and sth2 (with their scale counters) for both sequences */
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
Tb rec2p, rec2m, scalep, scalem;
Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
Tv prefac=vload(gen->prefac[gen->m]),
prescale=vload(gen->fscale[gen->m]);
/* start values: prefactor times the two powers; scale counters add up */
for (int i=0; i<nvec; ++i)
{
rec2p.v[i]=vmul(vmul(prefac,ccp.v[i]),ssp.v[i]);
scalep.v[i]=vadd(vadd(prescale,ccps.v[i]),ssps.v[i]);
rec2m.v[i]=vmul(vmul(prefac,csp.v[i]),scp.v[i]);
scalem.v[i]=vadd(vadd(prescale,csps.v[i]),scps.v[i]);
if (gen->preMinus_p)
rec2p.v[i]=vneg(rec2p.v[i]);
if (gen->preMinus_m)
rec2m.v[i]=vneg(rec2m.v[i]);
if (gen->s&1)
rec2p.v[i]=vneg(rec2p.v[i]);
}
Y(normalize)(&rec2m,&scalem); Y(normalize)(&rec2p,&scalep);
int l=gen->mhi;
int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
while (below_limit)
{
/* nothing reaches the limit up to lmax: signal this with l=lmax+1 */
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
/* bitwise | (not ||) so that both rescale() calls are always executed */
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
l+=2;
}
*l_=l;
*rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
*rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
}

702
libsharp/sharp_core_inc2.c Normal file
View file

@ -0,0 +1,702 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc2.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
/* NOTE(review): this file is meant to be included with \c nvec and \c njobs
   defined; Z() presumably decorates a name so that every (nvec,njobs)
   combination gets distinct identifiers. Confirm in the including helper
   file. */
/*! One Tbri (real/imaginary bundle pair) per job. */
typedef struct
{ Y(Tbri) j[njobs]; } Z(Tbrij);
/*! Overlay of a Tbrij with the scalar-array layout Tsri per job. */
typedef union
{ Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij);
/*! One Tbqu (qr/qi/ur/ui bundles) per job. */
typedef struct
{ Y(Tbqu) j[njobs]; } Z(Tbquj);
/*! Overlay of a Tbquj with the scalar-array layout Tsqu per job. */
typedef union
{ Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj);
/*! Core loop of the spin-0 synthesis: continues the recurrence from degree
    \a l up to \a lmax and accumulates recurrence value times a_lm.
    Contributions of degrees l, l+2, ... go to \a p1, those of degrees
    l+1, l+3, ... go to \a p2.
    \param cth input bundle used in every recurrence step
    \param p1, p2 accumulators (real and imaginary part per job), updated in
      place
    \param lam_1, lam_2 recurrence values on entry; \a lam_2 belongs to
      degree \a l, \a lam_1 to the previous one
    \param rf recurrence coefficients, indexed by degree
    \param alm coefficients; the entry for degree l and job j is
      alm[njobs*l+j]
    \param l first degree to process
    \param lmax last degree to process
    NOTE(review): the comments below assume vfmaeq(a,b,c) performs a+=b*c and
    vfmaaeq(a,b,c,d,e) performs a+=b*c+d*e; confirm in vecsupport.h. */
static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax)
{
#if (njobs>1)
/* unrolled variant handling four degrees (l..l+3) per iteration */
while (l<lmax-2)
{
Tb lam_3, lam_4;
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
/* now lam_2, lam_3, lam_4, lam_1 belong to degrees l, l+1, l+2, l+3 */
for (int j=0; j<njobs; ++j)
{
Tv ar2=vload(creal(alm[njobs*l+j])),
ai2=vload(cimag(alm[njobs*l+j])),
ar4=vload(creal(alm[njobs*(l+2)+j])),
ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p1->j[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
}
Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])),
ar1=vload(creal(alm[njobs*(l+3)+j])),
ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p2->j[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
}
}
/* advance lam_2 to degree l+4 for the next iteration */
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4;
}
#endif
/* two degrees (l and l+1) per iteration */
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
}
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2->j[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai);
}
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
/* a single degree is left over if the loops stopped exactly at lmax */
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
}
}
}
}
/*! Core loop of the spin-0 analysis: continues the recurrence from degree
    \a l up to \a lmax and adds, for every degree and job, the sum over all
    vector elements of recurrence value times input to the a_lm.
    Degrees l, l+2, ... take their input from \a p1, degrees l+1, l+3, ...
    from \a p2.
    \param cth input bundle used in every recurrence step
    \param p1, p2 inputs (real and imaginary part per job), read only
    \param lam_1, lam_2 recurrence values on entry; \a lam_2 belongs to
      degree \a l, \a lam_1 to the previous one
    \param rf recurrence coefficients, indexed by degree
    \param alm coefficients to add to; the entry for degree l and job j is
      alm[l*njobs+j]
    \param l first degree to process
    \param lmax last degree to process
    NOTE(review): the comments below assume vfmaeq(a,b,c) performs a+=b*c;
    confirm in vecsupport.h. */
static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
{
/* two degrees (l and l+1) per iteration */
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr1,lam_2.v[i],p1->j[j].r.v[i]);
vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]);
}
for (int i=0; i<nvec; ++i)
{
vfmaeq(tr2,lam_1.v[i],p2->j[j].r.v[i]);
vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]);
}
/* reduces the vectors to scalars and adds them to both a_lm entries */
vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
/* a single degree is left over if the loop stopped exactly at lmax */
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv tre=vzero, tim=vzero;
for (int i=0; i<nvec; ++i)
{
vfmaeq(tre,lam_2.v[i],p1->j[j].r.v[i]);
vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]);
}
alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
}
}
}
/*! Scalar a_lm -> ring-data driver for one vector block of rings
 *  (compile-time \c njobs variant).
 *
 *  Y(iter_to_ieee) supplies the starting degree \c l together with \c lam_1,
 *  \c lam_2 and \c scale. While Y(TballGt)(scale,minscale) is false, degrees
 *  are processed one at a time with the recurrence values multiplied by the
 *  correction factor from Y(getCorfac); even steps add into \a p1, odd steps
 *  into \a p2. Once the test succeeds the correction is folded into
 *  \c lam_1 / \c lam_2 and the remaining degrees go to Z(alm2map_kernel).
 *
 *  \param cth,sth per-lane ring values handed to the recurrence
 *                 (presumably cos/sin of the colatitude -- confirm)
 *  \param gen     generator state; lmax, m, cf and rf are read
 *  \param job     job descriptor; almtmp is read and opcnt is incremented
 *  \param p1,p2   accumulators that the contributions are added to
 *  \param done    set to 1 when the degree counter passes lmax before the
 *                 fast kernel is reached; otherwise left untouched
 */
static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
  sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  Tb corr;
  Y(getCorfac)(scale,&corr,gen->cf);
  const ylmgen_dbl2 * restrict rf = gen->rf;
  const dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scale,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected lam_2 times a_lm is added to p1 */
    for (int jj=0; jj<njobs; ++jj)
    {
      const int ia=njobs*l+jj;
      Tv are=vload(creal(alm[ia]));
      Tv aim=vload(cimag(alm[ia]));
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_2.v[k],corr.v[k]);
        vfmaeq(p1->j[jj].r.v[k],w,are);
        vfmaeq(p1->j[jj].i.v[k],w,aim);
      }
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    /* step lam_1 with the coefficients of the previous degree */
    Tv c0=vload(rf[l-1].f[0]);
    Tv c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_1.v[k] = vsub(vmul(vmul(cth.v[k],lam_2.v[k]),c0),vmul(lam_1.v[k],c1));

    /* next degree: corrected lam_1 times a_lm is added to p2 */
    for (int jj=0; jj<njobs; ++jj)
    {
      const int ia=njobs*l+jj;
      Tv are=vload(creal(alm[ia]));
      Tv aim=vload(cimag(alm[ia]));
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_1.v[k],corr.v[k]);
        vfmaeq(p2->j[jj].r.v[k],w,are);
        vfmaeq(p2->j[jj].i.v[k],w,aim);
      }
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    /* step lam_2, then refresh the correction if a rescale happened */
    c0=vload(rf[l-1].f[0]);
    c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_2.v[k] = vsub(vmul(vmul(cth.v[k],lam_1.v[k]),c0),vmul(lam_2.v[k],c1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
    {
      Y(getCorfac)(scale,&corr,gen->cf);
      ieee_ok = Y(TballGt)(scale,minscale);
    }
  }

  /* fold the correction in once and finish with the fast kernel */
  Y(Tbmuleq)(&lam_1,corr);
  Y(Tbmuleq)(&lam_2,corr);
  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
}
/*! Scalar ring-data -> a_lm driver for one vector block of rings
 *  (compile-time \c njobs variant).
 *
 *  Mirrors Z(calc_alm2map) in the opposite direction: while
 *  Y(TballGt)(scale,minscale) is false, each degree's horizontal sum of
 *  (corrected recurrence value) x (ring data) is added to job->almtmp, with
 *  \a p1 used on even steps and \a p2 on odd steps. Afterwards the correction
 *  is folded into \c lam_1 / \c lam_2 and Z(map2alm_kernel) does the rest.
 *
 *  \param cth,sth per-lane ring values handed to the recurrence
 *                 (presumably cos/sin of the colatitude -- confirm)
 *  \param gen     generator state; lmax, m, cf and rf are read
 *  \param job     job descriptor; almtmp is accumulated into, opcnt is
 *                 incremented
 *  \param p1,p2   read-only ring data
 *  \param done    set to 1 whenever the degree counter passes lmax before the
 *                 fast kernel is reached
 */
static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
  const Z(Tbrij) * restrict p2, int *done)
{
  const int lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  int l=gen->m;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  const ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corr;
  Y(getCorfac)(scale,&corr,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scale,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected lam_2 against p1 */
    for (int jj=0; jj<njobs; ++jj)
    {
      Tv re_acc=vzero, im_acc=vzero;
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_2.v[k],corr.v[k]);
        vfmaeq(re_acc,w,p1->j[jj].r.v[k]);
        vfmaeq(im_acc,w,p1->j[jj].i.v[k]);
      }
      alm[l*njobs+jj]+=vhsum_cmplx(re_acc,im_acc);
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    Tv c0=vload(rf[l-1].f[0]);
    Tv c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_1.v[k] = vsub(vmul(vmul(cth.v[k],lam_2.v[k]),c0),vmul(lam_1.v[k],c1));

    /* next degree: corrected lam_1 against p2 */
    for (int jj=0; jj<njobs; ++jj)
    {
      Tv re_acc=vzero, im_acc=vzero;
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_1.v[k],corr.v[k]);
        vfmaeq(re_acc,w,p2->j[jj].r.v[k]);
        vfmaeq(im_acc,w,p2->j[jj].i.v[k]);
      }
      alm[l*njobs+jj]+=vhsum_cmplx(re_acc,im_acc);
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    c0=vload(rf[l-1].f[0]);
    c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_2.v[k] = vsub(vmul(vmul(cth.v[k],lam_1.v[k]),c0),vmul(lam_2.v[k],c1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
    {
      Y(getCorfac)(scale,&corr,gen->cf);
      ieee_ok = Y(TballGt)(scale,minscale);
    }
  }

  /* fold the correction in once and finish with the fast kernel */
  Y(Tbmuleq)(&lam_1,corr);
  Y(Tbmuleq)(&lam_2,corr);
  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
}
/*! Adds the contribution of one degree to the spin accumulators
 *  (compile-time \c njobs variant).
 *
 *  For every job j the two complex coefficients alm[2*j] and alm[2*j+1] are
 *  read. Their real/imaginary parts times (rxp+rxm) update \a px; their parts
 *  times (rxm-rxp) update \a py with the cross pattern and signs given by the
 *  vfmaeq / vfmseq calls below (presumably a+=b*c and a-=b*c respectively).
 *
 *  \param px,py   accumulators (members qr, qi, ur, ui) that are updated
 *  \param rxp,rxm the two recurrence vectors for this degree
 *  \param alm     pointer to the first coefficient of this degree
 */
static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    const int i0=2*jj, i1=2*jj+1;
    Tv g_re=vload(creal(alm[i0]));
    Tv g_im=vload(cimag(alm[i0]));
    Tv c_re=vload(creal(alm[i1]));
    Tv c_im=vload(cimag(alm[i1]));

    /* sum term -> px */
    for (int k=0; k<nvec; ++k)
    {
      Tv rsum=vadd(rxp.v[k],rxm.v[k]);
      vfmaeq(px->j[jj].qr.v[k],g_re,rsum);
      vfmaeq(px->j[jj].qi.v[k],g_im,rsum);
      vfmaeq(px->j[jj].ur.v[k],c_re,rsum);
      vfmaeq(px->j[jj].ui.v[k],c_im,rsum);
    }

    /* difference term -> py, with swapped components */
    for (int k=0; k<nvec; ++k)
    {
      Tv rdif=vsub(rxm.v[k],rxp.v[k]);
      vfmseq(py->j[jj].qr.v[k],c_im,rdif);
      vfmaeq(py->j[jj].qi.v[k],c_re,rdif);
      vfmaeq(py->j[jj].ur.v[k],g_im,rdif);
      vfmseq(py->j[jj].ui.v[k],g_re,rdif);
    }
  }
}
/*! Adds the contributions of two consecutive degrees to the spin
 *  accumulators in a single pass (compile-time \c njobs variant).
 *
 *  \a alm1 is used with the sum (r2p+r2m) and difference (r2m-r2p) of the
 *  "2" recurrence vectors, \a alm2 with the sum (r1p+r1m) and difference
 *  (r1m-r1p) of the "1" vectors. Each element of \a p1 and \a p2 receives one
 *  sum term and one difference term through a fused vfmaaeq / vfmaseq call.
 *
 *  \param p1,p2     accumulators (members qr, qi, ur, ui) that are updated
 *  \param r1p,r1m   recurrence vectors paired with \a alm2
 *  \param r2p,r2m   recurrence vectors paired with \a alm1
 *  \param alm1,alm2 pointers to the first coefficient of each degree
 */
static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    const int i0=2*jj, i1=2*jj+1;
    Tv g1r=vload(creal(alm1[i0])), g1i=vload(cimag(alm1[i0])),
       c1r=vload(creal(alm1[i1])), c1i=vload(cimag(alm1[i1]));
    Tv g2r=vload(creal(alm2[i0])), g2i=vload(cimag(alm2[i0])),
       c2r=vload(creal(alm2[i1])), c2i=vload(cimag(alm2[i1]));

    for (int k=0; k<nvec; ++k)
    {
      Tv sum2=vadd(r2p.v[k],r2m.v[k]);
      Tv dif1=vsub(r1m.v[k],r1p.v[k]);
      vfmaseq(p1->j[jj].qr.v[k],g1r,sum2,c2i,dif1);
      vfmaaeq(p1->j[jj].qi.v[k],g1i,sum2,c2r,dif1);
      vfmaaeq(p1->j[jj].ur.v[k],c1r,sum2,g2i,dif1);
      vfmaseq(p1->j[jj].ui.v[k],c1i,sum2,g2r,dif1);
    }

    for (int k=0; k<nvec; ++k)
    {
      Tv dif2=vsub(r2m.v[k],r2p.v[k]);
      Tv sum1=vadd(r1p.v[k],r1m.v[k]);
      vfmaseq(p2->j[jj].qr.v[k],g2r,sum1,c1i,dif2);
      vfmaaeq(p2->j[jj].qi.v[k],g2i,sum1,c1r,dif2);
      vfmaaeq(p2->j[jj].ur.v[k],c2r,sum1,g1i,dif2);
      vfmaseq(p2->j[jj].ui.v[k],c2i,sum1,g1r,dif2);
    }
  }
}
/*! Map -> a_lm counterpart of Z(saddstep): gathers the contribution of one
 *  degree from the spin ring data (compile-time \c njobs variant).
 *
 *  For every job j four vector accumulators are built from \a px weighted by
 *  (rxp+rxm) and from \a py weighted by (rxm-rxp), then reduced by
 *  vhsum_cmplx2 into alm[2*j] and alm[2*j+1].
 *
 *  \param px,py   read-only ring data (members qr, qi, ur, ui)
 *  \param rxp,rxm pointers to the two recurrence vectors for this degree
 *  \param alm     pointer to the first coefficient of this degree
 */
static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
  const Z(Tbquj) * restrict py, const Tb * restrict rxp,
  const Tb * restrict rxm, dcmplx * restrict alm)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    Tv g_re=vzero, g_im=vzero, c_re=vzero, c_im=vzero;

    /* sum term, fed from px */
    for (int k=0; k<nvec; ++k)
    {
      Tv rsum=vadd(rxp->v[k],rxm->v[k]);
      vfmaeq(g_re,px->j[jj].qr.v[k],rsum);
      vfmaeq(g_im,px->j[jj].qi.v[k],rsum);
      vfmaeq(c_re,px->j[jj].ur.v[k],rsum);
      vfmaeq(c_im,px->j[jj].ui.v[k],rsum);
    }

    /* difference term, fed from py with swapped components */
    for (int k=0; k<nvec; ++k)
    {
      Tv rdif=vsub(rxm->v[k],rxp->v[k]);
      vfmseq(g_re,py->j[jj].ui.v[k],rdif);
      vfmaeq(g_im,py->j[jj].ur.v[k],rdif);
      vfmaeq(c_re,py->j[jj].qi.v[k],rdif);
      vfmseq(c_im,py->j[jj].qr.v[k],rdif);
    }

    vhsum_cmplx2(g_re,g_im,c_re,c_im,&alm[2*jj],&alm[2*jj+1]);
  }
}
/*! a_lm -> ring-data kernel for the spin case (compile-time \c njobs
 *  variant).
 *
 *  Processes degrees l..lmax in pairs. Each pass first steps rec1p / rec1m
 *  with the coefficients fx[l+1], adds the contributions of degrees l and
 *  l+1 to \a p1 / \a p2, and finally steps rec2p / rec2m with fx[l+2]. With
 *  more than one job the fused Z(saddstepb) is used, otherwise two
 *  Z(saddstep) calls. A single leftover degree l==lmax is added at the end.
 *
 *  All \c rec* arguments and \a cth are by-value copies.
 *
 *  \param cth         per-lane ring value entering the recurrence
 *                     (presumably cos(theta) -- confirm with caller)
 *  \param p1,p2       accumulators that are updated
 *  \param rec1p,rec1m recurrence vectors one step behind rec2p / rec2m
 *  \param rec2p,rec2m recurrence vectors belonging to degree \a l on entry
 *  \param fx          recurrence coefficients; f[0], f[1], f[2] are used
 *  \param alm         input coefficients, 2*njobs entries per degree
 *  \param l           first degree to process
 *  \param lmax        last degree to process
 */
static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax)
{
  for (; l<lmax; l+=2)
  {
    /* step the "1" pair using fx[l+1] */
    Tv f0=vload(fx[l+1].f[0]);
    Tv f1=vload(fx[l+1].f[1]);
    Tv f2=vload(fx[l+1].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec1p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec2p.v[k])),
                        vmul(f2,rec1p.v[k]));
      rec1m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec2m.v[k])),
                        vmul(f2,rec1m.v[k]));
    }

    /* add degrees l and l+1 */
#if (njobs>1)
    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
      &alm[2*njobs*(l+1)]);
#else
    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
    Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]);
#endif

    /* step the "2" pair using fx[l+2] */
    f0=vload(fx[l+2].f[0]);
    f1=vload(fx[l+2].f[1]);
    f2=vload(fx[l+2].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec2p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec1p.v[k])),
                        vmul(f2,rec2p.v[k]));
      rec2m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec1m.v[k])),
                        vmul(f2,rec2m.v[k]));
    }
  }

  /* nothing left unless exactly one degree (l==lmax) remains */
  if (l!=lmax)
    return;
  Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
}
/*! Ring-data -> a_lm kernel for the spin case (compile-time \c njobs
 *  variant).
 *
 *  Processes degrees l..lmax in pairs. Each pass steps rec1p / rec1m with
 *  fx[l+1], gathers degree l (rec2*, p1/p2) and degree l+1 (rec1*, p2/p1)
 *  through Z(saddstep2), then steps rec2p / rec2m with fx[l+2]. A single
 *  leftover degree l==lmax is gathered at the end.
 *
 *  All \c rec* arguments and \a cth are by-value copies.
 *
 *  \param cth         per-lane ring value entering the recurrence
 *                     (presumably cos(theta) -- confirm with caller)
 *  \param p1,p2       read-only ring data
 *  \param rec1p,rec1m recurrence vectors one step behind rec2p / rec2m
 *  \param rec2p,rec2m recurrence vectors belonging to degree \a l on entry
 *  \param fx          recurrence coefficients; f[0], f[1], f[2] are used
 *  \param alm         output coefficients, 2*njobs entries per degree
 *  \param l           first degree to process
 *  \param lmax        last degree to process
 */
static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
  const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
{
  for (; l<lmax; l+=2)
  {
    /* step the "1" pair using fx[l+1] */
    Tv f0=vload(fx[l+1].f[0]);
    Tv f1=vload(fx[l+1].f[1]);
    Tv f2=vload(fx[l+1].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec1p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec2p.v[k])),
                        vmul(f2,rec1p.v[k]));
      rec1m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec2m.v[k])),
                        vmul(f2,rec1m.v[k]));
    }

    /* gather degrees l and l+1 */
    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)]);

    /* step the "2" pair using fx[l+2] */
    f0=vload(fx[l+2].f[0]);
    f1=vload(fx[l+2].f[1]);
    f2=vload(fx[l+2].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec2p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec1p.v[k])),
                        vmul(f2,rec2p.v[k]));
      rec2m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec1m.v[k])),
                        vmul(f2,rec2m.v[k]));
    }
  }

  /* nothing left unless exactly one degree (l==lmax) remains */
  if (l!=lmax)
    return;
  Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
}
/*! Spin a_lm -> ring-data driver for one vector block of rings
 *  (compile-time \c njobs variant).
 *
 *  Y(iter_to_ieee_spin) supplies the starting degree and the four recurrence
 *  vectors with their two scale vectors. While either scale fails the
 *  Y(TballGt)(.,minscale) test, degrees are added one at a time through
 *  Z(saddstep) using recurrence values multiplied by the Y(getCorfac)
 *  factors. Afterwards the factors are folded into the recurrence vectors
 *  and Z(alm2map_spin_kernel) handles the remaining degrees.
 *
 *  \param cth   per-lane ring value entering the recurrence
 *               (presumably cos(theta) -- confirm with caller)
 *  \param gen   generator state; lmax, m, cf and fx are read
 *  \param job   job descriptor; almtmp is read and opcnt is incremented
 *  \param p1,p2 accumulators that the contributions are added to
 *  \param done  set to 1 when the degree counter passes lmax before the
 *               fast kernel is reached; otherwise left untouched
 */
static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corr_p,corr_m;
  Y(getCorfac)(scalep,&corr_p,gen->cf);
  Y(getCorfac)(scalem,&corr_m,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected rec2* values, p1 gets the sum term */
    Z(saddstep)(p1, p2,
      Y(Tbprod)(rec2p,corr_p), Y(Tbprod)(rec2m,corr_m), &alm[2*njobs*l]);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);

    /* next degree: corrected rec1* values, roles of p1 and p2 swapped */
    Z(saddstep)(p2, p1,
      Y(Tbprod)(rec1p,corr_p), Y(Tbprod)(rec1m,corr_m), &alm[2*njobs*l]);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);

    /* bitwise | on purpose: both rescale calls must always run */
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
    {
      Y(getCorfac)(scalep,&corr_p,gen->cf);
      Y(getCorfac)(scalem,&corr_m,gen->cf);
      ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
    }
  }

  /* fold the corrections in once and finish with the fast kernel */
  Y(Tbmuleq)(&rec1p,corr_p);
  Y(Tbmuleq)(&rec2p,corr_p);
  Y(Tbmuleq)(&rec1m,corr_m);
  Y(Tbmuleq)(&rec2m,corr_m);
  Z(alm2map_spin_kernel) (cth,p1,p2,
    rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
}
/*! Spin ring-data -> a_lm driver for one vector block of rings
 *  (compile-time \c njobs variant).
 *
 *  Mirrors Z(calc_alm2map_spin) in the opposite direction: while either
 *  scale fails the Y(TballGt)(.,minscale) test, degrees are gathered one at
 *  a time through Z(saddstep2) using recurrence values multiplied by the
 *  Y(getCorfac) factors. Afterwards the factors are folded into the
 *  recurrence vectors and Z(map2alm_spin_kernel) handles the rest.
 *
 *  \param cth   per-lane ring value entering the recurrence
 *               (presumably cos(theta) -- confirm with caller)
 *  \param gen   generator state; lmax, m, cf and fx are read
 *  \param job   job descriptor; almtmp is accumulated into, opcnt is
 *               incremented
 *  \param p1,p2 read-only ring data
 *  \param done  set to 1 whenever the degree counter passes lmax before the
 *               fast kernel is reached
 */
static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
  sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
  int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corr_p,corr_m;
  Y(getCorfac)(scalep,&corr_p,gen->cf);
  Y(getCorfac)(scalem,&corr_m,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected rec2* values */
    Tb wp=Y(Tbprod)(rec2p,corr_p), wm=Y(Tbprod)(rec2m,corr_m);
    Z(saddstep2)(p1, p2, &wp, &wm, &alm[2*njobs*l]);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);

    /* next degree: corrected rec1* values, roles of p1 and p2 swapped */
    wp=Y(Tbprod)(rec1p,corr_p);
    wm=Y(Tbprod)(rec1m,corr_m);
    Z(saddstep2)(p2, p1, &wp, &wm, &alm[2*njobs*l]);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);

    /* bitwise | on purpose: both rescale calls must always run */
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
    {
      Y(getCorfac)(scalep,&corr_p,gen->cf);
      Y(getCorfac)(scalem,&corr_m,gen->cf);
      ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
    }
  }

  /* fold the corrections in once and finish with the fast kernel */
  Y(Tbmuleq)(&rec1p,corr_p);
  Y(Tbmuleq)(&rec2p,corr_p);
  Y(Tbmuleq)(&rec1m,corr_m);
  Y(Tbmuleq)(&rec2m,corr_m);
  Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
}
/* Zero-fills an object in place; used below to clear the p1/p2 scratch
 * buffers before each block of rings. Undefined again after the function. */
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
/*! Per-m driver for the rings idx[0] .. idx[ulim-llim-1] (compile-time
 *  \c njobs variant).
 *
 *  After preparing \a gen for m = job->ainfo->mval[mi], the rings are handled
 *  in blocks of nval = nvec*VLEN. Depending on job->type and job->spin the
 *  block is passed to one of the Z(calc_*) routines, and data is moved
 *  between the p1/p2 scratch buffers and job->phase.
 *
 *  \param job    job descriptor; type, spin, ainfo and phase are used here
 *  \param ispair per-ring flag; when nonzero the ring has a second phase
 *                entry (at phas_idx+1, and phas_idx+3 for spin) that is
 *                written or read as well
 *  \param cth_   per-ring values copied into the cth vector
 *                (presumably cos(theta) -- confirm with caller)
 *  \param sth_   per-ring values copied into the sth vector (spin-0 only)
 *  \param llim   lower ring limit; only the difference ulim-llim is used here
 *  \param ulim   upper ring limit
 *  \param gen    generator state, prepared for the current m
 *  \param mi     index of the current m within job->ainfo->mval
 *  \param idx    maps the block-local ring position to the ring index used
 *                for cth_, sth_, ispair and the phase array
 */
static void Z(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
int mi, const int *idx)
{
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m);
switch (job->type)
{
case ALM2MAP:
{
if (job->spin==0)
{
/* once a block reports done, later blocks skip the computation and
   write the zero-initialised p1/p2 to the phase array */
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Z(Tburij) p1,p2; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* lanes beyond the last ring reuse the last ring's values */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
}
/* store the results; padding lanes are not written */
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I,
r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I;
/* sum goes to the first entry, difference to the paired entry */
job->phase[phas_idx] = r1+r2;
if (ispair[itot])
job->phase[phas_idx+1] = r1-r2;
}
}
}
}
}
else
{
/* spin case: same structure, four phase entries per job and ring */
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Z(Tbuquj) p1,p2; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* lanes beyond the last ring reuse the last ring's values */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
}
Z(calc_alm2map_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I,
q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I,
u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I,
u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[itot])
{
dcmplx *phQ = &(job->phase[phas_idx+1]),
*phU = &(job->phase[phas_idx+3]);
*phQ = q1-q2;
*phU = u1-u2;
/* paired entries change sign when mhi-m+s is odd */
if ((gen->mhi-gen->m+gen->s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
}
}
}
}
}
}
break;
}
case ALM2MAP_DERIV1:
/* nothing is done for this job type in this routine */
break;
case MAP2ALM:
{
if (job->spin==0)
{
/* the block loop stops as soon as a block reports done */
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Z(Tburij) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* lanes beyond the last ring reuse the last ring's cth/sth but keep
   zero data in p1/p2 */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx ph1=job->phase[phas_idx];
/* rings without a partner contribute zero for the second entry */
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2);
p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2);
}
}
}
Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
}
}
else
{
/* spin case: same structure, four phase entries per job and ring */
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Z(Tbuquj) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
/* same sign rule as in the ALM2MAP spin branch */
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q);
p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U);
p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q);
p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U);
}
}
}
Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
}
}
break;
}
}
}
#undef VZERO

691
libsharp/sharp_core_inc3.c Normal file
View file

@ -0,0 +1,691 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_core_inc3.c
* Type-dependent code for the computational core
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
/*! a_lm -> ring-data kernel for the scalar case (run-time \a njobs variant).
 *
 *  Adds the contributions of degrees l..lmax to the accumulators: values
 *  belonging to degrees l, l+2, ... go to \a p1, those of l+1, l+3, ... to
 *  \a p2. The work is done in three stages: a loop handling four degrees per
 *  pass, a loop handling two degrees per pass, and a tail for a single
 *  remaining degree.
 *
 *  \a lam_1 and \a lam_2 are passed by value, so the caller's copies are not
 *  changed.
 *
 *  \param cth   factor multiplying the leading term of the recurrence
 *               (presumably cos(theta) of the rings -- confirm with caller)
 *  \param p1,p2 arrays of \a njobs accumulators (members r and i)
 *  \param lam_1 recurrence value one step behind \a lam_2 on entry
 *  \param lam_2 recurrence value belonging to degree \a l on entry
 *  \param rf    recurrence coefficients; rf[l].f[0] and rf[l].f[1] are used
 *  \param alm   input coefficients, \a njobs entries per degree
 *  \param l     first degree to process
 *  \param lmax  last degree to process
 *  \param njobs number of simultaneous jobs
 */
static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax, int njobs)
{
/* four degrees per pass; runs only while l+3<=lmax */
while (l<lmax-2)
{
Tb lam_3, lam_4;
/* lam_3: next value after lam_2, from (lam_2, lam_1) and rf[l] */
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
/* lam_4: from (lam_3, lam_2) and rf[l+1] */
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
/* lam_1 is reused for the value after lam_4, from (lam_4, lam_3), rf[l+2] */
r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
for (int j=0; j<njobs; ++j)
{
/* degrees l (lam_2) and l+2 (lam_4) are added to p1 together */
Tv ar2=vload(creal(alm[njobs*l+j])),
ai2=vload(cimag(alm[njobs*l+j])),
ar4=vload(creal(alm[njobs*(l+2)+j])),
ai4=vload(cimag(alm[njobs*(l+2)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
}
/* degrees l+1 (lam_3) and l+3 (lam_1) are added to p2 together */
Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
ai3=vload(cimag(alm[njobs*(l+1)+j])),
ar1=vload(creal(alm[njobs*(l+3)+j])),
ai1=vload(cimag(alm[njobs*(l+3)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
}
}
/* lam_2 becomes the value for degree l+4, from (lam_1, lam_4) and rf[l+3];
   (lam_1, lam_2) are then again the pair expected at the loop head */
r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
l+=4;
}
/* two degrees per pass for what the loop above left over */
while (l<lmax)
{
Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
for (int i=0; i<nvec; ++i)
lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
for (int j=0; j<njobs; ++j)
{
/* degree l: lam_2 times a_lm is added to p1 */
Tv ar=vload(creal(alm[njobs*l+j])),
ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
/* degree l+1: lam_1 times a_lm is added to p2 */
ar=vload(creal(alm[njobs*(l+1)+j]));
ai=vload(cimag(alm[njobs*(l+1)+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
}
}
r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
l+=2;
}
/* single remaining degree, if any */
if (l==lmax)
{
for (int j=0; j<njobs; ++j)
{
Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
for (int i=0; i<nvec; ++i)
{
vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
}
}
}
}
/*! Map -> a_lm accumulation kernel for the scalar case (run-time \a njobs
 *  variant).
 *
 *  Walks the degrees l..lmax two at a time. In each pass \a lam_2 is combined
 *  with the \a p1 data and accumulated into alm[l*njobs+j], then the freshly
 *  stepped \a lam_1 is combined with the \a p2 data and accumulated into
 *  alm[(l+1)*njobs+j]. If a single degree l==lmax is left over it is handled
 *  after the loop using \a lam_2 and \a p1 only.
 *
 *  \a lam_1 and \a lam_2 are passed by value, so the caller's copies are not
 *  changed.
 *
 *  \param cth   factor multiplying the leading term of the recurrence
 *               (presumably cos(theta) of the rings -- confirm with caller)
 *  \param p1    array of \a njobs data blocks paired with \a lam_2
 *  \param p2    array of \a njobs data blocks paired with \a lam_1
 *  \param lam_1 recurrence value one step behind \a lam_2 on entry
 *  \param lam_2 recurrence value belonging to degree \a l on entry
 *  \param rf    recurrence coefficients; rf[l].f[0] and rf[l].f[1] are used
 *  \param alm   coefficient array that receives the accumulated sums
 *  \param l     first degree to process
 *  \param lmax  last degree to process
 *  \param njobs number of simultaneous jobs
 */
static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
  int njobs)
{
  for (; l<lmax; l+=2)
  {
    /* lam_1 <- cth*lam_2*rf[l].f[0] - lam_1*rf[l].f[1] */
    Tv c0=vload(rf[l].f[0]);
    Tv c1=vload(rf[l].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_1.v[k] = vsub(vmul(vmul(cth.v[k],lam_2.v[k]),c0),vmul(lam_1.v[k],c1));

    for (int jj=0; jj<njobs; ++jj)
    {
      Tv re_lo=vzero, im_lo=vzero, re_hi=vzero, im_hi=vzero;
      /* degree l: lam_2 against p1 */
      for (int k=0; k<nvec; ++k)
      {
        vfmaeq(re_lo,lam_2.v[k],p1[jj].r.v[k]);
        vfmaeq(im_lo,lam_2.v[k],p1[jj].i.v[k]);
      }
      /* degree l+1: lam_1 against p2 */
      for (int k=0; k<nvec; ++k)
      {
        vfmaeq(re_hi,lam_1.v[k],p2[jj].r.v[k]);
        vfmaeq(im_hi,lam_1.v[k],p2[jj].i.v[k]);
      }
      vhsum_cmplx2(re_lo,im_lo,re_hi,im_hi,
        &alm[l*njobs+jj],&alm[(l+1)*njobs+jj]);
    }

    /* lam_2 <- cth*lam_1*rf[l+1].f[0] - lam_2*rf[l+1].f[1] */
    c0=vload(rf[l+1].f[0]);
    c1=vload(rf[l+1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_2.v[k] = vsub(vmul(vmul(cth.v[k],lam_1.v[k]),c0),vmul(lam_2.v[k],c1));
  }

  /* nothing left unless exactly one degree (l==lmax) remains */
  if (l!=lmax)
    return;

  for (int jj=0; jj<njobs; ++jj)
  {
    Tv re_acc=vzero, im_acc=vzero;
    for (int k=0; k<nvec; ++k)
    {
      vfmaeq(re_acc,lam_2.v[k],p1[jj].r.v[k]);
      vfmaeq(im_acc,lam_2.v[k],p1[jj].i.v[k]);
    }
    alm[l*njobs+jj]+=vhsum_cmplx(re_acc,im_acc);
  }
}
/*! Scalar a_lm -> ring-data driver for one vector block of rings
 *  (run-time \a njobs variant).
 *
 *  Y(iter_to_ieee) supplies the starting degree \c l together with \c lam_1,
 *  \c lam_2 and \c scale. While Y(TballGt)(scale,minscale) is false, degrees
 *  are processed one at a time with the recurrence values multiplied by the
 *  correction factor from Y(getCorfac); even steps add into \a p1, odd steps
 *  into \a p2. Once the test succeeds the correction is folded into
 *  \c lam_1 / \c lam_2 and the remaining degrees go to Y(alm2map_kernel).
 *
 *  \param cth,sth per-lane ring values handed to the recurrence
 *                 (presumably cos/sin of the colatitude -- confirm)
 *  \param gen     generator state; lmax, m, cf and rf are read
 *  \param job     job descriptor; almtmp is read and opcnt is incremented
 *  \param p1,p2   arrays of \a njobs accumulators that are added to
 *  \param njobs   number of simultaneous jobs
 *  \param done    set to 1 when the degree counter passes lmax before the
 *                 fast kernel is reached; otherwise left untouched
 */
static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
  sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, int njobs,
  int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  Tb corr;
  Y(getCorfac)(scale,&corr,gen->cf);
  const ylmgen_dbl2 * restrict rf = gen->rf;
  const dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scale,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected lam_2 times a_lm is added to p1 */
    for (int jj=0; jj<njobs; ++jj)
    {
      const int ia=njobs*l+jj;
      Tv are=vload(creal(alm[ia]));
      Tv aim=vload(cimag(alm[ia]));
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_2.v[k],corr.v[k]);
        vfmaeq(p1[jj].r.v[k],w,are);
        vfmaeq(p1[jj].i.v[k],w,aim);
      }
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    /* step lam_1 with the coefficients of the previous degree */
    Tv c0=vload(rf[l-1].f[0]);
    Tv c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_1.v[k] = vsub(vmul(vmul(cth.v[k],lam_2.v[k]),c0),vmul(lam_1.v[k],c1));

    /* next degree: corrected lam_1 times a_lm is added to p2 */
    for (int jj=0; jj<njobs; ++jj)
    {
      const int ia=njobs*l+jj;
      Tv are=vload(creal(alm[ia]));
      Tv aim=vload(cimag(alm[ia]));
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_1.v[k],corr.v[k]);
        vfmaeq(p2[jj].r.v[k],w,are);
        vfmaeq(p2[jj].i.v[k],w,aim);
      }
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    /* step lam_2, then refresh the correction if a rescale happened */
    c0=vload(rf[l-1].f[0]);
    c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_2.v[k] = vsub(vmul(vmul(cth.v[k],lam_1.v[k]),c0),vmul(lam_2.v[k],c1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
    {
      Y(getCorfac)(scale,&corr,gen->cf);
      ieee_ok = Y(TballGt)(scale,minscale);
    }
  }

  /* fold the correction in once and finish with the fast kernel */
  Y(Tbmuleq)(&lam_1,corr);
  Y(Tbmuleq)(&lam_2,corr);
  Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
/*! Scalar ring-data -> a_lm driver for one vector block of rings
 *  (run-time \a njobs variant).
 *
 *  Mirrors Y(calc_alm2map) in the opposite direction: while
 *  Y(TballGt)(scale,minscale) is false, each degree's horizontal sum of
 *  (corrected recurrence value) x (ring data) is added to job->almtmp, with
 *  \a p1 used on even steps and \a p2 on odd steps. Afterwards the correction
 *  is folded into \c lam_1 / \c lam_2 and Y(map2alm_kernel) does the rest.
 *
 *  \param cth,sth per-lane ring values handed to the recurrence
 *                 (presumably cos/sin of the colatitude -- confirm)
 *  \param gen     generator state; lmax, m, cf and rf are read
 *  \param job     job descriptor; almtmp is accumulated into, opcnt is
 *                 incremented
 *  \param p1,p2   arrays of \a njobs read-only data blocks
 *  \param njobs   number of simultaneous jobs
 *  \param done    set to 1 whenever the degree counter passes lmax before the
 *                 fast kernel is reached
 */
static void Y(calc_map2alm) (const Tb cth, const Tb sth,
  const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
  const Y(Tbri) * restrict p2, int njobs, int *done)
{
  const int lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  int l=gen->m;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  const ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corr;
  Y(getCorfac)(scale,&corr,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scale,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected lam_2 against p1 */
    for (int jj=0; jj<njobs; ++jj)
    {
      Tv re_acc=vzero, im_acc=vzero;
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_2.v[k],corr.v[k]);
        vfmaeq(re_acc,w,p1[jj].r.v[k]);
        vfmaeq(im_acc,w,p1[jj].i.v[k]);
      }
      alm[l*njobs+jj]+=vhsum_cmplx(re_acc,im_acc);
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    Tv c0=vload(rf[l-1].f[0]);
    Tv c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_1.v[k] = vsub(vmul(vmul(cth.v[k],lam_2.v[k]),c0),vmul(lam_1.v[k],c1));

    /* next degree: corrected lam_1 against p2 */
    for (int jj=0; jj<njobs; ++jj)
    {
      Tv re_acc=vzero, im_acc=vzero;
      for (int k=0; k<nvec; ++k)
      {
        Tv w=vmul(lam_1.v[k],corr.v[k]);
        vfmaeq(re_acc,w,p2[jj].r.v[k]);
        vfmaeq(im_acc,w,p2[jj].i.v[k]);
      }
      alm[l*njobs+jj]+=vhsum_cmplx(re_acc,im_acc);
    }
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }

    c0=vload(rf[l-1].f[0]);
    c1=vload(rf[l-1].f[1]);
    for (int k=0; k<nvec; ++k)
      lam_2.v[k] = vsub(vmul(vmul(cth.v[k],lam_1.v[k]),c0),vmul(lam_2.v[k],c1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
    {
      Y(getCorfac)(scale,&corr,gen->cf);
      ieee_ok = Y(TballGt)(scale,minscale);
    }
  }

  /* fold the correction in once and finish with the fast kernel */
  Y(Tbmuleq)(&lam_1,corr);
  Y(Tbmuleq)(&lam_2,corr);
  Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
}
/*! Adds the contribution of one degree to the spin accumulators
 *  (run-time \a njobs variant).
 *
 *  For every job j the two complex coefficients alm[2*j] and alm[2*j+1] are
 *  read. Their real/imaginary parts times (rxp+rxm) update px[j]; their parts
 *  times (rxm-rxp) update py[j] with the cross pattern and signs given by the
 *  vfmaeq / vfmseq calls below (presumably a+=b*c and a-=b*c respectively).
 *
 *  \param px,py   arrays of \a njobs accumulators (members qr, qi, ur, ui)
 *  \param rxp,rxm the two recurrence vectors for this degree
 *  \param alm     pointer to the first coefficient of this degree
 *  \param njobs   number of simultaneous jobs
 */
static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    const int i0=2*jj, i1=2*jj+1;
    Tv g_re=vload(creal(alm[i0]));
    Tv g_im=vload(cimag(alm[i0]));
    Tv c_re=vload(creal(alm[i1]));
    Tv c_im=vload(cimag(alm[i1]));

    /* sum term -> px */
    for (int k=0; k<nvec; ++k)
    {
      Tv rsum=vadd(rxp.v[k],rxm.v[k]);
      vfmaeq(px[jj].qr.v[k],g_re,rsum);
      vfmaeq(px[jj].qi.v[k],g_im,rsum);
      vfmaeq(px[jj].ur.v[k],c_re,rsum);
      vfmaeq(px[jj].ui.v[k],c_im,rsum);
    }

    /* difference term -> py, with swapped components */
    for (int k=0; k<nvec; ++k)
    {
      Tv rdif=vsub(rxm.v[k],rxp.v[k]);
      vfmseq(py[jj].qr.v[k],c_im,rdif);
      vfmaeq(py[jj].qi.v[k],c_re,rdif);
      vfmaeq(py[jj].ur.v[k],g_im,rdif);
      vfmseq(py[jj].ui.v[k],g_re,rdif);
    }
  }
}
/*! Adds the contributions of two consecutive degrees to the spin
 *  accumulators in a single pass (run-time \a njobs variant).
 *
 *  \a alm1 is used with the sum (r2p+r2m) and difference (r2m-r2p) of the
 *  "2" recurrence vectors, \a alm2 with the sum (r1p+r1m) and difference
 *  (r1m-r1p) of the "1" vectors. Each element of p1[j] and p2[j] receives one
 *  sum term and one difference term through a fused vfmaaeq / vfmaseq call.
 *
 *  \param p1,p2     arrays of \a njobs accumulators (members qr, qi, ur, ui)
 *  \param r1p,r1m   recurrence vectors paired with \a alm2
 *  \param r2p,r2m   recurrence vectors paired with \a alm1
 *  \param alm1,alm2 pointers to the first coefficient of each degree
 *  \param njobs     number of simultaneous jobs
 */
static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
  const dcmplx * restrict alm1, const dcmplx * restrict alm2, int njobs)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    const int i0=2*jj, i1=2*jj+1;
    Tv g1r=vload(creal(alm1[i0])), g1i=vload(cimag(alm1[i0])),
       c1r=vload(creal(alm1[i1])), c1i=vload(cimag(alm1[i1]));
    Tv g2r=vload(creal(alm2[i0])), g2i=vload(cimag(alm2[i0])),
       c2r=vload(creal(alm2[i1])), c2i=vload(cimag(alm2[i1]));

    for (int k=0; k<nvec; ++k)
    {
      Tv sum2=vadd(r2p.v[k],r2m.v[k]);
      Tv dif1=vsub(r1m.v[k],r1p.v[k]);
      vfmaseq(p1[jj].qr.v[k],g1r,sum2,c2i,dif1);
      vfmaaeq(p1[jj].qi.v[k],g1i,sum2,c2r,dif1);
      vfmaaeq(p1[jj].ur.v[k],c1r,sum2,g2i,dif1);
      vfmaseq(p1[jj].ui.v[k],c1i,sum2,g2r,dif1);
    }

    for (int k=0; k<nvec; ++k)
    {
      Tv dif2=vsub(r2m.v[k],r2p.v[k]);
      Tv sum1=vadd(r1p.v[k],r1m.v[k]);
      vfmaseq(p2[jj].qr.v[k],g2r,sum1,c1i,dif2);
      vfmaaeq(p2[jj].qi.v[k],g2i,sum1,c1r,dif2);
      vfmaaeq(p2[jj].ur.v[k],c2r,sum1,g1i,dif2);
      vfmaseq(p2[jj].ui.v[k],c2i,sum1,g1r,dif2);
    }
  }
}
/*! Map -> a_lm counterpart of Y(saddstep): gathers the contribution of one
 *  degree from the spin ring data (run-time \a njobs variant).
 *
 *  For every job j four vector accumulators are built from px[j] weighted by
 *  (rxp+rxm) and from py[j] weighted by (rxm-rxp), then reduced by
 *  vhsum_cmplx2 into alm[2*j] and alm[2*j+1].
 *
 *  \param px,py   arrays of \a njobs read-only data blocks
 *                 (members qr, qi, ur, ui)
 *  \param rxp,rxm pointers to the two recurrence vectors for this degree
 *  \param alm     pointer to the first coefficient of this degree
 *  \param njobs   number of simultaneous jobs
 */
static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
  const Tb * restrict rxm, dcmplx * restrict alm, int njobs)
{
  for (int jj=0; jj<njobs; ++jj)
  {
    Tv g_re=vzero, g_im=vzero, c_re=vzero, c_im=vzero;

    /* sum term, fed from px */
    for (int k=0; k<nvec; ++k)
    {
      Tv rsum=vadd(rxp->v[k],rxm->v[k]);
      vfmaeq(g_re,px[jj].qr.v[k],rsum);
      vfmaeq(g_im,px[jj].qi.v[k],rsum);
      vfmaeq(c_re,px[jj].ur.v[k],rsum);
      vfmaeq(c_im,px[jj].ui.v[k],rsum);
    }

    /* difference term, fed from py with swapped components */
    for (int k=0; k<nvec; ++k)
    {
      Tv rdif=vsub(rxm->v[k],rxp->v[k]);
      vfmseq(g_re,py[jj].ui.v[k],rdif);
      vfmaeq(g_im,py[jj].ur.v[k],rdif);
      vfmaeq(c_re,py[jj].qi.v[k],rdif);
      vfmseq(c_im,py[jj].qr.v[k],rdif);
    }

    vhsum_cmplx2(g_re,g_im,c_re,c_im,&alm[2*jj],&alm[2*jj+1]);
  }
}
/*! a_lm -> ring-data kernel for the spin case (run-time \a njobs variant).
 *
 *  Processes degrees l..lmax in pairs. Each pass first steps rec1p / rec1m
 *  with the coefficients fx[l+1], adds the contributions of degrees l and
 *  l+1 to \a p1 / \a p2 through Y(saddstepb), and finally steps
 *  rec2p / rec2m with fx[l+2]. A single leftover degree l==lmax is added at
 *  the end through Y(saddstep).
 *
 *  All \c rec* arguments and \a cth are by-value copies.
 *
 *  \param cth         per-lane ring value entering the recurrence
 *                     (presumably cos(theta) -- confirm with caller)
 *  \param p1,p2       arrays of \a njobs accumulators that are updated
 *  \param rec1p,rec1m recurrence vectors one step behind rec2p / rec2m
 *  \param rec2p,rec2m recurrence vectors belonging to degree \a l on entry
 *  \param fx          recurrence coefficients; f[0], f[1], f[2] are used
 *  \param alm         input coefficients, 2*njobs entries per degree
 *  \param l           first degree to process
 *  \param lmax        last degree to process
 *  \param njobs       number of simultaneous jobs
 */
static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax, int njobs)
{
  for (; l<lmax; l+=2)
  {
    /* step the "1" pair using fx[l+1] */
    Tv f0=vload(fx[l+1].f[0]);
    Tv f1=vload(fx[l+1].f[1]);
    Tv f2=vload(fx[l+1].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec1p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec2p.v[k])),
                        vmul(f2,rec1p.v[k]));
      rec1m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec2m.v[k])),
                        vmul(f2,rec1m.v[k]));
    }

    /* add degrees l and l+1 in one fused call */
    Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
      &alm[2*njobs*(l+1)], njobs);

    /* step the "2" pair using fx[l+2] */
    f0=vload(fx[l+2].f[0]);
    f1=vload(fx[l+2].f[1]);
    f2=vload(fx[l+2].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec2p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec1p.v[k])),
                        vmul(f2,rec2p.v[k]));
      rec2m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec1m.v[k])),
                        vmul(f2,rec2m.v[k]));
    }
  }

  /* nothing left unless exactly one degree (l==lmax) remains */
  if (l!=lmax)
    return;
  Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l], njobs);
}
/*! Ring-data -> a_lm kernel for the spin case (run-time \a njobs variant).
 *
 *  Processes degrees l..lmax in pairs. Each pass steps rec1p / rec1m with
 *  fx[l+1], gathers degree l (rec2*, p1/p2) and degree l+1 (rec1*, p2/p1)
 *  through Y(saddstep2), then steps rec2p / rec2m with fx[l+2]. A single
 *  leftover degree l==lmax is gathered at the end.
 *
 *  All \c rec* arguments and \a cth are by-value copies.
 *
 *  \param cth         per-lane ring value entering the recurrence
 *                     (presumably cos(theta) -- confirm with caller)
 *  \param p1,p2       arrays of \a njobs read-only data blocks
 *  \param rec1p,rec1m recurrence vectors one step behind rec2p / rec2m
 *  \param rec2p,rec2m recurrence vectors belonging to degree \a l on entry
 *  \param fx          recurrence coefficients; f[0], f[1], f[2] are used
 *  \param alm         output coefficients, 2*njobs entries per degree
 *  \param l           first degree to process
 *  \param lmax        last degree to process
 *  \param njobs       number of simultaneous jobs
 */
static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
  int njobs)
{
  for (; l<lmax; l+=2)
  {
    /* step the "1" pair using fx[l+1] */
    Tv f0=vload(fx[l+1].f[0]);
    Tv f1=vload(fx[l+1].f[1]);
    Tv f2=vload(fx[l+1].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec1p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec2p.v[k])),
                        vmul(f2,rec1p.v[k]));
      rec1m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec2m.v[k])),
                        vmul(f2,rec1m.v[k]));
    }

    /* gather degrees l and l+1 */
    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l],njobs);
    Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)],njobs);

    /* step the "2" pair using fx[l+2] */
    f0=vload(fx[l+2].f[0]);
    f1=vload(fx[l+2].f[1]);
    f2=vload(fx[l+2].f[2]);
    for (int k=0; k<nvec; ++k)
    {
      rec2p.v[k] = vsub(vmul(vsub(cth.v[k],f1),vmul(f0,rec1p.v[k])),
                        vmul(f2,rec2p.v[k]));
      rec2m.v[k] = vsub(vmul(vadd(cth.v[k],f1),vmul(f0,rec1m.v[k])),
                        vmul(f2,rec2m.v[k]));
    }
  }

  /* nothing left unless exactly one degree (l==lmax) remains */
  if (l!=lmax)
    return;
  Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
}
/*! Spin a_lm -> ring-data driver for one vector block of rings
 *  (run-time \a njobs variant).
 *
 *  Y(iter_to_ieee_spin) supplies the starting degree and the four recurrence
 *  vectors with their two scale vectors. While either scale fails the
 *  Y(TballGt)(.,minscale) test, degrees are added one at a time through
 *  Y(saddstep) using recurrence values multiplied by the Y(getCorfac)
 *  factors. Afterwards the factors are folded into the recurrence vectors
 *  and Y(alm2map_spin_kernel) handles the remaining degrees.
 *
 *  \param cth   per-lane ring value entering the recurrence
 *               (presumably cos(theta) -- confirm with caller)
 *  \param gen   generator state; lmax, m, cf and fx are read
 *  \param job   job descriptor; almtmp is read and opcnt is incremented
 *  \param p1,p2 arrays of \a njobs accumulators that are added to
 *  \param njobs number of simultaneous jobs
 *  \param done  set to 1 when the degree counter passes lmax before the
 *               fast kernel is reached; otherwise left untouched
 */
static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
  int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corr_p,corr_m;
  Y(getCorfac)(scalep,&corr_p,gen->cf);
  Y(getCorfac)(scalem,&corr_m,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected rec2* values, p1 gets the sum term */
    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corr_p), Y(Tbprod)(rec2m,corr_m),
      &alm[2*njobs*l],njobs);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);

    /* next degree: corrected rec1* values, roles of p1 and p2 swapped */
    Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corr_p), Y(Tbprod)(rec1m,corr_m),
      &alm[2*njobs*l], njobs);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);

    /* bitwise | on purpose: both rescale calls must always run */
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
    {
      Y(getCorfac)(scalep,&corr_p,gen->cf);
      Y(getCorfac)(scalem,&corr_m,gen->cf);
      ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
    }
  }

  /* fold the corrections in once and finish with the fast kernel */
  Y(Tbmuleq)(&rec1p,corr_p);
  Y(Tbmuleq)(&rec2p,corr_p);
  Y(Tbmuleq)(&rec1m,corr_m);
  Y(Tbmuleq)(&rec2m,corr_m);
  Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
    lmax, njobs);
}
/*! Spin ring-data -> a_lm driver for one vector block of rings
 *  (run-time \a njobs variant).
 *
 *  Mirrors Y(calc_alm2map_spin) in the opposite direction: while either
 *  scale fails the Y(TballGt)(.,minscale) test, degrees are gathered one at
 *  a time through Y(saddstep2) using recurrence values multiplied by the
 *  Y(getCorfac) factors. Afterwards the factors are folded into the
 *  recurrence vectors and Y(map2alm_spin_kernel) handles the rest.
 *
 *  \param cth   per-lane ring value entering the recurrence
 *               (presumably cos(theta) -- confirm with caller)
 *  \param gen   generator state; lmax, m, cf and fx are read
 *  \param job   job descriptor; almtmp is accumulated into, opcnt is
 *               incremented
 *  \param p1,p2 arrays of \a njobs read-only data blocks
 *  \param njobs number of simultaneous jobs
 *  \param done  set to 1 whenever the degree counter passes lmax before the
 *               fast kernel is reached
 */
static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
  sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
  int njobs, int *done)
{
  int l;
  const int lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax)
  {
    *done=1;
    return;
  }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corr_p,corr_m;
  Y(getCorfac)(scalep,&corr_p,gen->cf);
  Y(getCorfac)(scalem,&corr_m,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);

  while (!ieee_ok)
  {
    /* degree l: corrected rec2* values */
    Tb wp=Y(Tbprod)(rec2p,corr_p), wm=Y(Tbprod)(rec2m,corr_m);
    Y(saddstep2)(p1, p2, &wp, &wm, &alm[2*njobs*l], njobs);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);

    /* next degree: corrected rec1* values, roles of p1 and p2 swapped */
    wp=Y(Tbprod)(rec1p,corr_p);
    wm=Y(Tbprod)(rec1m,corr_m);
    Y(saddstep2)(p2, p1, &wp, &wm, &alm[2*njobs*l], njobs);
    ++l;
    if (l>lmax)
    {
      *done=1;
      return;
    }
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);

    /* bitwise | on purpose: both rescale calls must always run */
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
    {
      Y(getCorfac)(scalep,&corr_p,gen->cf);
      Y(getCorfac)(scalem,&corr_m,gen->cf);
      ieee_ok = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
    }
  }

  /* fold the corrections in once and finish with the fast kernel */
  Y(Tbmuleq)(&rec1p,corr_p);
  Y(Tbmuleq)(&rec2p,corr_p);
  Y(Tbmuleq)(&rec1m,corr_m);
  Y(Tbmuleq)(&rec2m,corr_m);
  Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
}
/* Zero-fills the object `var` in place. sizeof(var) is the full object size,
   so this also clears the variable-length arrays declared below. */
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
/* Inner conversion loop for the single m value job->ainfo->mval[mi].

   The rings are visited in the order given by idx[0 .. ulim-llim-1]; only the
   difference ulim-llim is used here, idx itself is not offset by llim.
   Rings are processed in batches of nval = nvec*VLEN. For each batch the
   cos(theta)/sin(theta) values from cth_/sth_ are packed into vector unions
   and handed to one of the calc_* kernels together with the per-job
   accumulators p1/p2.

   ALM2MAP:        kernel output (p1, p2) is written to job->phase.
   MAP2ALM:        job->phase is read into p1/p2 and passed to the kernel.
   ALM2MAP_DERIV1: nothing is done in this function.

   `done` is passed to the kernels by pointer. NOTE(review): presumably a
   kernel sets it once all remaining rings would contribute nothing; the
   kernels are not visible in this part of the file, so confirm there. */
static void Y(inner_loop) (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
int mi, const int *idx, int njobs)
{
/* number of rings handled per kernel call */
const int nval=nvec*VLEN;
const int m = job->ainfo->mval[mi];
Ylmgen_prepare (gen, m);
switch (job->type)
{
case ALM2MAP:
{
if (job->spin==0)
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
/* accumulators start at zero; if `done` is already set the kernel is
   skipped and these zeros are what gets written to job->phase */
Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* pad lanes past the end of the list with the last valid ring */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* padded lanes are not written back */
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
/* two phase slots per (ring entry, m, job): sum and difference */
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
job->phase[phas_idx] = r1+r2;
/* the second slot is only written when ispair[itot] is non-zero */
if (ispair[itot])
job->phase[phas_idx+1] = r1-r2;
}
}
}
}
}
else
{
int done=0;
for (int ith=0; ith<ulim-llim; ith+=nval)
{
/* same scheme as the spin-0 branch, with separate Q and U accumulators */
Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
if (!done)
{
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* pad lanes past the end of the list with the last valid ring */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
}
Y(calc_alm2map_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
if (itot<ulim-llim)
{
itot=idx[itot];
for (int j=0; j<njobs; ++j)
{
/* four phase slots per (ring entry, m, job):
   +0 = Q sum, +1 = Q difference, +2 = U sum, +3 = U difference */
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
job->phase[phas_idx] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[itot])
{
dcmplx *phQ = &(job->phase[phas_idx+1]),
*phU = &(job->phase[phas_idx+3]);
*phQ = q1-q2;
*phU = u1-u2;
/* the difference slots change sign when mhi-m+s is odd */
if ((gen->mhi-gen->m+gen->s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
}
}
}
}
}
}
break;
}
case ALM2MAP_DERIV1:
/* no work is performed for this job type in this function */
break;
case MAP2ALM:
{
if (job->spin==0)
{
int done=0;
/* unlike ALM2MAP, the loop stops as soon as `done` is set */
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* pad lanes past the end of the list with the last valid ring */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
/* padded lanes keep their zeroed p1/p2 entries */
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx ph1=job->phase[phas_idx];
/* the second slot is treated as 0 when ispair[itot] is zero */
dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
}
}
}
Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
else
{
int done=0;
for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
{
Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
Y(Tbu) cth;
for (int i=0; i<nval; ++i)
{
int itot=i+ith;
/* pad lanes past the end of the list with the last valid ring */
if (itot>=ulim-llim) itot=ulim-llim-1;
itot=idx[itot];
cth.s[i]=cth_[itot];
if (i+ith<ulim-llim)
{
for (int j=0; j<njobs; ++j)
{
int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
dcmplx p1Q=job->phase[phas_idx],
p1U=job->phase[phas_idx+2],
p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
/* same sign convention as in the ALM2MAP spin branch */
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
}
}
}
Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
}
}
break;
}
}
}
#undef VZERO

View file

@ -0,0 +1,222 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_geomhelpers.c
* Spherical transform library
*
* Copyright (C) 2006-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#include <math.h>
#include "sharp_geomhelpers.h"
#include "c_utils.h"
/* Builds the geometry of a HEALPix map with resolution parameter nside by
   delegating to sharp_make_weighted_healpix_geom_info() with a temporary
   array of 2*nside weights that are all set to 1. */
void sharp_make_healpix_geom_info (int nside, int stride,
  sharp_geom_info **geom_info)
{
  const int nweights = 2*nside;
  double *unit_weight = RALLOC(double,nweights);

  SET_ARRAY(unit_weight,0,nweights,1);
  sharp_make_weighted_healpix_geom_info (nside, stride, unit_weight,
    geom_info);

  DEALLOC(unit_weight);
}
/* Builds the geometry of a HEALPix map with resolution parameter nside.

   For each of the 4*nside-1 rings this computes colatitude, pixel count,
   azimuth of the first pixel, offset of the first pixel and the ring weight,
   then passes the arrays to sharp_make_geom_info().

   weight[] is indexed by (northern ring number - 1), so it needs 2*nside
   entries; each value is multiplied by 4*pi/npix.
   stride is stored as the per-ring pixel stride and scales every offset. */
void sharp_make_weighted_healpix_geom_info (int nside, int stride,
  const double *weight, sharp_geom_info **geom_info)
{
  const double pi=3.141592653589793238462643383279502884197;
  const ptrdiff_t npix=(ptrdiff_t)nside*nside*12;
  const ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1);
  const int nrings=4*nside-1;
  /* loop-invariant factor used for the equatorial-belt colatitudes */
  const double fact1 = (8.*nside)/npix;

  double *theta=RALLOC(double,nrings);
  double *weight_=RALLOC(double,nrings);
  int *nph=RALLOC(int,nrings);
  double *phi0=RALLOC(double,nrings);
  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
  int *stride_=RALLOC(int,nrings);

  for (int ring=1; ring<=nrings; ++ring)
    {
    const int m=ring-1;
    /* mirror rings beyond 2*nside onto their northern counterpart */
    ptrdiff_t northring;
    if (ring>2*nside)
      northring = 4*nside-ring;
    else
      northring = ring;

    stride_[m] = stride;
    if (northring >= nside)
      {
      /* equatorial belt: every ring has 4*nside pixels */
      double costheta = (2*nside-northring)*fact1;
      theta[m] = acos(costheta);
      nph[m] = 4*nside;
      phi0[m] = ((northring-nside) & 1) ? 0 : pi/nph[m];
      ofs[m] = (ncap + (northring-nside)*nph[m])*stride;
      }
    else
      {
      /* polar cap: ring number northring has 4*northring pixels */
      theta[m] = 2*asin(northring/(sqrt(6.)*nside));
      nph[m] = 4*northring;
      phi0[m] = pi/nph[m];
      ofs[m] = 2*northring*(northring-1)*stride;
      }

    if (northring != ring) /* southern hemisphere */
      {
      theta[m] = pi-theta[m];
      ofs[m] = (npix - nph[m])*stride - ofs[m];
      }
    weight_[m]=4.*pi/npix*weight[northring-1];
    }

  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_,
    geom_info);

  DEALLOC(theta);
  DEALLOC(weight_);
  DEALLOC(nph);
  DEALLOC(phi0);
  DEALLOC(ofs);
  DEALLOC(stride_);
}
static void gauleg (double x1, double x2, double *x, double *w, int n)
{
const double pi = 3.141592653589793238462643383279502884197;
const double eps = 3.0E-14;
int m = (n+1)/2;
double xm = 0.5*(x2+x1);
double xl = 0.5*(x2-x1);
for(int i=1; i<=m; ++i)
{
double z = cos(pi*(i-0.25)/(n+0.5));
double pp;
int dobreak=0;
while(1)
{
double p1 = 1.0, p2 = 0.0;
double z1 = z;
int j;
for(j=1; j<=n; ++j)
{
double p3 = p2;
p2 = p1;
p1 = ((2*j-1)*z*p2-(j-1)*p3)/j;
}
pp = n*(z*p1-p2)/(z*z-1);
z = z1 - p1/pp;
if (dobreak) break;
if (fabs(z-z1) <= eps) dobreak=1;
}
x[i-1] = xm - xl*z;
x[n-i] = xm + xl*z;
w[i-1] = w[n-i] = 2*xl/((1-z*z)*pp*pp);
}
}
static void makeweights (int bw, double *weights)
{
const double pi = 3.141592653589793238462643383279502884197;
const double fudge = pi/(4*bw);
for (int j=0; j<2*bw; ++j)
{
double tmpsum = 0;
for (int k=0; k<bw; ++k)
tmpsum += 1./(2*k+1) * sin((2*j+1)*(2*k+1)*fudge);
tmpsum *= sin((2*j+1)*fudge);
tmpsum *= 2./bw;
weights[j] = tmpsum;
/* weights[j + 2*bw] = tmpsum * sin((2*j+1)*fudge); */
}
}
/* Builds the geometry of a Gaussian grid with nrings rings of nphi pixels.

   gauleg() supplies the Gauss-Legendre nodes on [-1;1] and their weights.
   The nodes are converted to colatitudes with acos() and each weight is
   multiplied by 2*pi/nphi. Every ring starts at azimuth 0, has pixel stride
   stride_lon and starts at offset m*stride_lat. */
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
  int stride_lat, sharp_geom_info **geom_info)
{
  const double pi=3.141592653589793238462643383279502884197;
  /* common factor applied to every quadrature weight */
  const double wscale = 2*pi/nphi;

  double *theta=RALLOC(double,nrings);
  double *weight=RALLOC(double,nrings);
  int *nph=RALLOC(int,nrings);
  double *phi0=RALLOC(double,nrings);
  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
  int *stride_=RALLOC(int,nrings);

  /* theta[] temporarily holds the nodes (cosines) until the loop below */
  gauleg(-1,1,theta,weight,nrings);

  for (int ring=0; ring<nrings; ++ring)
    {
    theta[ring] = acos(theta[ring]);
    weight[ring] *= wscale;
    nph[ring] = nphi;
    phi0[ring] = 0;
    stride_[ring] = stride_lon;
    ofs[ring] = (ptrdiff_t)ring*stride_lat;
    }

  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight,
    geom_info);

  DEALLOC(theta);
  DEALLOC(weight);
  DEALLOC(nph);
  DEALLOC(phi0);
  DEALLOC(ofs);
  DEALLOC(stride_);
}
/* Builds the geometry of an equidistant grid with nrings rings of nphi
   pixels. nrings must be even; this is checked with UTIL_ASSERT.

   Ring m has colatitude (m+0.5)*pi/nrings and first-pixel azimuth phi0.
   The weights come from makeweights(nrings/2, ...) and are multiplied by
   2*pi/nphi. Each ring has pixel stride stride_lon and starts at offset
   m*stride_lat. */
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
  const double pi=3.141592653589793238462643383279502884197;
  /* common factor applied to every ring weight */
  const double wscale = 2*pi/nphi;

  double *theta=RALLOC(double,nrings);
  double *weight=RALLOC(double,nrings);
  int *nph=RALLOC(int,nrings);
  double *phi0_=RALLOC(double,nrings);
  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
  int *stride_=RALLOC(int,nrings);

  UTIL_ASSERT((nrings&1)==0,
    "Even number of rings needed for equidistant grid!");

  makeweights(nrings/2,weight);

  for (int ring=0; ring<nrings; ++ring)
    {
    theta[ring] = (ring+0.5)*pi/nrings;
    weight[ring] *= wscale;
    nph[ring] = nphi;
    phi0_[ring] = phi0;
    stride_[ring] = stride_lon;
    ofs[ring] = (ptrdiff_t)ring*stride_lat;
    }

  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
    geom_info);

  DEALLOC(theta);
  DEALLOC(weight);
  DEALLOC(nph);
  DEALLOC(phi0_);
  DEALLOC(ofs);
  DEALLOC(stride_);
}

View file

@ -0,0 +1,82 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_geomhelpers.h
* SHARP helper function for the creation of grid geometries
*
* Copyright (C) 2006-2011 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_GEOMHELPERS_H
#define PLANCK_SHARP_GEOMHELPERS_H
#include "sharp.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Creates a geometry information object describing a HEALPix map with an
Nside parameter \a nside. All rings get the relative weight 1.
\a stride is the index difference between two adjacent pixels in a ring;
all ring offsets are scaled by it.
The new object is returned in \a *geom_info.
\ingroup geominfogroup */
void sharp_make_healpix_geom_info (int nside, int stride,
sharp_geom_info **geom_info);
/*! Creates a geometry information object describing a HEALPix map with an
Nside parameter \a nside. \a weight contains the relative ring
weights and must have \a 2*nside entries; entry \a i applies to northern
ring \a i+1 and to its southern mirror ring.
\a stride is the index difference between two adjacent pixels in a ring;
all ring offsets are scaled by it.
The new object is returned in \a *geom_info.
\ingroup geominfogroup */
void sharp_make_weighted_healpix_geom_info (int nside, int stride,
const double *weight, sharp_geom_info **geom_info);
/*! Creates a geometry information object describing a Gaussian map with
\a nrings iso-latitude rings and \a nphi pixels per ring. The azimuth of the
first pixel in each ring is 0. The index difference between two adjacent
pixels in an iso-latitude ring is \a stride_lon, the index difference between
the two start pixels in consecutive iso-latitude rings is \a stride_lat.
The new object is returned in \a *geom_info.
\ingroup geominfogroup */
void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
int stride_lat, sharp_geom_info **geom_info);
/*! Creates a geometry information object describing an ECP map with
\a nrings iso-latitude rings and \a nphi pixels per ring. The azimuth of the
first pixel in each ring is \a phi0 (in radians). The index difference between
two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
difference between the two start pixels in consecutive iso-latitude rings
is \a stride_lat.
The new object is returned in \a *geom_info.
\note The spacing of pixel centers is equidistant in colatitude and
longitude.
\note \a nrings must be an even number.
\note The sphere is pixelized in a way that the colatitude of the first ring
is \a 0.5*(pi/nrings). There are no pixel centers at the poles.
\ingroup geominfogroup */
void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,57 @@
/* Instantiation driver for one value of nvec.

   Expects nvec, MAXJOB_SPECIAL and the CONCAT2/CONCAT3 macros to be defined
   by the including file. While this file is processed:
     Tb     names the type whose identifier is built from Tb and nvec,
     Y(x)   builds an identifier from x and nvec,
     Z(x)   builds an identifier from x, nvec and njobs.
   NOTE(review): the exact spelling of the generated names depends on
   CONCAT2/CONCAT3, which are not visible here. */
#define Tb CONCAT2(Tb,nvec)
#define Y(arg) CONCAT2(arg,nvec)
/* code that depends on nvec only */
#include "sharp_core_inc.c"
/* only included when fewer than 6 job counts get a dedicated instantiation
   below; presumably this is the variant taking njobs at run time -- confirm
   in sharp_core_inc3.c */
#if (MAXJOB_SPECIAL<6)
#include "sharp_core_inc3.c"
#endif
/* one inclusion of sharp_core_inc2.c per compile-time job count
   njobs = 1 .. MAXJOB_SPECIAL (at most 6) */
#if (MAXJOB_SPECIAL>=1)
#define njobs 1
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=2)
#define njobs 2
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=3)
#define njobs 3
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=4)
#define njobs 4
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=5)
#define njobs 5
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
#if (MAXJOB_SPECIAL>=6)
#define njobs 6
#define Z(arg) CONCAT3(arg,nvec,njobs)
#include "sharp_core_inc2.c"
#undef Z
#undef njobs
#endif
/* leave no helper macros defined, so the file can be included again with a
   different nvec */
#undef Y
#undef Tb

286
libsharp/sharp_mpi.c Normal file
View file

@ -0,0 +1,286 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_mpi.c
* Functionality only needed for MPI-parallel transforms
*
* Copyright (C) 2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifdef USE_MPI
#include "sharp_mpi.h"
/* Global description of how m values and ring pairs are distributed over the
   MPI tasks of a communicator. Filled by sharp_make_mpi_info() and released
   by sharp_destroy_mpi_info(). All arrays are owned by this structure. */
typedef struct
{
int ntasks; /* number of tasks */
int mytask; /* own task number */
MPI_Comm comm; /* communicator to use */
int *nm; /* number of m values on every task (ntasks entries) */
int *ofs_m; /* accumulated nm (ntasks+1 entries, ofs_m[0]==0) */
int nmtotal; /* total number of m values (must be mmax+1) */
int *mval; /* array containing all m values of task 0, task 1 etc. */
int mmax; /* result of sharp_get_mmax() applied to mval */
int nph; /* 2*nmaps*ntrans of the job: complex values per (ring pair, m) */
int *npair; /* number of ring pairs on every task (ntasks entries) */
int *ofs_pair; /* accumulated npair (ntasks+1 entries, ofs_pair[0]==0) */
int npairtotal; /* total number of ring pairs */
double *theta; /* theta of first ring of every pair on task 0, task 1 etc. */
int *ispair; /* is this really a pair? (non-zero if the second ring has pixels) */
/* for all2all communication: counts and displacements in units of MPI_DOUBLE;
   the displacement arrays have ntasks+1 entries, the last being the total */
int *almcount, *almdisp, *mapcount, *mapdisp; /* for all2all communication */
} sharp_mpi_info;
/* Fills *minfo with the global distribution of m values and ring pairs.

   Every task contributes the m values of job->ainfo and the ring pairs of
   job->ginfo; the per-task counts, running offsets and concatenated lists
   are gathered on all tasks. Finally the counts and displacements (in units
   of MPI_DOUBLE) for the all-to-all phase exchange are derived.
   All arrays allocated here are released by sharp_destroy_mpi_info(). */
static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
  sharp_mpi_info *minfo)
{
  minfo->comm = comm;
  MPI_Comm_size (comm, &minfo->ntasks);
  MPI_Comm_rank (comm, &minfo->mytask);
  const int ntasks = minfo->ntasks;

  /* m values: per-task counts, running offsets, concatenated list */
  minfo->nm=RALLOC(int,ntasks);
  MPI_Allgather ((int *)(&job->ainfo->nm),1,MPI_INT,minfo->nm,1,MPI_INT,comm);
  minfo->ofs_m=RALLOC(int,ntasks+1);
  minfo->ofs_m[0]=0;
  for (int t=0; t<ntasks; ++t)
    minfo->ofs_m[t+1] = minfo->ofs_m[t]+minfo->nm[t];
  minfo->nmtotal=minfo->ofs_m[ntasks];
  minfo->mval=RALLOC(int,minfo->nmtotal);
  MPI_Allgatherv(job->ainfo->mval, job->ainfo->nm, MPI_INT, minfo->mval,
    minfo->nm, minfo->ofs_m, MPI_INT, comm);
  minfo->mmax=sharp_get_mmax(minfo->mval,minfo->nmtotal);

  /* ring pairs: per-task counts and running offsets */
  minfo->npair=RALLOC(int,ntasks);
  MPI_Allgather ((int *)(&job->ginfo->npairs), 1, MPI_INT, minfo->npair, 1,
    MPI_INT, comm);
  minfo->ofs_pair=RALLOC(int,ntasks+1);
  minfo->ofs_pair[0]=0;
  for (int t=0; t<ntasks; ++t)
    minfo->ofs_pair[t+1] = minfo->ofs_pair[t]+minfo->npair[t];
  minfo->npairtotal=minfo->ofs_pair[ntasks];

  /* local theta / "is a real pair" lists, then gather them everywhere */
  double *theta_loc=RALLOC(double,job->ginfo->npairs);
  int *ispair_loc=RALLOC(int,job->ginfo->npairs);
  for (int p=0; p<job->ginfo->npairs; ++p)
    {
    theta_loc[p]=job->ginfo->pair[p].r1.theta;
    ispair_loc[p]=job->ginfo->pair[p].r2.nph>0;
    }
  minfo->theta=RALLOC(double,minfo->npairtotal);
  minfo->ispair=RALLOC(int,minfo->npairtotal);
  MPI_Allgatherv(theta_loc, job->ginfo->npairs, MPI_DOUBLE, minfo->theta,
    minfo->npair, minfo->ofs_pair, MPI_DOUBLE, comm);
  MPI_Allgatherv(ispair_loc, job->ginfo->npairs, MPI_INT, minfo->ispair,
    minfo->npair, minfo->ofs_pair, MPI_INT, comm);
  DEALLOC(theta_loc);
  DEALLOC(ispair_loc);

  /* all-to-all bookkeeping; the leading factor 2 converts complex to doubles */
  minfo->nph=2*job->nmaps*job->ntrans;
  minfo->almcount=RALLOC(int,ntasks);
  minfo->almdisp=RALLOC(int,ntasks+1);
  minfo->mapcount=RALLOC(int,ntasks);
  minfo->mapdisp=RALLOC(int,ntasks+1);
  minfo->almdisp[0]=minfo->mapdisp[0]=0;
  const int nm_own = minfo->nm[minfo->mytask];
  const int npair_own = minfo->npair[minfo->mytask];
  for (int t=0; t<ntasks; ++t)
    {
    minfo->almcount[t] = 2*minfo->nph*nm_own*minfo->npair[t];
    minfo->almdisp[t+1] = minfo->almdisp[t]+minfo->almcount[t];
    minfo->mapcount[t] = 2*minfo->nph*minfo->nm[t]*npair_own;
    minfo->mapdisp[t+1] = minfo->mapdisp[t]+minfo->mapcount[t];
    }
}
/* Releases every array that sharp_make_mpi_info() allocated inside *minfo.
   The structure itself is not freed. */
static void sharp_destroy_mpi_info (sharp_mpi_info *minfo)
{
  /* all-to-all bookkeeping */
  DEALLOC(minfo->mapdisp);
  DEALLOC(minfo->mapcount);
  DEALLOC(minfo->almdisp);
  DEALLOC(minfo->almcount);

  /* ring pair distribution */
  DEALLOC(minfo->ispair);
  DEALLOC(minfo->theta);
  DEALLOC(minfo->ofs_pair);
  DEALLOC(minfo->npair);

  /* m value distribution */
  DEALLOC(minfo->mval);
  DEALLOC(minfo->ofs_m);
  DEALLOC(minfo->nm);
}
/* Redistributes the phase array after the alm->phase step.

   On entry *ph holds the phases of the local m values for all ring pairs.
   On exit *ph is a newly allocated array holding the phases of the local
   ring pairs for all m, laid out as
     (*ph)[nph*(th*(mmax+1) + m) + i]
   with th the local ring pair index. The old array is freed.

   The new array is sized by mmax+1 columns because that is the stride used
   for indexing; sizing it by nmtotal would overflow the buffer whenever
   fewer than mmax+1 m values are present. Entries for m values that no task
   owns are zero. */
static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph)
{
  const int npair_own = minfo->npair[minfo->mytask];
  const int nmfull = minfo->mmax+1;
  /* mapdisp[] counts doubles, hence /2 for complex values */
  dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);

  MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp,
    minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm);

  DEALLOC(*ph);
  /* compute the size in ptrdiff_t to avoid int overflow for large problems */
  const ptrdiff_t nphase = (ptrdiff_t)minfo->nph*npair_own*nmfull;
  ALLOC(*ph,dcmplx,nphase);
  for (ptrdiff_t i=0; i<nphase; ++i)
    (*ph)[i] = 0.;

  /* reorder from [task][th][mi] (receive order) to [th][m] */
  for (int task=0; task<minfo->ntasks; ++task)
    for (int th=0; th<npair_own; ++th)
      for (int mi=0; mi<minfo->nm[task]; ++mi)
        {
        int m = minfo->mval[mi+minfo->ofs_m[task]];
        ptrdiff_t o1 = minfo->nph*((ptrdiff_t)th*nmfull + m);
        int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]);
        for (int i=0; i<minfo->nph; ++i)
          (*ph)[o1+i] = phas_tmp[o2+i];
        }

  DEALLOC(phas_tmp);
}
/* Redistributes the phase array after the map->phase step.

   On entry *ph holds the phases of the local ring pairs for all m, indexed
   as (*ph)[nph*(th*(mmax+1) + m) + i]. The data is first packed into send
   order [task][th][mi], then exchanged; on exit *ph is a newly allocated
   array with the phases of the local m values for all ring pairs. The old
   array is freed. */
static void sharp_communicate_map2alm (const sharp_mpi_info *minfo, dcmplx **ph)
{
  const int npair_own = minfo->npair[minfo->mytask];
  const int nm_own = minfo->nm[minfo->mytask];
  /* mapdisp[] counts doubles, hence /2 for complex values */
  dcmplx *sendbuf = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);

  for (int task=0; task<minfo->ntasks; ++task)
    {
    const int nm_task = minfo->nm[task];
    const int *mval_task = minfo->mval+minfo->ofs_m[task];
    for (int th=0; th<npair_own; ++th)
      for (int mi=0; mi<nm_task; ++mi)
        {
        int m = mval_task[mi];
        int dst = minfo->mapdisp[task]/2+minfo->nph*(mi+th*nm_task);
        int src = minfo->nph*(th*(minfo->mmax+1) + m);
        for (int i=0; i<minfo->nph; ++i)
          sendbuf[dst+i] = (*ph)[src+i];
        }
    }

  DEALLOC(*ph);
  ALLOC(*ph,dcmplx,minfo->nph*nm_own*minfo->npairtotal);

  MPI_Alltoallv (sendbuf,minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,
    *ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,minfo->comm);

  DEALLOC(sendbuf);
}
/* Allocates job->phase for the initial (pre-communication) layout:
   nmfull*ntheta entries for MAP2ALM jobs, nm*nthetafull entries otherwise,
   each multiplied by 2*ntrans*nmaps. */
static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
  int nmfull, int nthetafull)
{
  ptrdiff_t phase_size;

  if (job->type==MAP2ALM)
    phase_size = (ptrdiff_t)(nmfull)*ntheta;
  else
    phase_size = (ptrdiff_t)(nm)*nthetafull;

  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
}
/* Performs the alm2map phase exchange for every job type except MAP2ALM;
   does nothing for MAP2ALM jobs. */
static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
{
  if (job->type == MAP2ALM)
    return;

  sharp_communicate_alm2map (minfo,&job->phase);
}
/* Performs the map2alm phase exchange for MAP2ALM jobs; does nothing for
   any other job type. */
static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
{
  if (job->type != MAP2ALM)
    return;

  sharp_communicate_map2alm (minfo,&job->phase);
}
/* Executes \a job collectively on all tasks of the communicator \a comm.

   With a single task the call is forwarded to sharp_execute_job().
   Otherwise the stages are, in order:
     1. gather the global m / ring pair distribution (sharp_make_mpi_info),
     2. map->phase on the local rings, then the map2alm exchange (both only
        act where necessary for the job type),
     3. the inner loop over the local m values, run inside an OpenMP
        parallel region with one private job copy and one Ylm generator per
        thread,
     4. the alm2map exchange, then phase->map on the local rings.
   job->opcnt accumulates the per-thread operation counts and job->time
   receives the elapsed wall time. */
void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
{
double timer=wallTime();
int ntasks;
MPI_Comm_size(comm, &ntasks);
if (ntasks==1) /* fall back to scalar implementation */
{ sharp_execute_job (job); return; }
int lmax = job->ainfo->lmax;
job->norm_l = Ylmgen_get_norm (lmax, job->spin);
sharp_mpi_info minfo;
sharp_make_mpi_info(comm, job, &minfo);
/* clear output arrays if requested */
init_output (job);
alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
minfo.npairtotal);
/* cos/sin of the colatitude of every ring pair on every task */
double *cth = RALLOC(double,minfo.npairtotal),
*sth = RALLOC(double,minfo.npairtotal);
idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
{
cth[i] = cos(minfo.theta[i]);
sth[i] = sin(minfo.theta[i]);
stmp[i].s=sth[i];
stmp[i].i=i;
}
/* idx[] lists the ring pairs in the order produced by sorting with
   idx_compare; NOTE(review): presumably ordered by sin(theta) -- confirm
   against idx_compare, which is defined elsewhere */
qsort (stmp,minfo.npairtotal,sizeof(idxhelper),idx_compare);
int *idx = RALLOC(int,minfo.npairtotal);
for (int i=0; i<minfo.npairtotal; ++i)
idx[i]=stmp[i].i;
DEALLOC(stmp);
/* map->phase where necessary */
map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
map2alm_comm (job, &minfo);
#pragma omp parallel
{
/* per-thread copy of the job, so each thread has its own temporary alm
   storage and operation counter */
sharp_job ljob = *job;
Ylmgen_C generator;
Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
/* inner conversion loop */
inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
&generator, mi, idx);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
/* merge the per-thread operation count into the shared job */
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
alm2map_comm (job, &minfo);
/* phase->map where necessary */
phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
DEALLOC(cth);
DEALLOC(sth);
DEALLOC(idx);
DEALLOC(job->norm_l);
dealloc_phase (job);
sharp_destroy_mpi_info(&minfo);
job->time=wallTime()-timer;
}
#endif

48
libsharp/sharp_mpi.h Normal file
View file

@ -0,0 +1,48 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_mpi.h
* Interface for the spherical transform library with MPI support.
*
* Copyright (C) 2011,2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_SHARP_MPI_H
#define PLANCK_SHARP_MPI_H
#include <mpi.h>
#include "sharp.h"
#ifdef __cplusplus
extern "C" {
#endif
void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm);
#ifdef __cplusplus
}
#endif
#endif

243
libsharp/sharp_test.c Normal file
View file

@ -0,0 +1,243 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test.c
Accuracy test for libsharp's map analysis.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include "mpi.h"
#endif
#include "sharp.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "sharp_core.h"
typedef complex double dcmplx;
/* Returns a pseudo-random number from the half-open interval [min;max[,
   drawn from rand(). */
static double drand (double min, double max)
{
  const double span = max-min;
  const double nvalues = RAND_MAX+1.0;

  return min + span*rand()/nvalues;
}
/* Fills alm with random coefficients for every (l,m) described by helper.
   Real and imaginary parts are drawn with drand(-1,1); the imaginary part is
   0 for m==0. Coefficients with both l<spin and m<spin are set to 0. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
{
  for (int mi=0; mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    const int m_is_zero = (m==0);

    for (int l=m; l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
        {
        alm[sharp_alm_index(helper,l,mi)] = 0.;
        continue;
        }
      /* draw the real part first to keep the random sequence unchanged */
      double re = drand(-1,1);
      double im = m_is_zero ? 0 : drand(-1,1);
      alm[sharp_alm_index(helper,l,mi)] = re+_Complex_I*im;
      }
    }
}
/* For each of the ncomp components prints the RMS of (alm - alm2) divided by
   the RMS of alm, and the largest absolute difference of any real or
   imaginary part, over the first nalms coefficients. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  ptrdiff_t nalms, int ncomp)
{
  for (int comp=0; comp<ncomp; ++comp)
    {
    const dcmplx *ref=alm[comp], *cmp=alm2[comp];
    double sqdiff=0, sqref=0, maxdiff=0;

    for (ptrdiff_t k=0; k<nalms; ++k)
      {
      const double re=creal(ref[k]), im=cimag(ref[k]);
      const double dre=re-creal(cmp[k]), dim=im-cimag(cmp[k]);

      sqdiff+=dre*dre+dim*dim;
      sqref+=re*re+im*im;
      if (fabs(dre)>maxdiff) maxdiff=fabs(dre);
      if (fabs(dim)>maxdiff) maxdiff=fabs(dim);
      }

    const double rmsdiff=sqrt(sqdiff/nalms);
    const double rmsref=sqrt(sqref/nalms);
    printf("component %i: rms %e, maxerr %e\n",comp, rmsdiff/rmsref, maxdiff);
    }
}
/* Analyses map into alm and reports the deviation from alm_orig, then
   performs niter refinement steps. Each step synthesises a map from the
   current alm, subtracts it from the input map, and runs a map2alm job on
   the residual with the fourth build-job argument set to 1 instead of 0.
   NOTE(review): presumably that flag makes the job add to alm rather than
   overwrite it; confirm against sharpd_build_job.
   Timing and performance figures are printed after every transform. */
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
  ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter)
{
  const int maps_per_trans = (spin==0) ? 1 : 2;
  const int ncomp = ntrans*maps_per_trans;
  sharp_alm_info *alms;
  sharp_job job;

  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);

  /* plain analysis */
  sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
  sharp_execute_job(&job);
  printf("wall time for map2alm: %fs\n",job.time);
  printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);
  measure_errors(alm_orig,alm,nalms,ncomp);

  for (int step=1; step<=niter; ++step)
    {
    double **residual;
    ALLOC2D(residual,double,ncomp,npix);
    printf ("\niteration %i:\n", step);

    /* synthesise a map from the current alm ... */
    sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&residual[0],tinfo,alms,
      ntrans);
    sharp_execute_job(&job);
    printf("wall time for alm2map: %fs\n",job.time);
    printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);

    /* ... turn it into the residual with respect to the input map ... */
    for (int comp=0; comp<ncomp; ++comp)
      for (ptrdiff_t pix=0; pix<npix; ++pix)
        residual[comp][pix] = map[comp][pix]-residual[comp][pix];

    /* ... and analyse the residual */
    sharpd_build_job(&job,MAP2ALM,spin,1,&alm[0],&residual[0],tinfo,alms,
      ntrans);
    sharp_execute_job(&job);
    printf("wall time for map2alm: %fs\n",job.time);
    printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);

    DEALLOC2D(residual);
    measure_errors(alm_orig,alm,nalms,ncomp);
    }

  sharp_destroy_alm_info(alms);
}
/* Runs one accuracy test on the geometry tinfo: generates random a_lm
   (seeded with srand(4)), synthesises a map from them, then hands map and
   original a_lm to map2alm_iter() for analysis and error reporting. */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
{
  /* number of (l,m) pairs with 0<=m<=mmax and m<=l<=lmax */
  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int maps_per_trans = (spin==0) ? 1 : 2;
  const int ncomp = ntrans*maps_per_trans;

  double **map;
  ALLOC2D(map,double,ncomp,npix);

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);

  srand(4);
  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  for (int comp=0; comp<ncomp; ++comp)
    random_alm(alm[comp],alms,spin);

  dcmplx **alm2;
  ALLOC2D(alm2,dcmplx,ncomp,nalms);

  printf ("\niteration 0:\n");
  sharp_job job;
  sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
  sharp_execute_job(&job);
  printf("wall time for alm2map: %fs\n",job.time);
  printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);

  map2alm_iter(tinfo,map,alm,alm2,lmax,mmax,npix,nalms,spin,ntrans,niter);

  DEALLOC2D(map);
  DEALLOC2D(alm);
  DEALLOC2D(alm2);
  sharp_destroy_alm_info(alms);
}
/* Command line driver:
     sharp_test <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>
   Builds the requested grid geometry and runs check_accuracy() on it.
   Gauss grids use lmax+1 rings, ECP grids 2*lmax+2 rings; for HEALPix an
   nside below 1 is replaced by 1. */
int main(int argc, char **argv)
{
#ifdef USE_MPI
  MPI_Init(NULL,NULL);
#endif
  module_startup_c("sharp_test",argc,7,
    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",1);

  const char *geometry=argv[1];
  const int lmax=atoi(argv[2]);
  /* third argument: nphi for gauss/ecp, nside for healpix */
  const int gridpar=atoi(argv[3]);
  const int niter=atoi(argv[4]);
  const int spin=atoi(argv[5]);
  const int ntrans=atoi(argv[6]);

  printf("Testing map analysis accuracy.\n");
  printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);

  sharp_geom_info *tinfo;
  if (strcmp(geometry,"gauss")==0)
    {
    const int nrings=lmax+1;
    const int ppring=gridpar;
    const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
    printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
      nrings,ppring,(long)npix);
    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else if (strcmp(geometry,"ecp")==0)
    {
    const int nrings=2*lmax+2;
    const int ppring=gridpar;
    const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
    printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
      nrings,ppring,(long)npix);
    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else if (strcmp(geometry,"healpix")==0)
    {
    const int nside = (gridpar<1) ? 1 : gridpar;
    const ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
    printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
      nside,(long)npix);
    sharp_make_healpix_geom_info (nside, 1, &tinfo);
    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else
    UTIL_FAIL("unknown grid geometry");

#ifdef USE_MPI
  MPI_Finalize();
#endif
  return 0;
}

354
libsharp/sharp_test_mpi.c Normal file
View file

@ -0,0 +1,354 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_test_mpi.c
Accuracy test for libsharp's map analysis with MPI support.
This program first generates a_lm coefficients up to
a user-specified lmax (with mmax=lmax); where applicable, the
real and imaginary parts of the coefficients are uniform
random numbers of the interval [-1;1[.
Afterwards, the random a_lm are converted to a map.
This map is analyzed (optionally using an iterative scheme
with a user-supplied number of steps).
After every iteration, the code then outputs the RMS of the residual a_lm
(i.e. the difference between the current and original a_lm), divided by
the RMS of the original a_lm, as well as the maximum absolute change of any
real or imaginary part between the current and original a_lm.
This operation can be performed for several different pixelisations:
- a Gaussian with the minimal number of rings for exact analysis
and a user-defined ring resolution
- an ECP grid with the minimal number of rings for exact analysis
and a user-defined ring resolution
- a Healpix grid with a user-defined Nside parameter.
The user can specify the spin of the desired transform.
Copyright (C) 2006-2012 Max-Planck-Society
\author Martin Reinecke
*/
#ifdef USE_MPI
#include <stdio.h>
#include <string.h>
#include "sharp_mpi.h"
#include "sharp_geomhelpers.h"
#include "sharp_almhelpers.h"
#include "c_utils.h"
#include "walltime_c.h"
#include "sharp_core.h"
typedef complex double dcmplx;
int ntasks, mytask;
/* Returns the sum of val over all tasks of MPI_COMM_WORLD; every task
   receives the same result. */
static unsigned long long totalops (unsigned long long val)
{
  unsigned long long total;

  MPI_Allreduce (&val, &total, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM,
    MPI_COMM_WORLD);
  return total;
}
/* Returns the maximum of val over all tasks of MPI_COMM_WORLD; every task
   receives the same result. */
static double maxTime (double val)
{
  double slowest;

  MPI_Allreduce (&val, &slowest, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  return slowest;
}
/* Returns a pseudo-random number from the half-open interval [min;max[,
   drawn from rand(). */
static double drand (double min, double max)
{
  const double span = max-min;
  const double nvalues = RAND_MAX+1.0;

  return min + span*rand()/nvalues;
}
/* Returns the number of a_lm coefficients described by ainfo, i.e. the sum
   of (lmax - m + 1) over all stored m values. */
static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
{
  ptrdiff_t total=0;
  int mi=0;

  while (mi<ainfo->nm)
    {
    total += ainfo->lmax-ainfo->mval[mi]+1;
    ++mi;
    }
  return total;
}
/* Returns the number of pixels described by ginfo: for every ring pair the
   pixel count of the first ring, plus that of the second ring when it is
   positive. */
static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
{
  ptrdiff_t total=0;

  for (int ip=0; ip<ginfo->npairs; ++ip)
    {
    total += ginfo->pair[ip].r1.nph;
    /* a non-positive count marks an absent second ring */
    if (ginfo->pair[ip].r2.nph<=0) continue;
    total += ginfo->pair[ip].r2.nph;
    }
  return total;
}
/* Restricts ainfo in place to the m values owned by this task: entries
   mytask, mytask+ntasks, mytask+2*ntasks, ... are kept and moved to the
   front. mvstart is recomputed so that the retained m values are stored
   back to back, and nm is set to the number of retained entries. */
static void reduce_alm_info(sharp_alm_info *ainfo)
  {
  int nkept=0;
  ptrdiff_t offset=0;
  for (int src=mytask; src<ainfo->nm; src+=ntasks)
    {
    ainfo->mval[nkept]=ainfo->mval[src];
    ainfo->mvstart[nkept]=offset-ainfo->mval[nkept];
    offset+=ainfo->lmax-ainfo->mval[nkept]+1;
    ++nkept;
    }
  ainfo->nm=nkept;
  }
/* Restricts ginfo in place to the ring pairs owned by this task: pairs
   mytask, mytask+ntasks, mytask+2*ntasks, ... are kept and moved to the
   front. The ofs fields of both rings are recomputed so that the retained
   rings are stored back to back, and npairs is set to the number of retained
   pairs. */
static void reduce_geom_info(sharp_geom_info *ginfo)
  {
  int nkept=0;
  ptrdiff_t offset=0;
  for (int src=mytask; src<ginfo->npairs; src+=ntasks)
    {
    ginfo->pair[nkept]=ginfo->pair[src];
    ginfo->pair[nkept].r1.ofs=offset;
    offset+=ginfo->pair[nkept].r1.nph;
    /* r2.ofs is assigned even if the second ring has no pixels */
    ginfo->pair[nkept].r2.ofs=offset;
    if (ginfo->pair[nkept].r2.nph>0)
      offset+=ginfo->pair[nkept].r2.nph;
    ++nkept;
    }
  ginfo->npairs=nkept;
  }
/* Fills alm with random coefficients for all (l,m) described by helper.
   Real and imaginary parts are drawn with drand(-1,1); the imaginary part
   is 0 for m==0, and coefficients with l<spin and m<spin are set to 0.
   The generator is re-seeded for every m from the m value and a counter
   that is incremented on every call of this function. */
static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
  {
  static int ncalls=0;
  ++ncalls;
  for (int mi=0; mi<helper->nm; ++mi)
    {
    const int m=helper->mval[mi];
    srand(1234567*ncalls+8912*m);
    for (int l=m; l<=helper->lmax; ++l)
      {
      dcmplx *dst=&alm[sharp_alm_index(helper,l,mi)];
      if ((l<spin)&&(m<spin))
        {
        *dst = 0.;
        continue;
        }
      double re = drand(-1,1);
      double im = (m==0) ? 0 : drand(-1,1);
      *dst = re+_Complex_I*im;
      }
    }
  }
/* Compares the coefficient sets alm (reference) and alm2 component by
   component. For each of the ncomp components, task 0 prints the RMS of the
   difference divided by the RMS of alm, and the largest absolute difference
   of any real or imaginary part. All sums and maxima are taken over all
   tasks of MPI_COMM_WORLD. */
static void measure_errors (dcmplx **alm, dcmplx **alm2,
  const sharp_alm_info *ainfo, int ncomp)
  {
  long nalms=get_nalms(ainfo), nalms_tot;
  MPI_Allreduce(&nalms,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);

  for (int icomp=0; icomp<ncomp; ++icomp)
    {
    double errsq=0, refsq=0, maxdev=0;
    for (int mi=0; mi<ainfo->nm; ++mi)
      {
      const int m=ainfo->mval[mi];
      for (int l=m; l<=ainfo->lmax; ++l)
        {
        const ptrdiff_t idx=sharp_alm_index(ainfo,l,mi);
        const double re_ref=creal(alm[icomp][idx]),
                     im_ref=cimag(alm[icomp][idx]);
        const double dre=re_ref-creal(alm2[icomp][idx]),
                     dim=im_ref-cimag(alm2[icomp][idx]);
        errsq+=dre*dre+dim*dim;
        refsq+=re_ref*re_ref+im_ref*im_ref;
        const double adre=fabs(dre), adim=fabs(dim);
        if (adre>maxdev) maxdev=adre;
        if (adim>maxdev) maxdev=adim;
        }
      }

    double errsq_tot, refsq_tot, maxdev_tot;
    MPI_Allreduce(&errsq,&errsq_tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&refsq,&refsq_tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    MPI_Allreduce(&maxdev,&maxdev_tot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
    const double rms_err=sqrt(errsq_tot/nalms_tot);
    const double rms_ref=sqrt(refsq_tot/nalms_tot);
    if (mytask==0)
      printf("component %i: rms %e, maxerr %e\n",icomp, rms_err/rms_ref,
        maxdev_tot);
    }
  }
/* Analyses map into alm and optionally refines the result iteratively.

   tinfo:    geometry of the (task-local) map
   map:      input map components
   alm_orig: reference a_lm against which the errors are measured
   alm:      output a_lm
   lmax,mmax: extent of the triangular a_lm set (reduced to this task's share)
   npix:     number of pixels per map component on this task
   spin, ntrans: passed on to the transform jobs; the number of components
             is ntrans for spin 0 and 2*ntrans otherwise
   niter:    number of refinement iterations

   After the initial MAP2ALM, every iteration synthesises a map from the
   current alm, subtracts it from the input map and runs MAP2ALM on the
   residual map. That last job is built with 1 instead of 0 as fourth
   argument (presumably "add to output", so that the correction is
   accumulated into alm -- confirm against sharpd_build_job()).
   Timings and operation counts are reduced over all tasks and printed by
   task 0, followed by the output of measure_errors(). */
static void map2alm_iter (sharp_geom_info *tinfo, double **map,
  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
  ptrdiff_t npix, int spin, int ntrans, int niter)
  {
  int ncomp = ntrans*((spin==0) ? 1 : 2);

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  reduce_alm_info(alms);

  sharp_job job;
  sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
  sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
  unsigned long long opcnt=totalops(job.opcnt);
  double timer=maxTime(job.time);
  if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
  measure_errors(alm_orig,alm,alms,ncomp);

  for (int iter=0; iter<niter; ++iter)
    {
    double **map2;
    ALLOC2D(map2,double,ncomp,npix);
    if (mytask==0) printf ("\niteration %i:\n", iter+1);

    /* synthesise a map from the current a_lm estimate */
    sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map2[0],tinfo,alms,ntrans);
    sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
    opcnt=totalops(job.opcnt);
    timer=maxTime(job.time);
    if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);

    /* residual between the input map and the synthesised map */
    for (int i=0; i<ncomp; ++i)
      for (ptrdiff_t m=0; m<npix; ++m)
        map2[i][m] = map[i][m]-map2[i][m];

    /* analyse the residual map */
    sharpd_build_job(&job,MAP2ALM,spin,1,&alm[0],&map2[0],tinfo,alms,ntrans);
    sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
    opcnt=totalops(job.opcnt);
    timer=maxTime(job.time);
    /* timer already is the duration of the job; it must not be subtracted
       from the current wall clock time */
    if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
    DEALLOC2D(map2);
    measure_errors(alm_orig,alm,alms,ncomp);
    }

  sharp_destroy_alm_info(alms);
  }
/* Runs one accuracy test on the geometry tinfo: random a_lm up to
   (lmax,mmax) are generated for this task's share of m values, converted to
   a map with ALM2MAP, and then handed to map2alm_iter(), which analyses the
   map again (with niter refinement iterations) and reports the errors.
   npix is the number of pixels per map component on this task; the number
   of components is ntrans for spin 0 and 2*ntrans otherwise. */
static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
  {
  const int ncomp = (spin==0) ? ntrans : 2*ntrans;

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  reduce_alm_info(alms);
  const ptrdiff_t nalms=get_nalms(alms);

  double **map;
  ALLOC2D(map,double,ncomp,npix);
  dcmplx **alm, **alm2;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  ALLOC2D(alm2,dcmplx,ncomp,nalms);

  srand(4);
  for (int icomp=0; icomp<ncomp; ++icomp)
    random_alm(alm[icomp],alms,spin);

  if (mytask==0) printf ("\niteration 0:\n");
  sharp_job job;
  sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
  sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
  const unsigned long long opcnt=totalops(job.opcnt);
  const double timer=maxTime(job.time);
  if (mytask==0)
    {
    printf("wall time for alm2map: %fs\n",timer);
    printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
    }

  /* alm serves as reference, alm2 receives the re-analysed coefficients */
  map2alm_iter(tinfo, map, alm, alm2, lmax, mmax, npix, spin, ntrans, niter);

  DEALLOC2D(map);
  DEALLOC2D(alm);
  DEALLOC2D(alm2);
  sharp_destroy_alm_info(alms);
  }
/* Entry point of the MPI accuracy test.
   Command line: <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>
   The requested geometry is built, reduced to the ring pairs owned by this
   task, and passed to check_accuracy() with mmax equal to lmax. */
int main(int argc, char **argv)
  {
  MPI_Init(NULL,NULL);
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);

  module_startup_c("sharp_test_mpi",argc,7,
    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",
    mytask==0);

  const char *geometry=argv[1];
  const int lmax=atoi(argv[2]);
  const int niter=atoi(argv[4]);
  const int spin=atoi(argv[5]);
  const int ntrans=atoi(argv[6]);

  if (mytask==0)
    {
    printf("Testing map analysis accuracy.\n");
    printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
    }

  sharp_geom_info *tinfo;
  if (strcmp(geometry,"gauss")==0)
    {
    const int nrings=lmax+1;
    const int ppring=atoi(argv[3]);
    const ptrdiff_t npix_full=(ptrdiff_t)nrings*ppring;
    if (mytask==0)
      printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
        nrings,ppring,(long)npix_full);
    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
    reduce_geom_info(tinfo);
    /* from here on only the pixels owned by this task count */
    const ptrdiff_t npix_local=get_npix(tinfo);
    check_accuracy(tinfo,lmax,lmax,npix_local,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else if (strcmp(geometry,"ecp")==0)
    {
    const int nrings=2*lmax+2;
    const int ppring=atoi(argv[3]);
    const ptrdiff_t npix_full=(ptrdiff_t)nrings*ppring;
    if (mytask==0)
      printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
        nrings,ppring,(long)npix_full);
    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
    reduce_geom_info(tinfo);
    const ptrdiff_t npix_local=get_npix(tinfo);
    check_accuracy(tinfo,lmax,lmax,npix_local,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else if (strcmp(geometry,"healpix")==0)
    {
    int nside=atoi(argv[3]);
    if (nside<1) nside=1;
    const ptrdiff_t npix_full=12*(ptrdiff_t)nside*nside;
    if (mytask==0)
      printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
        nside,(long)npix_full);
    sharp_make_healpix_geom_info (nside, 1, &tinfo);
    reduce_geom_info(tinfo);
    const ptrdiff_t npix_local=get_npix(tinfo);
    check_accuracy(tinfo,lmax,lmax,npix_local,spin,ntrans,niter);
    sharp_destroy_geom_info(tinfo);
    }
  else
    UTIL_FAIL("unknown grid geometry");

  MPI_Finalize();
  return 0;
  }
#else
#include "c_utils.h"
/* Entry point for builds without USE_MPI: reports the missing MPI support
   via UTIL_FAIL; the return statement yields a non-zero status should
   UTIL_FAIL ever return. */
int main(void)
  {
  UTIL_FAIL("MPI support not enabled.");
  return 1;
  }
#endif

158
libsharp/vecsupport.h Normal file
View file

@ -0,0 +1,158 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/* \file vecsupport.h
* Convenience functions for vector arithmetic
*
* Copyright (C) 2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#ifndef VECSUPPORT_H
#define VECSUPPORT_H
#include <math.h>
#include "vec_utils.h"
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
#define vadd(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b))
#define vsub(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b))
#define vmul(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b))
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a))
#define vload(a) (a)
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) (((a)<(b))?1.:0.)
#define vgt(a,b) (((a)>(b))?1.:0.)
#define vne(a,b) (((a)!=(b))?1.:0.)
#define vand(a,b) ((((a)*(b))!=0.)?1.:0.)
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) ((a)!=0.)
#define vallTrue(a) ((a)!=0.)
#define vblend(m,a,b) (((m)!=0.) ? (a) : (b))
#define vzero 0.
#define vone 1.
#endif
#if (VLEN==2)
/* SSE2 variant (VLEN==2): Tv holds two doubles in an __m128d.
   The ...eq macros assign to their first argument, which therefore must be
   an lvalue. Comparison macros return per-lane bit masks as produced by the
   _mm_cmp*_pd intrinsics. */
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
#define vadd(a,b) _mm_add_pd(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vsub(a,b) _mm_sub_pd(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vmul(a,b) _mm_mul_pd(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
/* a += b*c and a -= b*c */
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
/* a += b*c + d*e and a += b*c - d*e */
#define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
/* flips the sign bit of both lanes */
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
/* broadcasts a scalar to both lanes */
#define vload(a) _mm_set1_pd(a)
/* clears the sign bit of both lanes */
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand(a,b) _mm_and_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
/* no trailing semicolon here: the macro must be usable inside expressions,
   like its VLEN==1 and VLEN==4 counterparts */
#define vmax(a,b) _mm_max_pd(a,b)
/* movemask collects the sign bits of both lanes into bits 0 and 1 */
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
/* selects a where mask m is set, b otherwise */
#if defined(__SSE4_1__)
#define vblend(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vzero _mm_setzero_pd()
#define vone _mm_set1_pd(1.)
#endif
/* AVX variant (VLEN==4): Tv holds four doubles in an __m256d.
   The ...eq macros assign to their first argument, which therefore must be
   an lvalue. Comparison macros return per-lane bit masks. */
#if (VLEN==4)
#include <immintrin.h>
typedef __m256d Tv;
#define vadd(a,b) _mm256_add_pd(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vsub(a,b) _mm256_sub_pd(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vmul(a,b) _mm256_mul_pd(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
/* a += b*c and a -= b*c */
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
/* a += b*c + d*e and a += b*c - d*e */
#define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
/* flips the sign bit of all lanes */
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
/* broadcasts a scalar to all lanes */
#define vload(a) _mm256_set1_pd(a)
/* clears the sign bit of all lanes */
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
/* NOTE(review): _CMP_NEQ_OQ is the ordered predicate (false if an operand is
   NaN), whereas the VLEN==2 variant uses _mm_cmpneq_pd and the VLEN==1
   variant uses !=, both of which are true for NaN operands -- confirm that
   this difference is intended. */
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand(a,b) _mm256_and_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
/* movemask collects the sign bits of the four lanes into bits 0..3 */
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
/* selects a where mask m is set, b otherwise */
#define vblend(m,a,b) _mm256_blendv_pd(b,a,m)
#define vzero _mm256_setzero_pd()
#define vone _mm256_set1_pd(1.)
#endif
#endif

206
libsharp/ylmgen_c.c Normal file
View file

@ -0,0 +1,206 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*
* Helper code for efficient calculation of Y_lm(theta,phi=0)
*
* Copyright (C) 2005-2012 Max-Planck-Society
* Author: Martin Reinecke
*/
#include <math.h>
#include <stdlib.h>
#include "ylmgen_c.h"
#include "c_utils.h"
/* Initialises the generator gen for calculations up to l=l_max and m=m_max
   at the given spin (which must be non-negative).

   Always allocated: the scale factor table cf with maxscale-minscale+1
   entries, where cf[-minscale]==1 and neighbouring entries differ by a
   factor of fbig (upwards) or fsmall (downwards).
   For spin==0: rf, mfac, root and iroot are allocated; mfac, root and iroot
   are filled here, rf is filled by Ylmgen_prepare().
   For spin!=0: fx, inv, flm1, flm2, prefac and fscale are allocated; fx is
   zeroed, the others are filled here. In this case spin and m_max must not
   exceed l_max.

   All allocations must be released with Ylmgen_destroy(). */
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
  {
  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;

  gen->lmax = l_max;
  gen->mmax = m_max;
  UTIL_ASSERT(spin>=0,"incorrect spin");
  gen->s = spin;
  UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale");
  gen->cf=RALLOC(double,maxscale-minscale+1);
  gen->cf[-minscale]=1.;
  for (int m=-minscale-1; m>=0; --m)
    gen->cf[m]=gen->cf[m+1]*fsmall;
  for (int m=-minscale+1; m<(maxscale-minscale+1); ++m)
    gen->cf[m]=gen->cf[m-1]*fbig;

  /* no m has been prepared yet */
  gen->m = -1;
  if (spin==0)
    {
    gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1);
    gen->mfac = RALLOC(double,gen->mmax+1);
    gen->mfac[0] = inv_sqrt4pi;
    for (int m=1; m<=gen->mmax; ++m)
      gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
    /* tables of sqrt(i) and 1/sqrt(i); iroot[0] is defined as 0 */
    gen->root = RALLOC(double,2*gen->lmax+5);
    gen->iroot = RALLOC(double,2*gen->lmax+5);
    for (int m=0; m<2*gen->lmax+5; ++m)
      {
      gen->root[m] = sqrt(m);
      gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
      }
    }
  else
    {
    /* The fac/facscale tables below have 2*lmax+1 entries and are accessed
       at index 2*max(spin,m) for all m<=mmax; larger spin or mmax values
       would read beyond their end. */
    UTIL_ASSERT((spin<=l_max)&&(m_max<=l_max),
      "l_max must not be smaller than spin or m_max");

    /* sentinel values which cannot match any real m */
    gen->m=gen->mlo=gen->mhi=-1234567890;
    ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2);
    for (int m=0; m<gen->lmax+2; ++m)
      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
    /* table of 1/i; inv[0] is defined as 0 */
    ALLOC(gen->inv,double,gen->lmax+1);
    gen->inv[0]=0;
    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
    ALLOC(gen->flm1,double,2*gen->lmax+1);
    ALLOC(gen->flm2,double,2*gen->lmax+1);
    for (int m=0; m<2*gen->lmax+1; ++m)
      {
      gen->flm1[m] = sqrt(1./(m+1.));
      gen->flm2[m] = sqrt(m/(m+1.));
      }
    ALLOC(gen->prefac,double,gen->mmax+1);
    ALLOC(gen->fscale,int,gen->mmax+1);

    /* fac[i] is the running product of sqrt(1)...sqrt(i), multiplied by
       fsmall whenever it exceeds 1; facscale[i] counts these rescalings */
    double *fac = RALLOC(double,2*gen->lmax+1);
    int *facscale = RALLOC(int,2*gen->lmax+1);
    fac[0]=1; facscale[0]=0;
    for (int m=1; m<2*gen->lmax+1; ++m)
      {
      fac[m]=fac[m-1]*sqrt(m);
      facscale[m]=facscale[m-1];
      if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; }
      }
    for (int m=0; m<=gen->mmax; ++m)
      {
      /* mlo/mhi: the smaller/larger of spin and m */
      int mlo=gen->s, mhi=m;
      if (mhi<mlo) SWAP(mhi,mlo,int);
      gen->prefac[m]=fac[2*mhi]/(fac[mhi+mlo]*fac[mhi-mlo]);
      gen->fscale[m]=facscale[2*mhi]-facscale[mhi+mlo]-facscale[mhi-mlo];
      }
    DEALLOC(fac);
    DEALLOC(facscale);
    }
  }
/* Releases all arrays allocated by Ylmgen_init(). Which arrays exist depends
   on whether the generator was set up for spin 0 or for a nonzero spin. */
void Ylmgen_destroy (Ylmgen_C *gen)
  {
  if (gen->s!=0)
    {
    /* arrays of the spin!=0 case */
    DEALLOC(gen->fx);
    DEALLOC(gen->prefac);
    DEALLOC(gen->fscale);
    DEALLOC(gen->flm1);
    DEALLOC(gen->flm2);
    DEALLOC(gen->inv);
    }
  else
    {
    /* arrays of the spin==0 case */
    DEALLOC(gen->rf);
    DEALLOC(gen->mfac);
    DEALLOC(gen->root);
    DEALLOC(gen->iroot);
    }
  /* the scale factor table exists in both cases */
  DEALLOC(gen->cf);
  }
/* Prepares gen for calculations at the given m (which must be non-negative).
   Nothing is done if m equals the m of the previous call.
   For spin 0 the entries m..lmax of rf are recomputed.
   For nonzero spin the fx table is recomputed, unless the pair
   (min(m,s),max(m,s)) is the same as in the previous call, and cosPow,
   sinPow, preMinus_p and preMinus_m are set. */
void Ylmgen_prepare (Ylmgen_C *gen, int m)
  {
  if (m==gen->m) return;
  UTIL_ASSERT(m>=0,"incorrect m");
  gen->m = m;

  if (gen->s==0)
    {
    gen->rf[m].f[0] = gen->root[2*m+3];
    gen->rf[m].f[1] = 0.;
    for (int l=m+1; l<=gen->lmax; ++l)
      {
      const double common=
        gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
      gen->rf[l].f[0] = common*gen->root[2*l+1];
      gen->rf[l].f[1] = common*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1];
      }
    return;
    }

  const int s=gen->s;
  /* smaller and larger one of m and s */
  const int lo = (s<m) ? s : m;
  const int hi = (s<m) ? m : s;
  const int unchanged = ((gen->mhi==hi) && (gen->mlo==lo));
  gen->mlo = lo;
  gen->mhi = hi;

  if (!unchanged)
    for (int l=hi; l<gen->lmax; ++l)
      {
      const double t1 = gen->flm1[l+m]*gen->flm1[l-m]
                       *gen->flm1[l+s]*gen->flm1[l-s];
      const double lt = 2*l+1;
      const double l1 = l+1;
      gen->fx[l+1].f[0]=l1*lt*t1;
      gen->fx[l+1].f[1]=m*s*gen->inv[l]*gen->inv[l+1];
      const double t2 = gen->flm2[l+m]*gen->flm2[l-m]
                       *gen->flm2[l+s]*gen->flm2[l-s];
      gen->fx[l+1].f[2]=t2*l1*gen->inv[l];
      }

  if (hi==m)
    {
    const int parity=((hi-s)&1);
    gen->cosPow = hi+s;
    gen->sinPow = hi-s;
    gen->preMinus_p = parity;
    gen->preMinus_m = parity;
    }
  else
    {
    gen->cosPow = hi+m;
    gen->sinPow = hi-m;
    gen->preMinus_p = 0;
    gen->preMinus_m = ((hi+m)&1);
    }
  }
/* Returns a newly allocated array with lmax+1 normalisation factors for the
   given spin. For spin 0 all factors are 1. Otherwise the factor is 0 for
   l<spin and +-0.5*sqrt((2l+1)/(4*pi)) for l>=spin, with the sign depending
   on the sign and parity of spin. */
double *Ylmgen_get_norm (int lmax, int spin)
  {
  const double pi = 3.141592653589793238462643383279502884197;
  double *res=RALLOC(double,lmax+1);

  if (spin==0)
    {
    for (int l=0; l<=lmax; ++l)
      res[l]=1.;
    return res;
    }

  /* sign convention for H=1 (LensPix paper) */
#if 1
  double sign = (spin>0) ? -1.0 : 1.0;
#else
  double sign = 1.0;
#endif
  /* odd spins get an additional sign flip */
  if (spin&1) sign = -sign;

  const double pref = sign*0.5;
  for (int l=0; l<=lmax; ++l)
    res[l] = (l<spin) ? 0. : pref*sqrt((2*l+1)/(4*pi));
  return res;
  }

92
libsharp/ylmgen_c.h Normal file
View file

@ -0,0 +1,92 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file ylmgen_c.h
* Code for efficient calculation of Y_lm(phi=0,theta)
*
* Copyright (C) 2005-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef PLANCK_YLMGEN_C_H
#define PLANCK_YLMGEN_C_H
#ifdef __cplusplus
extern "C" {
#endif
/* Range of scale indices covered by the cf table of Ylmgen_C:
   Ylmgen_init() allocates maxscale-minscale+1 entries and stores 1 at index
   -minscale. NOTE(review): limscale is not used by the code shown in
   ylmgen_c.c; presumably it is consumed by client code -- confirm. */
enum { minscale=-8, limscale=-3, maxscale=5 };
/* Rescaling factors 2^90 and 2^-90; neighbouring cf entries differ by one of
   these factors. */
static const double fbig=0x1p+90,fsmall=0x1p-90;
/* Groups of two and three doubles; element types of the rf and fx tables
   of Ylmgen_C. */
typedef struct { double f[2]; } ylmgen_dbl2;
typedef struct { double f[3]; } ylmgen_dbl3;
/* State of a Y_lm helper data generator. It is set up by Ylmgen_init(),
   updated by Ylmgen_prepare() and released by Ylmgen_destroy(). */
typedef struct
{
/* for public use; immutable during lifetime */
/* maximum l, maximum m and spin, as passed to Ylmgen_init() */
int lmax, mmax, s;
/* scale factors, maxscale-minscale+1 entries; cf[-minscale]==1, every higher
   entry is fbig times, every lower entry fsmall times its neighbour */
double *cf;
/* for public use; will typically change after call to Ylmgen_prepare() */
/* the m passed to the most recent effective call of Ylmgen_prepare() */
int m;
/* used if s==0 */
/* mmax+1 entries; mfac[0]=1/sqrt(4*pi), mfac[m]=mfac[m-1]*sqrt((2m+1)/(2m)) */
double *mfac;
/* lmax+1 entries; entries m..lmax are filled by Ylmgen_prepare() */
ylmgen_dbl2 *rf;
/* used if s!=0 */
/* set by Ylmgen_prepare() */
int sinPow, cosPow, preMinus_p, preMinus_m;
/* mmax+1 entries each, computed by Ylmgen_init(): prefactor per m and the
   number of fsmall rescalings belonging to it */
double *prefac;
int *fscale;
/* lmax+2 entries, zeroed by Ylmgen_init(); entries mhi+1..lmax are filled by
   Ylmgen_prepare() */
ylmgen_dbl3 *fx;
/* internal usage only */
/* used if s==0 */
/* 2*lmax+5 entries each: root[i]=sqrt(i), iroot[i]=1/sqrt(i), iroot[0]=0 */
double *root, *iroot;
/* used if s!=0 */
/* flm1[i]=sqrt(1/(i+1)) and flm2[i]=sqrt(i/(i+1)), 2*lmax+1 entries each;
   inv[i]=1/i with inv[0]=0, lmax+1 entries */
double *flm1, *flm2, *inv;
/* the smaller and the larger one of m and s, as of the last call to
   Ylmgen_prepare() */
int mlo, mhi;
} Ylmgen_C;
/*! Creates a generator which will calculate helper data for Y_lm calculation
up to \a l=l_max and \a m=m_max, for the spin \a spin.
\a spin must be non-negative. The generator allocates memory, which must be
released again by calling Ylmgen_destroy(). */
void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin);
/*! Deallocates a generator previously initialised by Ylmgen_init(). */
void Ylmgen_destroy (Ylmgen_C *gen);
/*! Prepares the object for the calculation at \a m.
\a m must be non-negative. If \a m is identical to the \a m of the
preceding call, the function returns immediately. */
void Ylmgen_prepare (Ylmgen_C *gen, int m);
/*! Returns a pointer to an array with \a lmax+1 entries containing
normalisation factors that must be applied to Y_lm values computed for
\a spin. The array must be deallocated (using free()) by the user.
For \a spin==0 all entries are 1; otherwise the entries with \a l<spin
are 0. */
double *Ylmgen_get_norm (int lmax, int spin);
#ifdef __cplusplus
}
#endif
#endif