From 782d13d843e17a59a83f973f4bb659fc4bce15c2 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 26 Dec 2012 00:31:51 +0100 Subject: [PATCH] better code re-use --- libsharp/planck.make | 2 +- libsharp/sharp_core_inc2.c | 815 ------------------ .../{sharp_core_inc3.c => sharp_core_inc4.c} | 147 ++-- libsharp/sharp_inchelper1.inc.c | 26 +- 4 files changed, 100 insertions(+), 890 deletions(-) delete mode 100644 libsharp/sharp_core_inc2.c rename libsharp/{sharp_core_inc3.c => sharp_core_inc4.c} (86%) diff --git a/libsharp/planck.make b/libsharp/planck.make index ad5aaac..bb1ee3c 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -14,7 +14,7 @@ LIBOBJ:=$(LIBOBJ:%=$(OD)/%) ALLOBJ:=$(ALLOBJ:%=$(OD)/%) ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) -$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c +$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc4.c $(OD)/sharp.o: $(SD)/sharp_mpi.c BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c deleted file mode 100644 index d980232..0000000 --- a/libsharp/sharp_core_inc2.c +++ /dev/null @@ -1,815 +0,0 @@ -/* - * This file is part of libsharp. - * - * libsharp is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * libsharp is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libsharp; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik - * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt - * (DLR). - */ - -/*! \file sharp_core_inc2.c - * Type-dependent code for the computational core - * - * Copyright (C) 2012 Max-Planck-Society - * \author Martin Reinecke - */ - -typedef struct - { Y(Tbri) j[njobs]; } Z(Tbrij); -typedef union - { Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij); -typedef struct - { Y(Tbqu) j[njobs]; } Z(Tbquj); -typedef union - { Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj); - -static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1, - Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, - int l, int lmax) - { -#if (njobs>1) - while (lj[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4); - vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4); - } - Tv ar3=vload(creal(alm[njobs*(l+1)+j])), - ai3=vload(cimag(alm[njobs*(l+1)+j])), - ar1=vload(creal(alm[njobs*(l+3)+j])), - ai1=vload(cimag(alm[njobs*(l+3)+j])); - for (int i=0; ij[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1); - vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1); - } - } - r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]); - for (int i=0; ij[j].r.v[i],lam_2.v[i],ar); - vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); - } - ar=vload(creal(alm[njobs*(l+1)+j])); - ai=vload(cimag(alm[njobs*(l+1)+j])); - for (int i=0; ij[j].r.v[i],lam_1.v[i],ar); - vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai); - } - } - r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); - for (int i=0; ij[j].r.v[i],lam_2.v[i],ar); - vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); - } - } - } - } - -static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1, - const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax) - { - while (lj[j].r.v[i]); - vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]); - } - for (int i=0; ij[j].r.v[i]); - vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]); - } - vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); - } - r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); - for (int i=0; ij[j].r.v[i]); - vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]); - } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } - } - } - -static void Z(calc_alm2map) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1, - Z(Tbrij) * restrict p2, int *done) - { - int l,lmax=gen->lmax; - Tb lam_1,lam_2,scale; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) { *done=1; return; } - job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; - - Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scale,sharp_minscale); - while (!full_ieee) - { - for (int j=0; jj[j].r.v[i],tmp,ar); - vfmaeq(p1->j[j].i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); - for (int i=0; ij[j].r.v[i],tmp,ar); - vfmaeq(p2->j[j].i.v[i],tmp,ai); - } - } - if (++l>lmax) break; - r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); - for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); - } - } - if (l>lmax) { *done=1; return; } - - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); - } - -static void Z(calc_map2alm) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1, - const Z(Tbrij) * restrict p2, int *done) - { - int lmax=gen->lmax; - Tb lam_1,lam_2,scale; - int l=gen->m; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); - job->opcnt += (l-gen->m) * 4*VLEN*nvec; - if (l>lmax) { *done=1; return; } - job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; - - const sharp_ylmgen_dbl2 * restrict rf = gen->rf; - Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); - dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scale,sharp_minscale); - while (!full_ieee) - { - for (int j=0; jj[j].r.v[i]); - vfmaeq(tim,tmp,p1->j[j].i.v[i]); - } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } - if (++l>lmax) { *done=1; return; } - Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); - for (int i=0; ij[j].r.v[i]); - vfmaeq(tim,tmp,p2->j[j].i.v[i]); - } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } - if (++l>lmax) { *done=1; return; } - r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); - for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); - } - } - - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); - } - -static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm) - { - for (int j=0; jj[j].qr.v[i],agr,lw); - vfmaeq(px->j[j].qi.v[i],agi,lw); - vfmaeq(px->j[j].ur.v[i],acr,lw); - vfmaeq(px->j[j].ui.v[i],aci,lw); - } - for (int i=0; ij[j].qr.v[i],aci,lx); - vfmaeq(py->j[j].qi.v[i],acr,lx); - vfmaeq(py->j[j].ur.v[i],agi,lx); - vfmseq(py->j[j].ui.v[i],agr,lx); - } - } - } - -static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, - const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, - const dcmplx * restrict alm1, const dcmplx * restrict alm2) - { - for (int j=0; jj[j].qr.v[i],agr1,lw1,aci2,lx2); - vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2); - vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2); - vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2); - } - for (int i=0; ij[j].qr.v[i],agr2,lw2,aci1,lx1); - vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1); - vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1); - vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1); - } - } - } - -static inline void Z(saddstep2) (const Z(Tbquj) * restrict px, - const Z(Tbquj) * restrict py, const Tb * restrict rxp, - const Tb * restrict rxm, dcmplx * restrict alm) - { - for (int j=0; jv[i],rxm->v[i]); - vfmaeq(agr,px->j[j].qr.v[i],lw); - vfmaeq(agi,px->j[j].qi.v[i],lw); - vfmaeq(acr,px->j[j].ur.v[i],lw); - vfmaeq(aci,px->j[j].ui.v[i],lw); - } - for (int i=0; iv[i],rxp->v[i]); - vfmseq(agr,py->j[j].ui.v[i],lx); - vfmaeq(agi,py->j[j].ur.v[i],lx); - vfmaeq(acr,py->j[j].qi.v[i],lx); - vfmseq(aci,py->j[j].qr.v[i],lx); - } - vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); - } - } - -static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1, - Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, - const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax) - { - while (l1) - Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], - &alm[2*njobs*(l+1)]); -#else - Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]); - Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]); -#endif - fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); - fx2=vload(fx[l+2].f[2]); - for (int i=0; ilmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) - { *done=1; return; } - job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Z(saddstep)(p1, p2, - Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]); - if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Z(saddstep)(p2, p1, - Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]); - if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - if (l>lmax) - { *done=1; return; } - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(alm2map_spin_kernel) (cth,p1,p2, - rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); - } - -static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, - sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2, - int *done) - { - int l, lmax=gen->lmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) { *done=1; return; } - job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); - Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]); - if (++l>lmax) { *done=1; return; } - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); - Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]); - if (++l>lmax) { *done=1; return; } - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); - } - -static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, - Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm, - const dcmplx * restrict alm) - { - for (int j=0; jj[j].qr.v[i],ar,lw); - vfmaeq(px->j[j].qi.v[i],ai,lw); - } - for (int i=0; ij[j].ur.v[i],ai,lx); - vfmseq(py->j[j].ui.v[i],ar,lx); - } - } - } - -static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1, - Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, - const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, - int lmax) - { - while (llmax; - Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); - job->opcnt += (l-gen->m) * 10*VLEN*nvec; - if (l>lmax) - { *done=1; return; } - job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec; - - const sharp_ylmgen_dbl3 * restrict fx = gen->fx; - Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - while (!full_ieee) - { - Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[njobs*l]); - if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[njobs*l]); - if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); - } - } - - if (l>lmax) - { *done=1; return; } - - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax); - } - -#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) - -static void Z(inner_loop) (sharp_job *job, const int *ispair, - const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *idx) - { - const int nval=nvec*VLEN; - const int m = job->ainfo->mval[mi]; - sharp_Ylmgen_prepare (gen, m); - - switch (job->type) - { - case SHARP_ALM2MAP: - case SHARP_ALM2MAP_DERIV1: - { - if (job->spin==0) - { - int done=0; - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - } - Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done); - } - - for (int i=0; iainfo->nm+mi)); - complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I, - r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I; - job->phase[phas_idx] = r1+r2; - if (ispair[itot]) - job->phase[phas_idx+1] = r1-r2; - } - } - } - } - } - else - { - int done=0; - for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; - } - (job->type==SHARP_ALM2MAP) ? - Z(calc_alm2map_spin ) (cth.b,gen,job,&p1.b,&p2.b,&done) : - Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done); - } - - for (int i=0; iainfo->nm+mi)); - complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I, - q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I, - u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I, - u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I; - job->phase[phas_idx] = q1+q2; - job->phase[phas_idx+2] = u1+u2; - if (ispair[itot]) - { - dcmplx *phQ = &(job->phase[phas_idx+1]), - *phU = &(job->phase[phas_idx+3]); - *phQ = q1-q2; - *phU = u1-u2; - if ((gen->mhi-gen->m+gen->s)&1) - { *phQ=-(*phQ); *phU=-(*phU); } - } - } - } - } - } - } - break; - } - case SHARP_MAP2ALM: - { - if (job->spin==0) - { - int done=0; - for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; - if (i+ithainfo->nm+mi)); - dcmplx ph1=job->phase[phas_idx]; - dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; - p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2); - p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2); - } - } - } - Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done); - } - } - else - { - int done=0; - for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; - itot=idx[itot]; - cth.s[i]=cth_[itot]; - if (i+ithainfo->nm+mi)); - dcmplx p1Q=job->phase[phas_idx], - p1U=job->phase[phas_idx+2], - p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., - p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; - if ((gen->mhi-gen->m+gen->s)&1) - { p2Q=-p2Q; p2U=-p2U; } - p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q); - p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U); - p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q); - p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U); - } - } - } - Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done); - } - } - break; - } - default: - { - UTIL_FAIL("must not happen"); - break; - } - } - } - -#undef VZERO diff --git a/libsharp/sharp_core_inc3.c b/libsharp/sharp_core_inc4.c similarity index 86% rename from libsharp/sharp_core_inc3.c rename to libsharp/sharp_core_inc4.c index c46780e..e36527f 100644 --- a/libsharp/sharp_core_inc3.c +++ b/libsharp/sharp_core_inc4.c @@ -22,17 +22,19 @@ * (DLR). */ -/*! \file sharp_core_inc3.c +/*! \file sharp_core_inc4.c * Type-dependent code for the computational core * * Copyright (C) 2012 Max-Planck-Society * \author Martin Reinecke */ -static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, +static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, - int l, int lmax, int njobs) + int l, int lmax NJ1) + { +if (njobs>1) { while (llmax; Tb lam_1,lam_2,scale; @@ -213,12 +216,12 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, if (l>lmax) { *done=1; return; } Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs); + Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); } -static void Y(calc_map2alm) (const Tb cth, const Tb sth, +static void Z(calc_map2alm) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2, int njobs, int *done) + const Y(Tbri) * restrict p2 NJ1, int *done) { int lmax=gen->lmax; Tb lam_1,lam_2,scale; @@ -273,11 +276,11 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth, } Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs); + Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); } -static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs) +static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1) { for (int j=0; j1) + Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], + &alm[2*njobs*(l+1)] NJ2); +#else + Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2); + Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)] NJ2); +#endif +#else + Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], + &alm[2*njobs*(l+1)] NJ2); +#endif fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx2=vload(fx[l+2].f[2]); for (int i=0; ilmax; @@ -447,12 +460,12 @@ static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { - Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[2*njobs*l],njobs); + Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + &alm[2*njobs*l] NJ2); if (++l>lmax) break; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[2*njobs*l], njobs); + Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + &alm[2*njobs*l] NJ2); if (++l>lmax) break; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -469,13 +482,13 @@ static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax, njobs); + Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + lmax NJ2); } -static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, - sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2, - int njobs, int *done) +static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, + sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 + NJ1, int *done) { int l, lmax=gen->lmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; @@ -494,11 +507,11 @@ static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, while (!full_ieee) { Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); - Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs); + Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2); if (++l>lmax) { *done=1; return; } Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); - Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs); + Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2); if (++l>lmax) { *done=1; return; } Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -512,11 +525,11 @@ static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs); + Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2); } -static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, - const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs) +static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1) { for (int j=0; jlmax; @@ -590,12 +603,12 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, && Y(TballGe)(scalem,sharp_minscale); while (!full_ieee) { - Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), - &alm[njobs*l],njobs); + Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + &alm[njobs*l] NJ2); if (++l>lmax) break; Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), - &alm[njobs*l], njobs); + Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + &alm[njobs*l] NJ2); if (++l>lmax) break; Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) @@ -612,16 +625,16 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen, Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, - lmax, njobs); + Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + lmax NJ2); } #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) -static void Y(inner_loop) (sharp_job *job, const int *ispair, +static void Z(inner_loop) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, - sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs) + sharp_Ylmgen_C *gen, int mi, const int *idx NJ1) { const int nval=nvec*VLEN; const int m = job->ainfo->mval[mi]; @@ -649,7 +662,7 @@ static void Y(inner_loop) (sharp_job *job, const int *ispair, itot=idx[itot]; cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; } - Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done); } for (int i=0; itype==SHARP_ALM2MAP) ? - Y(calc_alm2map_spin ) - (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) : - Y(calc_alm2map_deriv1) - (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + Z(calc_alm2map_spin ) + (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done) : + Z(calc_alm2map_deriv1) + (cth.b,gen,job,&p1[0].b,&p2[0].b NJ2,&done); } for (int i=0; iMAXJOB_SPECIAL) -#include "sharp_core_inc3.c" +#define NJ1 ,int njobs +#define NJ2 , njobs +#define Z(arg) CONCAT2(arg,nvec) +#include "sharp_core_inc4.c" +#undef Z +#undef NJ1 +#undef NJ2 #endif +#define NJ1 +#define NJ2 + #if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1)) #define njobs 1 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif @@ -17,7 +26,7 @@ #if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2)) #define njobs 2 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif @@ -25,7 +34,7 @@ #if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3)) #define njobs 3 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif @@ -33,7 +42,7 @@ #if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4)) #define njobs 4 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif @@ -41,7 +50,7 @@ #if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5)) #define njobs 5 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif @@ -49,10 +58,13 @@ #if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6)) #define njobs 6 #define Z(arg) CONCAT3(arg,nvec,njobs) -#include "sharp_core_inc2.c" +#include "sharp_core_inc4.c" #undef Z #undef njobs #endif +#undef NJ1 +#undef NJ2 + #undef Y #undef Tb