more simplifications

2018-12-11 09:33:44 +01:00 · 2018-12-11 09:33:44 +01:00 · aee1a51ac2
commit aee1a51ac2
parent 10c5b5f7a9
6 changed files with 28 additions and 189 deletions
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@ -515,7 +515,7 @@ static void dealloc_phase (sharp_job *job)
  { DEALLOC(job->phase); }

 static void alloc_almtmp (sharp_job *job, int lmax)
-  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+1)); }
+  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); }

 static void dealloc_almtmp (sharp_job *job)
  { DEALLOC(job->almtmp); }
@ -534,6 +534,8 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
      job->almtmp[job->nalm*l+i] = expr_of_x;   \
      }                                                     \
+  for (int i=0; i<job->nalm; ++i)             \
+    job->almtmp[job->nalm*(lmax+1)+i] = 0;           \
  }

  if (job->type!=SHARP_MAP2ALM)
@ -852,8 +854,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
  init_output (job);

  int nchunks, chunksize;
-  get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
-    &chunksize);
+  get_chunk_info(job->ginfo->npairs,6*VLEN,&nchunks,&chunksize);
 //FIXME: needs to be changed to "nm"
  alloc_phase (job,mmax+1,chunksize);

@ -934,8 +935,6 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
  job->ginfo = geom_info;
  job->ainfo = alm_info;
  job->flags = flags;
-  if ((job->flags&SHARP_NVMAX)==0)
-    job->flags|=sharp_nv_oracle (type, spin);
  if (alm_info->flags&SHARP_REAL_HARMONICS)
    job->flags|=SHARP_REAL_HARMONICS;
  job->time = 0.;
@ -965,75 +964,6 @@ void sharp_set_nchunks_max(int new_nchunks_max)
 int sharp_get_nv_max (void)
 { return 6; }

-static int sharp_oracle (sharp_jobtype type, int spin)
-  {
-  int lmax=511;
-  int mmax=(lmax+1)/2;
-  int nrings=(lmax+1)/4;
-  int ppring=1;
-
-  spin = (spin!=0) ? 2 : 0;
-
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_geom_info *tinfo;
-  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = (spin==0) ? 1 : 2;
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-  SET_ARRAY(map[0],0,npix*ncomp,0.);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
-
-  double time=1e30;
-  int nvbest=-1;
-
-  for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
-    {
-    double time_acc=0.;
-    double jtime;
-    int ntries=0;
-    do
-      {
-      sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,
-        nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
-
-      if (jtime<time) { time=jtime; nvbest=nv; }
-      time_acc+=jtime;
-      ++ntries;
-      }
-    while ((time_acc<0.02)&&(ntries<2));
-    }
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-  return nvbest;
-  }
-
-int sharp_nv_oracle (sharp_jobtype type, int spin)
-  {
-  static const int maxtr = 6;
-  static int nv_opt[2][5] = {{0,0,0,0,0},{0,0,0,0,0}};
-
-  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
-  UTIL_ASSERT(type<5,"bad type");
-  UTIL_ASSERT(spin>=0, "bad spin");
-
-  if (nv_opt[spin!=0][type]==0)
-    nv_opt[spin!=0][type]=sharp_oracle(type,spin);
-  return nv_opt[spin!=0][type];
-  }
-
 #ifdef USE_MPI
 #include "sharp_mpi.c"

--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@ -217,11 +217,12 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
    for (int i=0; i<nvec; ++i)
+      {
      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(gen->rf[l].f[1])*lam_1.v[i];
-    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
+      }
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      below_limit = Y(TballLt)(scale,sharp_limscale);
    l+=2;
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@ -40,35 +40,10 @@

 typedef complex double dcmplx;

-// must be in the range [0;6]
-#define MAXJOB_SPECIAL 2
-
 #define XCONCATX(a,b) a##b
 #define CONCATX(a,b) XCONCATX(a,b)
 #define XCONCAT2(a,b) a##_##b
 #define CONCAT2(a,b) XCONCAT2(a,b)
-#define XCONCAT3(a,b,c) a##_##b##_##c
-#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
-
-#define nvec 1
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 2
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 3
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 4
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 5
-#include "sharp_core_inchelper.c"
-#undef nvec

 #define nvec 6
 #include "sharp_core_inchelper.c"
@ -78,27 +53,5 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
  const int *mlim)
  {
-  int nv=job->flags&SHARP_NVMAX;
-  switch (nv)
-    {
-    case 0x1:
-      CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x2:
-      CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x3:
-      CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x4:
-      CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x5:
-      CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    case 0x6:
-      CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-      return;
-    }
-  UTIL_FAIL("Incorrect vector parameters");
+  CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
  }
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@ -34,75 +34,44 @@ NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
  int l, int lmax)
  {
-  while (l<lmax)
+  while (l<=lmax)
    {
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-                 - vload(rf[l].f[1])*lam_1.v[i];
-    {
-    Tv ar=vload(creal(alm[l])),
-       ai=vload(cimag(alm[l]));
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ]));
+    Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
    for (int i=0; i<nvec; ++i)
      {
-      p1->r.v[i] += lam_2.v[i]*ar;
-      p1->i.v[i] += lam_2.v[i]*ai;
-      }
-    }
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    {
-    Tv ar=vload(creal(alm[l+1])),
-       ai=vload(cimag(alm[l+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      p2->r.v[i] += lam_1.v[i]*ar;
-      p2->i.v[i] += lam_1.v[i]*ai;
-      }
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
+      p1->r.v[i] += lam_2.v[i]*ar1;
+      p1->i.v[i] += lam_2.v[i]*ai1;
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
+      p2->r.v[i] += lam_1.v[i]*ar2;
+      p2->i.v[i] += lam_1.v[i]*ai2;
      }
    l+=2;
    }
-  if (l==lmax)
-    {
-    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
-    for (int i=0; i<nvec; ++i)
-      {
-      p1->r.v[i] += lam_2.v[i]*ar;
-      p1->i.v[i] += lam_2.v[i]*ai;
-      }
-    }
  }

 NOINLINE static void Z(map2alm_kernel) (const Tb cth,
  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
  {
-  while (l<lmax)
+  while (l<=lmax)
    {
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
-                   vmul(vload(rf[l].f[1]),lam_1.v[i]));
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
    for (int i=0; i<nvec; ++i)
      {
+      lam_1.v[i] = f10*(cth.v[i]*lam_2.v[i]) - f11*lam_1.v[i];
      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
-      }
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
-                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int i=0; i<nvec; ++i)
-      {
+      lam_2.v[i] = f20*(cth.v[i]*lam_1.v[i]) - f21*lam_2.v[i];
      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
      }
    l+=2;
    }
-  if (l==lmax)
-    for (int i=0; i<nvec; ++i)
-      {
-      atmp[2*l  ] += lam_2.v[i]*p1->r.v[i];
-      atmp[2*l+1] += lam_2.v[i]*p1->i.v[i];
-      }
  }

 NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
@ -187,10 +156,9 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
      }
    if (++l>lmax) return;
    for (int i=0; i<nvec; ++i)
+      {
      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l-1].f[1])*lam_1.v[i];
-    for (int i=0; i<nvec; ++i)
-      {
      Tv tmp=lam_1.v[i]*corfac.v[i];
      atmp[2*l  ]+=tmp*p2->r.v[i];
      atmp[2*l+1]+=tmp*p2->i.v[i];
@ -222,9 +190,6 @@ static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
    vfmaeq(px->qi.v[i],agi,lw);
    vfmaeq(px->ur.v[i],acr,lw);
    vfmaeq(px->ui.v[i],aci,lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
    Tv lx=vsub(rxm.v[i],rxp.v[i]);
    vfmseq(py->qr.v[i],aci,lx);
    vfmaeq(py->qi.v[i],acr,lx);
@ -249,9 +214,6 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
    Tv lx1=r2m.v[i]-r2p.v[i];
    Tv lw2=r1p.v[i]+r1m.v[i];
    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
@ -273,9 +235,6 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
    vfmaeq(agi,px->qi.v[i],lw);
    vfmaeq(acr,px->ur.v[i],lw);
    vfmaeq(aci,px->ui.v[i],lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
    Tv lx=vsub(rxm->v[i],rxp->v[i]);
    vfmseq(agr,py->ui.v[i],lx);
    vfmaeq(agi,py->ur.v[i],lx);
@ -444,9 +403,6 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
    Tv lw=vadd(rxp.v[i],rxm.v[i]);
    vfmaeq(px->qr.v[i],ar,lw);
    vfmaeq(px->qi.v[i],ai,lw);
-    }
-  for (int i=0; i<nvec; ++i)
-    {
    Tv lx=vsub(rxm.v[i],rxp.v[i]);
    vfmaeq(py->ur.v[i],ai,lx);
    vfmseq(py->ui.v[i],ar,lx);
@ -652,8 +608,8 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
      {
      if (job->spin==0)
        {
-        Tv atmp[2*(gen->lmax+1)];
-        memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv));
+        Tv atmp[2*(gen->lmax+2)];
+        memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
--- a/libsharp/sharp_lowlevel.h
+++ b/libsharp/sharp_lowlevel.h
@ -200,7 +200,6 @@ typedef enum { SHARP_DP              = 1<<4,

               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
-               SHARP_NVMAX           = (1<<4)-1 /* internal use only */
             } sharp_jobflags;

 /*! Performs a libsharp SHT job. The interface deliberately does not use
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@ -69,7 +69,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
  gen->m = -1;
  if (spin==0)
    {
-    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
+    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+2);
    gen->mfac = RALLOC(double,gen->mmax+1);
    gen->mfac[0] = inv_sqrt4pi;
    for (int m=1; m<=gen->mmax; ++m)
@ -159,7 +159,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
    {
    gen->rf[m].f[0] = gen->root[2*m+3];
    gen->rf[m].f[1] = 0.;
-    for (int l=m+1; l<=gen->lmax; ++l)
+    for (int l=m+1; l<=gen->lmax+1; ++l)
      {
      double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
      gen->rf[l].f[0] = tmp*gen->root[2*l+1];