From 7928e13156681cf533d253b1899c6ff53d22a7de Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 12 Jul 2012 16:23:27 +0200
Subject: [PATCH] perform autotuning on the fly

---
 Makefile                    | 17 ++++++---
 README.compilation          |  9 ++---
 libsharp/planck.make        |  2 +-
 libsharp/sharp.c            | 76 ++++++++++++++++++++++++++++++++++++-
 libsharp/sharp_oracle.inc   |  9 -----
 libsharp/sharp_vecsupport.h |  6 +--
 6 files changed, 91 insertions(+), 28 deletions(-)
 delete mode 100644 libsharp/sharp_oracle.inc

diff --git a/Makefile b/Makefile
index 3d6f20e..70d457c 100644
--- a/Makefile
+++ b/Makefile
@@ -26,15 +26,9 @@ $(all_lib): %: | $(LIBDIR)_mkdir
 $(all_cbin): %: | $(BINDIR)_mkdir
 	@echo "#  linking C binary $*"
 	$(CL) -o $@ $^ $(CLFLAGS)
-#	$(CXX) -o $@ $^ $(CLFLAGS)
 
 compile_all: $(all_cbin) hdrcopy
 
-autotune: sharp_bench
-	$(BINDIR)/sharp_bench
-	mv sharp_oracle.inc $(SRCROOT)/libsharp
-	$(MAKE)
-
 hdrclean:
 	@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
 
@@ -48,3 +42,14 @@ test: compile_all
 	$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \
 	$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \
 	$(BINDIR)/sharp_test gauss 2047 4096 0 0 2
+# perftest: runs sharp_test on one healpix case and on gauss cases whose two size arguments double from 63/128 up to 8191/16384; the '&&' chain stops at the first failing run.
+perftest: compile_all
+	$(BINDIR)/sharp_test healpix 2048 1024 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 63 128 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 127 256 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 255 512 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 511 1024 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 1023 2048 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 2047 4096 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 4095 8192 0 0 1 && \
+	$(BINDIR)/sharp_test gauss 8191 16384 0 0 1
diff --git a/README.compilation b/README.compilation
index 7607750..8a8dfca 100644
--- a/README.compilation
+++ b/README.compilation
@@ -3,12 +3,9 @@ GNU make and GNU gcc (version 4.x) are required for compilation.
 Simply run "./configure"; if this fails, please refer to the output of
 "./configure --help" for additional hints and, if necessary, provide
 additional flags to the configure script.
-Once the script finishes successfully, run "make autotune"
-(or "gmake autotune"). This should perform some necessary self-tuning and
-install the compilation products in the subdirectory "auto/".
-NOTE: Autotuning should be done on the the computer where you wish to use
-the library later on, and no other CPU-intensive tasks should be running
-during the autotuning process.
+Once the script finishes successfully, run "make"
+(or "gmake"). This should install the compilation products in the
+subdirectory "auto/".
 
 Documentation can be created by the command "(g)make doc".
 However this requires the doxygen application to be installed
diff --git a/libsharp/planck.make b/libsharp/planck.make
index 6c72945..87a9b0d 100644
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
 
 ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
 $(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
-$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc
+$(OD)/sharp.o: $(SD)/sharp_mpi.c
 BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
 
 $(LIB_$(PKG)): $(LIBOBJ)
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index d24753b..1c6ccd3 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -37,6 +37,8 @@
 #include "sharp_core.h"
 #include "sharp_vecutil.h"
 #include "walltime_c.h"
+#include "sharp_almhelpers.h"
+#include "sharp_geomhelpers.h"
 
 typedef complex double dcmplx;
 typedef complex float  fcmplx;
@@ -585,14 +587,84 @@ void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
 int sharp_get_nv_max (void)
 { return 6; }
 
+static int sharp_oracle (sharp_jobtype type, int spin, int ntrans) /* on-the-fly nv benchmark */
+  { /* Runs a fixed test job one or more times for each nv in 1..sharp_get_nv_max() and returns the nv of the fastest single run. */
+  int lmax=127; /* fixed problem size, used only for this timing run */
+  int mmax=(lmax+1)/2;
+  int nrings=(lmax+1)/4;
+  int ppring=1; /* pixels per ring */
+  /* test geometry holding nrings*ppring pixels in total */
+  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+  sharp_geom_info *tinfo;
+  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
+  /* number of coefficients with m<=mmax and m<=l<=lmax: sum over m of (lmax-m+1) */
+  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
+  int ncomp = ntrans*((spin==0) ? 1 : 2); /* spin 0: one component per transform, otherwise two */
+  /* zero-filled map and a_lm buffers; they are never inspected here, only job.time is read */
+  double **map;
+  ALLOC2D(map,double,ncomp,npix);
+  SET_ARRAY(map[0],0,npix*ncomp,0.);
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,ncomp,nalms);
+  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
+  /* best single-run time so far and the nv that achieved it (-1: none yet) */
+  double time=1e30;
+  int nvbest=-1;
+
+  for (int nv=1; nv<=sharp_get_nv_max(); ++nv) /* one candidate nv per iteration */
+    {
+    double time_acc=0.;
+    sharp_job job;
+    sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+    job.nv=nv; /* force the candidate value for this job */
+    do
+      {
+      sharp_execute_job(&job);
+      /* job.time: duration reported for the run just executed (presumably wall-clock seconds - TODO confirm) */
+      if (job.time<time) { time=job.time; nvbest=nv; }
+      time_acc+=job.time;
+      }
+    while (time_acc<0.02); /* keep repeating until this nv has accumulated 0.02 of reported time */
+    }
+  /* free the temporary buffers and descriptors */
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+
+  sharp_destroy_alm_info(alms);
+  sharp_destroy_geom_info(tinfo);
+  return nvbest;
+  }
+/* Returns the nv to use for (type, spin, ntrans): measured once per table slot via sharp_oracle(), then served from the static cache. */
 int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
   {
+  static const int maxtr = 6; /* ntrans values above this share the last table row */
+  static int nv_opt[6][2][3] = { /* cache indexed [ntrans-1][spin!=0][type]; 0 means "not measured yet" */
+    {{0,0,0},{0,0,0}},
+    {{0,0,0},{0,0,0}},
+    {{0,0,0},{0,0,0}},
+    {{0,0,0},{0,0,0}},
+    {{0,0,0},{0,0,0}},
+    {{0,0,0},{0,0,0}} };
+  /* in_oracle is non-zero while sharp_oracle() runs below; a call arriving then gets -20 instead of starting another measurement */
+  static int in_oracle=0;
+  if (in_oracle) return -20; /* NOTE(review): presumably reached when the job builder used by sharp_oracle() calls back here - confirm */
+  /* NOTE(review): the static cache and flag are updated without any locking visible in this function */
   if (type==ALM2MAP_DERIV1) spin=1;
   UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
   UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
-#include "sharp_oracle.inc"
+  ntrans=IMIN(ntrans,maxtr); /* clamp so that ntrans-1 stays inside the table */
 
-  return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
+  if (nv_opt[ntrans-1][spin!=0][type]==0) /* first request for this slot: measure once and store the result */
+    {
+    in_oracle=1;
+    nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
+    in_oracle=0;
+    }
+  return nv_opt[ntrans-1][spin!=0][type]; /* cached (or just measured) value */
   }
 
 #include "sharp_mpi.c"
diff --git a/libsharp/sharp_oracle.inc b/libsharp/sharp_oracle.inc
deleted file mode 100644
index 40c9ed0..0000000
--- a/libsharp/sharp_oracle.inc
+++ /dev/null
@@ -1,9 +0,0 @@
-static const int maxtr = 6;
-static const int nv_opt[6][2][3] = {
-{{4,2,-1},{2,1,1}},
-{{5,2,-1},{2,1,1}},
-{{5,2,-1},{5,2,2}},
-{{5,2,-1},{5,2,2}},
-{{5,2,-1},{5,2,2}},
-{{5,2,-1},{5,2,2}}
-};
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 2c01b1d..fcfd253 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -137,10 +137,8 @@ typedef __m256d Tv;
 #ifdef __FMA4__
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
+#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) /* a = a + b*c + d*e */
+#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) /* a = a + b*c - d*e */
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))