From 7928e13156681cf533d253b1899c6ff53d22a7de Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Thu, 12 Jul 2012 16:23:27 +0200 Subject: [PATCH] perform autotuning on the fly --- Makefile | 17 ++++++--- README.compilation | 9 ++--- libsharp/planck.make | 2 +- libsharp/sharp.c | 76 ++++++++++++++++++++++++++++++++++++- libsharp/sharp_oracle.inc | 9 ----- libsharp/sharp_vecsupport.h | 6 +-- 6 files changed, 91 insertions(+), 28 deletions(-) delete mode 100644 libsharp/sharp_oracle.inc diff --git a/Makefile b/Makefile index 3d6f20e..70d457c 100644 --- a/Makefile +++ b/Makefile @@ -26,15 +26,9 @@ $(all_lib): %: | $(LIBDIR)_mkdir $(all_cbin): %: | $(BINDIR)_mkdir @echo "# linking C binary $*" $(CL) -o $@ $^ $(CLFLAGS) -# $(CXX) -o $@ $^ $(CLFLAGS) compile_all: $(all_cbin) hdrcopy -autotune: sharp_bench - $(BINDIR)/sharp_bench - mv sharp_oracle.inc $(SRCROOT)/libsharp - $(MAKE) - hdrclean: @if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi @@ -48,3 +42,14 @@ test: compile_all $(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \ $(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \ $(BINDIR)/sharp_test gauss 2047 4096 0 0 2 + +perftest: compile_all + $(BINDIR)/sharp_test healpix 2048 1024 0 0 1 && \ + $(BINDIR)/sharp_test gauss 63 128 0 0 1 && \ + $(BINDIR)/sharp_test gauss 127 256 0 0 1 && \ + $(BINDIR)/sharp_test gauss 255 512 0 0 1 && \ + $(BINDIR)/sharp_test gauss 511 1024 0 0 1 && \ + $(BINDIR)/sharp_test gauss 1023 2048 0 0 1 && \ + $(BINDIR)/sharp_test gauss 2047 4096 0 0 1 && \ + $(BINDIR)/sharp_test gauss 4095 8192 0 0 1 && \ + $(BINDIR)/sharp_test gauss 8191 16384 0 0 1 diff --git a/README.compilation b/README.compilation index 7607750..8a8dfca 100644 --- a/README.compilation +++ b/README.compilation @@ -3,12 +3,9 @@ GNU make and GNU gcc (version 4.x) are required for compilation. Simply run "./configure"; if this fails, please refer to the output of "./configure --help" for additional hints and, if necessary, provide additional flags to the configure script. -Once the script finishes successfully, run "make autotune" -(or "gmake autotune"). This should perform some necessary self-tuning and -install the compilation products in the subdirectory "auto/". -NOTE: Autotuning should be done on the the computer where you wish to use -the library later on, and no other CPU-intensive tasks should be running -during the autotuning process. +Once the script finishes successfully, run "make" +(or "gmake"). This should install the compilation products in the +subdirectory "auto/". Documentation can be created by the command "(g)make doc". However this requires the doxygen application to be installed diff --git a/libsharp/planck.make b/libsharp/planck.make index 6c72945..87a9b0d 100644 --- a/libsharp/planck.make +++ b/libsharp/planck.make @@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%) ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) $(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c -$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc +$(OD)/sharp.o: $(SD)/sharp_mpi.c BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) $(LIB_$(PKG)): $(LIBOBJ) diff --git a/libsharp/sharp.c b/libsharp/sharp.c index d24753b..1c6ccd3 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -37,6 +37,8 @@ #include "sharp_core.h" #include "sharp_vecutil.h" #include "walltime_c.h" +#include "sharp_almhelpers.h" +#include "sharp_geomhelpers.h" typedef complex double dcmplx; typedef complex float fcmplx; @@ -585,14 +587,84 @@ void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin, int sharp_get_nv_max (void) { return 6; } +static int sharp_oracle (sharp_jobtype type, int spin, int ntrans) + { + int lmax=127; + int mmax=(lmax+1)/2; + int nrings=(lmax+1)/4; + int ppring=1; + + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + sharp_geom_info *tinfo; + sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + + ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); + int ncomp = ntrans*((spin==0) ? 1 : 2); + + double **map; + ALLOC2D(map,double,ncomp,npix); + SET_ARRAY(map[0],0,npix*ncomp,0.); + + sharp_alm_info *alms; + sharp_make_triangular_alm_info(lmax,mmax,1,&alms); + + dcmplx **alm; + ALLOC2D(alm,dcmplx,ncomp,nalms); + SET_ARRAY(alm[0],0,nalms*ncomp,0.); + + double time=1e30; + int nvbest=-1; + + for (int nv=1; nv<=sharp_get_nv_max(); ++nv) + { + double time_acc=0.; + sharp_job job; + sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans); + job.nv=nv; + do + { + sharp_execute_job(&job); + + if (job.time0),"bad number of simultaneous transforms"); UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); -#include "sharp_oracle.inc" + ntrans=IMIN(ntrans,maxtr); - return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type]; + if (nv_opt[ntrans-1][spin!=0][type]==0) + { + in_oracle=1; + nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans); + in_oracle=0; + } + return nv_opt[ntrans-1][spin!=0][type]; } #include "sharp_mpi.c" diff --git a/libsharp/sharp_oracle.inc b/libsharp/sharp_oracle.inc deleted file mode 100644 index 40c9ed0..0000000 --- a/libsharp/sharp_oracle.inc +++ /dev/null @@ -1,9 +0,0 @@ -static const int maxtr = 6; -static const int nv_opt[6][2][3] = { -{{4,2,-1},{2,1,1}}, -{{5,2,-1},{2,1,1}}, -{{5,2,-1},{5,2,2}}, -{{5,2,-1},{5,2,2}}, -{{5,2,-1},{5,2,2}}, -{{5,2,-1},{5,2,2}} -}; diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index 2c01b1d..fcfd253 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -137,10 +137,8 @@ typedef __m256d Tv; #ifdef __FMA4__ #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) -#define vfmaaeq(a,b,c,d,e) \ - a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) -#define vfmaseq(a,b,c,d,e) \ - a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) +#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) +#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) #else #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))