perform autotuning on the fly

This commit is contained in:
Martin Reinecke 2012-07-12 16:23:27 +02:00
parent fd69f89ae2
commit 7928e13156
6 changed files with 91 additions and 28 deletions

View file

@ -26,15 +26,9 @@ $(all_lib): %: | $(LIBDIR)_mkdir
$(all_cbin): %: | $(BINDIR)_mkdir $(all_cbin): %: | $(BINDIR)_mkdir
@echo "# linking C binary $*" @echo "# linking C binary $*"
$(CL) -o $@ $^ $(CLFLAGS) $(CL) -o $@ $^ $(CLFLAGS)
# $(CXX) -o $@ $^ $(CLFLAGS)
compile_all: $(all_cbin) hdrcopy compile_all: $(all_cbin) hdrcopy
autotune: sharp_bench
$(BINDIR)/sharp_bench
mv sharp_oracle.inc $(SRCROOT)/libsharp
$(MAKE)
hdrclean: hdrclean:
@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi @if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
@ -48,3 +42,14 @@ test: compile_all
$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \ $(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \
$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \ $(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \
$(BINDIR)/sharp_test gauss 2047 4096 0 0 2 $(BINDIR)/sharp_test gauss 2047 4096 0 0 2
# perftest: once compile_all is up to date, run sharp_test on one "healpix"
# case followed by a series of "gauss" cases whose two size arguments roughly
# double from 63/128 up to 8191/16384. The runs are chained with "&&", so the
# sequence stops at the first invocation that exits with a failure status.
perftest: compile_all
$(BINDIR)/sharp_test healpix 2048 1024 0 0 1 && \
$(BINDIR)/sharp_test gauss 63 128 0 0 1 && \
$(BINDIR)/sharp_test gauss 127 256 0 0 1 && \
$(BINDIR)/sharp_test gauss 255 512 0 0 1 && \
$(BINDIR)/sharp_test gauss 511 1024 0 0 1 && \
$(BINDIR)/sharp_test gauss 1023 2048 0 0 1 && \
$(BINDIR)/sharp_test gauss 2047 4096 0 0 1 && \
$(BINDIR)/sharp_test gauss 4095 8192 0 0 1 && \
$(BINDIR)/sharp_test gauss 8191 16384 0 0 1

View file

@ -3,12 +3,9 @@ GNU make and GNU gcc (version 4.x) are required for compilation.
Simply run "./configure"; if this fails, please refer to the output of Simply run "./configure"; if this fails, please refer to the output of
"./configure --help" for additional hints and, if necessary, provide "./configure --help" for additional hints and, if necessary, provide
additional flags to the configure script. additional flags to the configure script.
Once the script finishes successfully, run "make autotune" Once the script finishes successfully, run "make"
(or "gmake autotune"). This should perform some necessary self-tuning and (or "gmake"). This should install the compilation products in the
install the compilation products in the subdirectory "auto/". subdirectory "auto/".
NOTE: Autotuning should be done on the computer where you wish to use
the library later on, and no other CPU-intensive tasks should be running
during the autotuning process.
Documentation can be created by the command "(g)make doc". Documentation can be created by the command "(g)make doc".
However this requires the doxygen application to be installed However this requires the doxygen application to be installed

View file

@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c $(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc $(OD)/sharp.o: $(SD)/sharp_mpi.c
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
$(LIB_$(PKG)): $(LIBOBJ) $(LIB_$(PKG)): $(LIBOBJ)

View file

@ -37,6 +37,8 @@
#include "sharp_core.h" #include "sharp_core.h"
#include "sharp_vecutil.h" #include "sharp_vecutil.h"
#include "walltime_c.h" #include "walltime_c.h"
#include "sharp_almhelpers.h"
#include "sharp_geomhelpers.h"
typedef complex double dcmplx; typedef complex double dcmplx;
typedef complex float fcmplx; typedef complex float fcmplx;
@ -585,14 +587,84 @@ void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
int sharp_get_nv_max (void) int sharp_get_nv_max (void)
{ return 6; } { return 6; }
/* Times a small test transform for every candidate "nv" value and reports
   the fastest one.

   A fixed-size problem (lmax=127, mmax=64, 32 Gauss rings with one pixel per
   ring) is set up with zero-filled map and a_lm arrays. For each nv from 1 up
   to sharp_get_nv_max(), a job is built for the requested type/spin/ntrans,
   its nv field is overwritten with the candidate, and the job is executed
   repeatedly until the accumulated job.time reaches 0.02 (presumably
   seconds - confirm against the timer behind job.time).

   type   : job type handed on to sharpd_build_job()
   spin   : spin of the transform; spin==0 uses one component per transform,
            any other value uses two
   ntrans : number of simultaneous transforms

   Returns the nv whose single fastest execution was the shortest overall,
   or -1 if no execution reported a time below 1e30. */
static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
  {
  /* dimensions of the benchmark problem */
  const int lmax=127;
  const int mmax=(lmax+1)/2;
  const int nrings=(lmax+1)/4;
  const int ppring=1;
  const ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  const ptrdiff_t nalms=((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  const int ncomp=ntrans*((spin!=0) ? 2 : 1);

  sharp_geom_info *geom;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &geom);

  /* zero-initialised pixel data */
  double **map;
  ALLOC2D(map,double,ncomp,npix);
  SET_ARRAY(map[0],0,npix*ncomp,0.);

  sharp_alm_info *ainfo;
  sharp_make_triangular_alm_info(lmax,mmax,1,&ainfo);

  /* zero-initialised harmonic coefficients */
  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  SET_ARRAY(alm[0],0,nalms*ncomp,0.);

  double best_time=1e30;
  int best_nv=-1;

  int nv=1;
  while (nv<=sharp_get_nv_max())
    {
    sharp_job job;
    sharpd_build_job(&job,type,spin,0,alm,map,geom,ainfo,ntrans);
    job.nv=nv; /* force the candidate value for this series of runs */

    double elapsed=0.;
    for (;;)
      {
      sharp_execute_job(&job);
      if (job.time<best_time)
        {
        best_time=job.time;
        best_nv=nv;
        }
      elapsed+=job.time;
      /* keep repeating until the runs of this candidate add up to 0.02 */
      if (!(elapsed<0.02)) break;
      }
    ++nv;
    }

  DEALLOC2D(map);
  DEALLOC2D(alm);
  sharp_destroy_alm_info(ainfo);
  sharp_destroy_geom_info(geom);

  return best_nv;
  }
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans) int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
{ {
static const int maxtr = 6;
static int nv_opt[6][2][3] = {
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}},
{{0,0,0},{0,0,0}} };
static int in_oracle=0;
if (in_oracle) return -20;
if (type==ALM2MAP_DERIV1) spin=1; if (type==ALM2MAP_DERIV1) spin=1;
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms"); UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
#include "sharp_oracle.inc" ntrans=IMIN(ntrans,maxtr);
return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type]; if (nv_opt[ntrans-1][spin!=0][type]==0)
{
in_oracle=1;
nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
in_oracle=0;
}
return nv_opt[ntrans-1][spin!=0][type];
} }
#include "sharp_mpi.c" #include "sharp_mpi.c"

View file

@ -1,9 +0,0 @@
static const int maxtr = 6;
static const int nv_opt[6][2][3] = {
{{4,2,-1},{2,1,1}},
{{5,2,-1},{2,1,1}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}},
{{5,2,-1},{5,2,2}}
};

View file

@ -137,10 +137,8 @@ typedef __m256d Tv;
#ifdef __FMA4__ #ifdef __FMA4__
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a) #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a) #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) \ #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) \
a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else #else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))