perform autotuning on the fly
This commit is contained in:
parent
fd69f89ae2
commit
7928e13156
6 changed files with 91 additions and 28 deletions
17
Makefile
17
Makefile
|
@ -26,15 +26,9 @@ $(all_lib): %: | $(LIBDIR)_mkdir
|
||||||
$(all_cbin): %: | $(BINDIR)_mkdir
|
$(all_cbin): %: | $(BINDIR)_mkdir
|
||||||
@echo "# linking C binary $*"
|
@echo "# linking C binary $*"
|
||||||
$(CL) -o $@ $^ $(CLFLAGS)
|
$(CL) -o $@ $^ $(CLFLAGS)
|
||||||
# $(CXX) -o $@ $^ $(CLFLAGS)
|
|
||||||
|
|
||||||
compile_all: $(all_cbin) hdrcopy
|
compile_all: $(all_cbin) hdrcopy
|
||||||
|
|
||||||
autotune: sharp_bench
|
|
||||||
$(BINDIR)/sharp_bench
|
|
||||||
mv sharp_oracle.inc $(SRCROOT)/libsharp
|
|
||||||
$(MAKE)
|
|
||||||
|
|
||||||
hdrclean:
|
hdrclean:
|
||||||
@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
|
@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
|
||||||
|
|
||||||
|
@ -48,3 +42,14 @@ test: compile_all
|
||||||
$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \
|
$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \
|
||||||
$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \
|
$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \
|
||||||
$(BINDIR)/sharp_test gauss 2047 4096 0 0 2
|
$(BINDIR)/sharp_test gauss 2047 4096 0 0 2
|
||||||
|
|
||||||
|
perftest: compile_all
|
||||||
|
$(BINDIR)/sharp_test healpix 2048 1024 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 63 128 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 127 256 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 255 512 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 511 1024 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 1023 2048 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 2047 4096 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 4095 8192 0 0 1 && \
|
||||||
|
$(BINDIR)/sharp_test gauss 8191 16384 0 0 1
|
||||||
|
|
|
@ -3,12 +3,9 @@ GNU make and GNU gcc (version 4.x) are required for compilation.
|
||||||
Simply run "./configure"; if this fails, please refer to the output of
|
Simply run "./configure"; if this fails, please refer to the output of
|
||||||
"./configure --help" for additional hints and, if necessary, provide
|
"./configure --help" for additional hints and, if necessary, provide
|
||||||
additional flags to the configure script.
|
additional flags to the configure script.
|
||||||
Once the script finishes successfully, run "make autotune"
|
Once the script finishes successfully, run "make"
|
||||||
(or "gmake autotune"). This should perform some necessary self-tuning and
|
(or "gmake"). This should install the compilation products in the
|
||||||
install the compilation products in the subdirectory "auto/".
|
subdirectory "auto/".
|
||||||
NOTE: Autotuning should be done on the computer where you wish to use
|
|
||||||
the library later on, and no other CPU-intensive tasks should be running
|
|
||||||
during the autotuning process.
|
|
||||||
|
|
||||||
Documentation can be created by the command "(g)make doc".
|
Documentation can be created by the command "(g)make doc".
|
||||||
However this requires the doxygen application to be installed
|
However this requires the doxygen application to be installed
|
||||||
|
|
|
@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
|
||||||
|
|
||||||
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
|
ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
|
||||||
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
|
$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
|
||||||
$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc
|
$(OD)/sharp.o: $(SD)/sharp_mpi.c
|
||||||
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
|
BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
|
||||||
|
|
||||||
$(LIB_$(PKG)): $(LIBOBJ)
|
$(LIB_$(PKG)): $(LIBOBJ)
|
||||||
|
|
|
@ -37,6 +37,8 @@
|
||||||
#include "sharp_core.h"
|
#include "sharp_core.h"
|
||||||
#include "sharp_vecutil.h"
|
#include "sharp_vecutil.h"
|
||||||
#include "walltime_c.h"
|
#include "walltime_c.h"
|
||||||
|
#include "sharp_almhelpers.h"
|
||||||
|
#include "sharp_geomhelpers.h"
|
||||||
|
|
||||||
typedef complex double dcmplx;
|
typedef complex double dcmplx;
|
||||||
typedef complex float fcmplx;
|
typedef complex float fcmplx;
|
||||||
|
@ -585,14 +587,84 @@ void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
|
||||||
int sharp_get_nv_max (void)
|
int sharp_get_nv_max (void)
|
||||||
{ return 6; }
|
{ return 6; }
|
||||||
|
|
||||||
|
/* Empirically picks the value of the job parameter "nv" that runs fastest
   for the given job type, spin and number of simultaneous transforms.

   A small test problem is set up (lmax=127, mmax=(lmax+1)/2, (lmax+1)/4
   rings with one pixel each, created via sharp_make_gauss_geom_info()),
   with zero-filled map and a_lm arrays. For every nv from 1 up to
   sharp_get_nv_max() a job is built, its nv field is overridden, and the
   job is executed repeatedly until the accumulated job.time reaches 0.02
   (presumably seconds - confirm against the timer used by
   sharp_execute_job()). The nv whose single fastest run had the smallest
   job.time wins.

   type   : job type forwarded to sharpd_build_job()
   spin   : spin forwarded to sharpd_build_job(); spin==0 uses one
            component per transform, any other value uses two
   ntrans : number of simultaneous transforms

   Returns the best nv found, or -1 if no run ever reported a time below
   the initial sentinel of 1e30. */
static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
  {
  /* Safety net: upper limit on timing runs per nv value. Without it the
     loop below would spin forever if the timer kept reporting an elapsed
     time of zero, since the accumulated time would then never reach the
     threshold. The limit is far larger than the number of runs needed
     under normal timing, so regular behaviour is unaffected. */
  enum { max_runs_per_nv = 100000 };

  int lmax=127;
  int mmax=(lmax+1)/2;
  int nrings=(lmax+1)/4;
  int ppring=1;

  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
  sharp_geom_info *tinfo;
  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);

  /* number of a_lm coefficients in a triangular layout truncated at mmax */
  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
  int ncomp = ntrans*((spin==0) ? 1 : 2);

  double **map;
  ALLOC2D(map,double,ncomp,npix);
  SET_ARRAY(map[0],0,npix*ncomp,0.);

  sharp_alm_info *alms;
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);

  dcmplx **alm;
  ALLOC2D(alm,dcmplx,ncomp,nalms);
  SET_ARRAY(alm[0],0,nalms*ncomp,0.);

  double time=1e30;   /* fastest single run seen so far */
  int nvbest=-1;      /* nv that produced that run */

  for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
    {
    double time_acc=0.;
    int nruns=0;
    sharp_job job;
    sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
    job.nv=nv; /* override whatever nv the job builder selected */
    do
      {
      sharp_execute_job(&job);

      if (job.time<time) { time=job.time; nvbest=nv; }
      time_acc+=job.time;
      ++nruns;
      }
    while ((time_acc<0.02)&&(nruns<max_runs_per_nv));
    }

  DEALLOC2D(map);
  DEALLOC2D(alm);

  sharp_destroy_alm_info(alms);
  sharp_destroy_geom_info(tinfo);
  return nvbest;
  }
|
||||||
|
|
||||||
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
|
int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
|
||||||
{
|
{
|
||||||
|
static const int maxtr = 6;
|
||||||
|
static int nv_opt[6][2][3] = {
|
||||||
|
{{0,0,0},{0,0,0}},
|
||||||
|
{{0,0,0},{0,0,0}},
|
||||||
|
{{0,0,0},{0,0,0}},
|
||||||
|
{{0,0,0},{0,0,0}},
|
||||||
|
{{0,0,0},{0,0,0}},
|
||||||
|
{{0,0,0},{0,0,0}} };
|
||||||
|
|
||||||
|
static int in_oracle=0;
|
||||||
|
if (in_oracle) return -20;
|
||||||
|
|
||||||
if (type==ALM2MAP_DERIV1) spin=1;
|
if (type==ALM2MAP_DERIV1) spin=1;
|
||||||
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
|
UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
|
||||||
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
|
UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
|
||||||
#include "sharp_oracle.inc"
|
ntrans=IMIN(ntrans,maxtr);
|
||||||
|
|
||||||
return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
|
if (nv_opt[ntrans-1][spin!=0][type]==0)
|
||||||
|
{
|
||||||
|
in_oracle=1;
|
||||||
|
nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
|
||||||
|
in_oracle=0;
|
||||||
|
}
|
||||||
|
return nv_opt[ntrans-1][spin!=0][type];
|
||||||
}
|
}
|
||||||
|
|
||||||
#include "sharp_mpi.c"
|
#include "sharp_mpi.c"
|
||||||
|
|
|
@ -1,9 +0,0 @@
|
||||||
static const int maxtr = 6;
|
|
||||||
static const int nv_opt[6][2][3] = {
|
|
||||||
{{4,2,-1},{2,1,1}},
|
|
||||||
{{5,2,-1},{2,1,1}},
|
|
||||||
{{5,2,-1},{5,2,2}},
|
|
||||||
{{5,2,-1},{5,2,2}},
|
|
||||||
{{5,2,-1},{5,2,2}},
|
|
||||||
{{5,2,-1},{5,2,2}}
|
|
||||||
};
|
|
|
@ -137,10 +137,8 @@ typedef __m256d Tv;
|
||||||
#ifdef __FMA4__
|
#ifdef __FMA4__
|
||||||
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
|
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
|
||||||
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
|
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
|
||||||
#define vfmaaeq(a,b,c,d,e) \
|
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||||
a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
|
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
|
||||||
#define vfmaseq(a,b,c,d,e) \
|
|
||||||
a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
|
|
||||||
#else
|
#else
|
||||||
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
|
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
|
||||||
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
|
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue