diff --git a/libsharp/sharp.c b/libsharp/sharp.c index 913b5de..1daeb7a 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -427,7 +427,12 @@ static void init_output (sharp_job *job) } static void alloc_phase (sharp_job *job, int nm, int ntheta) - { job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); } + { + if ((nm&1023)==0) nm+=3; // hack to avoid critical strides + job->s_m=2*job->ntrans*job->nmaps; + job->s_th=job->s_m*nm; + job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); + } static void dealloc_phase (sharp_job *job) { DEALLOC(job->phase); } @@ -436,7 +441,7 @@ static void dealloc_phase (sharp_job *job) static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; - int pstride = 2*job->ntrans*job->nmaps; + int pstride = job->s_m; #pragma omp parallel { ringhelper helper; @@ -444,7 +449,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim) #pragma omp for schedule(dynamic,1) for (int ith=llim; iths_th*(ith-llim); for (int i=0; intrans*job->nmaps; ++i) ringhelper_pair2phase(&helper,mmax,&job->ginfo->pair[ith], job->map[i], &job->phase[dim2+2*i], &job->phase[dim2+2*i+1], pstride, job->flags); @@ -589,7 +594,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi) static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; - int pstride = 2*job->ntrans*job->nmaps; + int pstride = job->s_m; #pragma omp parallel { ringhelper helper; @@ -597,7 +602,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim) #pragma omp for schedule(dynamic,1) for (int ith=llim; iths_th*(ith-llim); for (int i=0; intrans*job->nmaps; ++i) ringhelper_phase2pair(&helper,mmax,&job->phase[dim2+2*i], &job->phase[dim2+2*i+1],pstride,&job->ginfo->pair[ith],job->map[i], diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index baa2186..a11c488 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -653,7 +653,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { for (int j=0; jainfo->nm+mi)); + int phas_idx = itot*job->s_th + mi*job->s_m + 2*j; complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I, r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I; job->phase[phas_idx] = r1+r2; @@ -693,7 +693,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { for (int j=0; jainfo->nm+mi)); + int phas_idx = itot*job->s_th + mi*job->s_m + 4*j; complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I, q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I, u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I, @@ -736,7 +736,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { for (int j=0; jainfo->nm+mi)); + int phas_idx = itot*job->s_th + mi*job->s_m + 2*j; dcmplx ph1=job->phase[phas_idx]; dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2); @@ -766,7 +766,7 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, { for (int j=0; jainfo->nm+mi)); + int phas_idx = itot*job->s_th + mi*job->s_m + 4*j; dcmplx p1Q=job->phase[phas_idx], p1U=job->phase[phas_idx+2], p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h index 4fd582d..4ae7ee2 100644 --- a/libsharp/sharp_internal.h +++ b/libsharp/sharp_internal.h @@ -48,6 +48,7 @@ typedef struct int flags; void **map; void **alm; + int s_m, s_th; // strides in m and theta direction complex double *phase; double *norm_l; complex double *almtmp;