cleanup

2019-01-10 13:30:29 +01:00 · 2019-01-10 13:30:29 +01:00 · c9684732b8
commit c9684732b8
parent ecd6c1b48b
6 changed files with 41 additions and 118 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -15,7 +15,6 @@ src_sharp = \
  libsharp/sharp_legendre_roots.c \
  libsharp/sharp_ylmgen_c.c \
  libsharp/sharp_announce.h \
-  libsharp/sharp_complex_hacks.h \
  libsharp/sharp_internal.h \
  libsharp/sharp_legendre_roots.h \
  libsharp/sharp_vecsupport.h \
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@ -25,7 +25,7 @@
 /*! \file sharp.h
 *  Portable interface for the spherical transform library.
 *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

@ -259,8 +259,6 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt);

-
-
 /*! \} */

 #ifdef __cplusplus
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
 *  SHARP helper function for the creation of a_lm data structures
 *
- *  Copyright (C) 2008-2016 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@ -1,112 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*  \file sharp_complex_hacks.h
- *  support for converting vector types and complex numbers
- *
- *  Copyright (C) 2012-2018 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#ifndef SHARP_COMPLEX_HACKS_H
-#define SHARP_COMPLEX_HACKS_H
-
-#include <math.h>
-#include "sharp_vecsupport.h"
-
-#define UNSAFE_CODE
-
-#if (VLEN==1)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==1 variant: operands enter plain scalar arithmetic */
-  _Complex double * restrict cc) /* cc[0] and cc[1] are both updated in place */
-  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } /* a,b -> re,im added to cc[0]; c,d -> re,im added to cc[1] */
-
-#endif
-
-#if (VLEN==2)
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, /* VLEN==2 helper: adds (sum(a), sum(b)) to *c1 and (sum(c), sum(d)) to *c2 */
-  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
-  {
-#ifdef UNSAFE_CODE /* UNSAFE_CODE is #defined unconditionally earlier in this header, so this branch is the one compiled */
-#if defined(__SSE3__)
-  *((__m128d *)c1) += _mm_hadd_pd(a,b); /* hadd yields {a0+a1, b0+b1}; added in place through a __m128d pointer cast */
-  *((__m128d *)c2) += _mm_hadd_pd(c,d); /* NOTE(review): the cast presumably relies on c1/c2 being suitably aligned for __m128d - confirm */
-#else /* no SSE3: build the same pairwise sums from two shuffles */
-  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + /* {a1, b0} */
-                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); /* {a0, b1}; total {a0+a1, b0+b1} */
-  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + /* {c1, d0} */
-                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); /* {c0, d1}; total {c0+c1, d0+d1} */
-#endif
-#else /* alternative path: go through a union instead of casting the destination pointer */
-  union {Tv v; _Complex double c; } u1, u2; /* lets the two doubles of a Tv be read back as one complex value */
-#if defined(__SSE3__)
-  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); /* {a0+a1, b0+b1} and {c0+c1, d0+d1} */
-#else
-  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + /* same shuffle-based pairwise sum as above */
-         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
-#endif
-  *c1+=u1.c; *c2+=u2.c; /* ordinary complex accumulation into the two targets */
-#endif
-  }
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==2 variant */
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); } /* forwards to vhsum_cmplx2 with cc[0] and cc[1] as the two targets */
-
-#endif
-
-#if (VLEN==4)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==4 variant: adds (sum(a), sum(b)) to cc[0] and (sum(c), sum(d)) to cc[1] */
-  _Complex double * restrict cc)
-  {
-  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); /* tmp1={a0+a1,b0+b1,a2+a3,b2+b3}; tmp2 likewise for c,d */
-  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), /* 49 (0x31): upper halves of tmp1 and tmp2 */
-     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); /* 32 (0x20): lower halves of tmp1 and tmp2 */
-  tmp1=tmp3+tmp4; /* now {sum(a), sum(b), sum(c), sum(d)} */
-#ifdef UNSAFE_CODE /* UNSAFE_CODE is #defined unconditionally earlier in this header, so this branch is the one compiled */
-  _mm256_storeu_pd((double *)cc, /* treats cc[0..1] as four consecutive doubles; unaligned load/add/store */
-    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
-#else /* alternative path: read the vector back as two complex values through a union */
-  union {Tv v; _Complex double c[2]; } u;
-  u.v=tmp1;
-  cc[0]+=u.c[0]; cc[1]+=u.c[1];
-#endif
-  }
-
-#endif
-
-#if (VLEN==8)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==8 variant */
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); } /* NOTE(review): this header defines vhsum_cmplx2 only inside its VLEN==2 section, so no definition is visible here for VLEN==8 - confirm */
-
-#endif
-
-#endif
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@ -33,7 +33,6 @@
 #include <math.h>
 #include <string.h>
 #include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
 #include "sharp.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@ -76,6 +76,11 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)

+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* scalar variant (hunk sits just before the VLEN==2 section): operands enter plain scalar arithmetic */
+  _Complex double * restrict cc) /* cc[0] and cc[1] are both updated in place */
+  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } /* a,b -> re,im added to cc[0]; c,d -> re,im added to cc[1] */
+
+
 #endif

 #if (VLEN==2)
@ -119,6 +124,21 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)

+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, /* VLEN==2 variant: adds (sum(a), sum(b)) to cc[0] and (sum(c), sum(d)) to cc[1] */
+  Tv d, _Complex double * restrict cc)
+  {
+  union {Tv v; _Complex double c; } u1, u2; /* lets the two doubles of a Tv be read back as one complex value (re, im) */
+#if defined(__SSE3__)
+  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); /* hadd yields {a0+a1, b0+b1} and {c0+c1, d0+d1} */
+#else /* no SSE3: build the same pairwise sums from two shuffles */
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + /* {a1, b0} */
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); /* {a0, b1}; total {a0+a1, b0+b1} */
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + /* {c1, d0} */
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); /* {c0, d1}; total {c0+c1, d0+d1} */
+#endif
+  cc[0]+=u1.c; cc[1]+=u2.c; /* ordinary complex accumulation; no pointer cast on cc */
+  }
+
 #endif

 #if (VLEN==4)
@ -150,6 +170,18 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)

+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==4 variant: adds (sum(a), sum(b)) to cc[0] and (sum(c), sum(d)) to cc[1] */
+  _Complex double * restrict cc)
+  {
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); /* tmp1={a0+a1,b0+b1,a2+a3,b2+b3}; tmp2 likewise for c,d */
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), /* 49 (0x31): upper halves of tmp1 and tmp2 */
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); /* 32 (0x20): lower halves of tmp1 and tmp2 */
+  tmp1=tmp3+tmp4; /* now {sum(a), sum(b), sum(c), sum(d)} */
+  union {Tv v; _Complex double c[2]; } u; /* reads the four doubles back as two complex values */
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1]; /* ordinary complex accumulation; no pointer cast on cc */
+  }
+
 #endif

 #if (VLEN==8)
@ -180,6 +212,13 @@ typedef __mmask8 Tm;
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)

+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, /* VLEN==8 variant: adds (sum(a), sum(b)) to cc[0] and (sum(c), sum(d)) to cc[1] */
+  _Complex double * restrict cc)
+  {
+  cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); /* each reduce_add collapses one vector to a scalar sum */
+  cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); /* self-contained: no call to a separate helper */
+  }
+
 #endif

 #endif