From c9684732b84e39c87fc21288049dc0b46bf128e6 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 10 Jan 2019 13:30:29 +0100
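Subject: [PATCH] cleanup

Remove libsharp/sharp_complex_hacks.h (together with its Makefile.am
entry and its #include in sharp_core.c) and define
vhsum_cmplx_special() for each VLEN directly in sharp_vecsupport.h.
The pointer-cast ("UNSAFE_CODE") variants are not carried over; the
union-based variants are kept. The VLEN==8 variant no longer calls
vhsum_cmplx2(), which the removed header only defined for VLEN==2,
and uses _mm512_reduce_add_pd() instead.
Also bump copyright years and drop two blank lines in sharp.h.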
---
 Makefile.am                    |   1 -
 libsharp/sharp.h               |   4 +-
 libsharp/sharp_almhelpers.h    |   2 +-
 libsharp/sharp_complex_hacks.h | 112 ---------------------------------
 libsharp/sharp_core.c          |   1 -
 libsharp/sharp_vecsupport.h    |  39 ++++++++++++
 6 files changed, 41 insertions(+), 118 deletions(-)
 delete mode 100644 libsharp/sharp_complex_hacks.h

diff --git a/Makefile.am b/Makefile.am
index 163fcd0..26b41ad 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -15,7 +15,6 @@ src_sharp = \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
   libsharp/sharp_announce.h \
-  libsharp/sharp_complex_hacks.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
   libsharp/sharp_vecsupport.h \
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 35a0cb5..ef9cafb 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -25,7 +25,7 @@
 /*! \file sharp.h
  *  Portable interface for the spherical transform library.
  *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -259,8 +259,6 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
   const sharp_alm_info *alm_info, int flags, double *time,
   unsigned long long *opcnt);
 
-
-
 /*! \} */
 
 #ifdef __cplusplus
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index c17028a..06bee8f 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
  *  SHARP helper function for the creation of a_lm data structures
  *
- *  Copyright (C) 2008-2016 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
deleted file mode 100644
index d50eabe..0000000
--- a/libsharp/sharp_complex_hacks.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*  \file sharp_complex_hacks.h
- *  support for converting vector types and complex numbers
- *
- *  Copyright (C) 2012-2018 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#ifndef SHARP_COMPLEX_HACKS_H
-#define SHARP_COMPLEX_HACKS_H
-
-#include <math.h>
-#include "sharp_vecsupport.h"
-
-#define UNSAFE_CODE
-
-#if (VLEN==1)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
-
-#endif
-
-#if (VLEN==2)
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
-  {
-#ifdef UNSAFE_CODE
-#if defined(__SSE3__)
-  *((__m128d *)c1) += _mm_hadd_pd(a,b);
-  *((__m128d *)c2) += _mm_hadd_pd(c,d);
-#else
-  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
-#endif
-#else
-  union {Tv v; _Complex double c; } u1, u2;
-#if defined(__SSE3__)
-  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
-#else
-  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
-#endif
-  *c1+=u1.c; *c2+=u2.c;
-#endif
-  }
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
-
-#endif
-
-#if (VLEN==4)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  {
-  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
-  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
-     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=tmp3+tmp4;
-#ifdef UNSAFE_CODE
-  _mm256_storeu_pd((double *)cc,
-    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
-#else
-  union {Tv v; _Complex double c[2]; } u;
-  u.v=tmp1;
-  cc[0]+=u.c[0]; cc[1]+=u.c[1];
-#endif
-  }
-
-#endif
-
-#if (VLEN==8)
-
-static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  _Complex double * restrict cc)
-  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
-
-#endif
-
-#endif
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 7307202..b619ed3 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -33,7 +33,6 @@
 #include <math.h>
 #include <string.h>
 #include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
 #include "sharp.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index b70143d..e4bfc4f 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -76,6 +76,11 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc) /* cc[0] += a+i*b; cc[1] += c+i*d */
+  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
+
+
 #endif
 
 #if (VLEN==2)
@@ -119,6 +124,21 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
+  Tv d, _Complex double * restrict cc)
+  { /* cc[0] += sum(a)+i*sum(b); cc[1] += sum(c)+i*sum(d), sums over lanes */
+  union {Tv v; _Complex double c; } u1, u2; /* view two lanes as (re,im) */
+#if defined(__SSE3__)
+  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); /* u1=(a0+a1, b0+b1) */
+#else /* without SSE3: build the same sums from two shuffles */
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + /* (a1,b0) */
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); /* (a0,b1) */
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + /* (c1,d0) */
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); /* (c0,d1) */
+#endif
+  cc[0]+=u1.c; cc[1]+=u2.c;
+  }
+
 #endif
 
 #if (VLEN==4)
@@ -150,6 +170,18 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  { /* cc[0] += sum(a)+i*sum(b); cc[1] += sum(c)+i*sum(d), sums over lanes */
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); /* pairwise sums */
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), /* upper halves of tmp1,tmp2 */
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); /* lower halves of tmp1,tmp2 */
+  tmp1=tmp3+tmp4; /* = (sum a, sum b, sum c, sum d) */
+  union {Tv v; _Complex double c[2]; } u; /* view four lanes as two (re,im) */
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1];
+  }
+
 #endif
 
 #if (VLEN==8)
@@ -180,6 +212,13 @@ typedef __mmask8 Tm;
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  { /* cc[0] += sum(a)+i*sum(b); cc[1] += sum(c)+i*sum(d), sums over lanes */
+  cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
+  cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
+  }
+
 #endif
 
 #endif