/*
 *  This file is part of libsharp.
 *
 *  libsharp is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libsharp is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libsharp; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik */

/*! \file sharp_core.c
 *  Spherical transform library
 *
 *  Copyright (C) 2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

#define ARCH default
#define GENERIC_ARCH
#include "libsharp/sharp_core_inc.c"
#undef GENERIC_ARCH
#undef ARCH

/* Run-time dispatch table.
   Each typedef below is the signature of one entry point of an
   implementation, and the static pointer following it holds the entry point
   that is currently selected.  All four pointers start out as NULL and are
   filled in by assign_funcs(). */

/* Signature of the inner_loop_<arch> functions. */
typedef void (*t_inner_loop) (
  sharp_job *job,
  const int *ispair,
  const double *cth_,
  const double *sth_,
  int llim,
  int ulim,
  sharp_Ylmgen_C *gen,
  int mi,
  const int *mlim);
static t_inner_loop inner_loop_ = NULL;

/* Signature of the sharp_veclen_<arch> functions. */
typedef int (*t_veclen) (void);
static t_veclen veclen_ = NULL;

/* Signature of the sharp_max_nvec_<arch> functions. */
typedef int (*t_max_nvec) (int spin);
static t_max_nvec max_nvec_ = NULL;

/* Signature of the sharp_architecture_<arch> functions. */
typedef const char *(*t_architecture) (void);
static t_architecture architecture_ = NULL;

#ifdef MULTIARCH

/* Refuse a MULTIARCH build that was also given instruction-set-specific
   compiler flags.  Each macro tested here is predefined by the compiler when
   the corresponding -m<isa> option is in effect. */
#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
     defined(__AVX2__) || defined(__AVX__))
#error MULTIARCH specified but platform-specific flags detected
#endif

/* DECL(arch) emits, for one instruction-set extension "arch":
     - a static probe function (named via XCONCATX2(have,arch)) that asks the
       compiler's CPU-detection builtins whether the running CPU supports
       "arch" and caches the answer in a function-local static;
     - prototypes for that architecture's inner_loop, sharp_veclen,
       sharp_max_nvec and sharp_architecture functions, whose definitions
       are not part of this file section.
   XCONCATX2 is defined outside this section; it presumably pastes its two
   arguments together with an underscore (cf. inner_loop_default) - confirm.
   The probe result is normalised to 0/1 so that the cached value can never
   collide with the negative "not yet probed" sentinel. */
#define DECL(arch) \
static int XCONCATX2(have,arch)(void) \
  { \
  static int res=-1; \
  if (res<0) \
    { \
    __builtin_cpu_init(); \
    res = (__builtin_cpu_supports(#arch)!=0); \
    } \
  return res; \
  } \
\
void XCONCATX2(inner_loop,arch) (sharp_job *job, const int *ispair, \
  const double *cth_, const double *sth_, int llim, int ulim, \
  sharp_Ylmgen_C *gen, int mi, const int *mlim); \
int XCONCATX2(sharp_veclen,arch) (void); \
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
const char *XCONCATX2(sharp_architecture,arch) (void);

/* The avx512f variant is not declared when __APPLE__ is defined. */
#if (!defined(__APPLE__))
DECL(avx512f)
#endif
DECL(fma4)
DECL(fma)
DECL(avx2)
DECL(avx)

#endif

/*! Fills the four dispatch pointers (inner_loop_, veclen_, max_nvec_,
    architecture_) with one consistent set of entry points.

    In a MULTIARCH build the architecture-specific variants are probed in the
    fixed order avx512f (skipped when __APPLE__ is defined), fma4, fma, avx2,
    avx; the first one whose probe function returns non-zero is installed and
    the function returns.  If none matches, or if MULTIARCH is not defined,
    the "default" variant is installed. */
static void assign_funcs(void)
  {
#ifdef MULTIARCH
/* DECL2(arch): install the entry points of "arch" and leave assign_funcs()
   if the probe function for "arch" reports support. */
#define DECL2(arch) \
  if (XCONCATX2(have,arch)()!=0) \
    { \
    inner_loop_   = &XCONCATX2(inner_loop,arch); \
    veclen_       = &XCONCATX2(sharp_veclen,arch); \
    max_nvec_     = &XCONCATX2(sharp_max_nvec,arch); \
    architecture_ = &XCONCATX2(sharp_architecture,arch); \
    return; \
    }

#if (!defined(__APPLE__))
  DECL2(avx512f)
#endif
  DECL2(fma4)
  DECL2(fma)
  DECL2(avx2)
  DECL2(avx)
#endif

  /* Fallback: the variant built with ARCH set to "default". */
  inner_loop_   = &inner_loop_default;
  veclen_       = &sharp_veclen_default;
  max_nvec_     = &sharp_max_nvec_default;
  architecture_ = &sharp_architecture_default;
  }

#pragma GCC visibility push(hidden)

/*! Forwards all arguments unchanged to the inner-loop implementation stored
    in inner_loop_.  If that pointer is still NULL, assign_funcs() is called
    first to select an implementation. */
void inner_loop (sharp_job *job, const int *ispair,const double *cth,
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
  const int *mlim)
  {
  if (inner_loop_==NULL)
    assign_funcs();

  (*inner_loop_)(job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
  }

/*! Returns the value reported for \a spin by the sharp_max_nvec function of
    the selected implementation.  If max_nvec_ is still NULL, assign_funcs()
    is called first to select an implementation. */
int sharp_max_nvec(int spin)
  {
  if (max_nvec_==NULL)
    assign_funcs();

  return (*max_nvec_)(spin);
  }

#pragma GCC visibility pop

/*! Returns the value reported by the sharp_veclen function of the selected
    implementation.  If veclen_ is still NULL, assign_funcs() is called first
    to select an implementation. */
int sharp_veclen(void)
  {
  if (veclen_==NULL)
    assign_funcs();

  return (*veclen_)();
  }
/*! Returns the string reported by the sharp_architecture function of the
    selected implementation.  If architecture_ is still NULL, assign_funcs()
    is called first to select an implementation. */
const char *sharp_architecture(void)
  {
  if (architecture_==NULL)
    assign_funcs();

  return (*architecture_)();
  }