Merge remote-tracking branch 'dagss/master'

This commit is contained in:
Dag Sverre Seljebotn 2015-04-24 18:08:18 +02:00
commit f48a0bd154
24 changed files with 2034 additions and 67 deletions

View file

@ -8,7 +8,7 @@ FULL_INCLUDE+= -I$(SD)
HDR_$(PKG):=$(SD)/*.h
LIB_$(PKG):=$(LIBDIR)/libsharp.a
BIN:=sharp_testsuite
LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
ALLOBJ:=$(ALLOBJ:%=$(OD)/%)

View file

@ -39,5 +39,7 @@
#include <complex.h>
#include "sharp_lowlevel.h"
#include "sharp_legendre.h"
#include "sharp_legendre_roots.h"
#endif

View file

@ -32,6 +32,7 @@
#include <math.h>
#include "sharp_geomhelpers.h"
#include "sharp_legendre_roots.h"
#include "c_utils.h"
#include "ls_fft.h"
#include <stdio.h>
@ -106,69 +107,6 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info);
}
static inline double one_minus_x2 (double x)
{ return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
/* Function adapted from GNU GSL file glfixed.c
Original author: Pavel Holoborodko (http://www.holoborodko.com)
Adjustments by M. Reinecke
- adjusted interface (keep epsilon internal, return full number of points)
- removed precomputed tables
- tweaked Newton iteration to obtain higher accuracy */
static void gauss_legendre_tbl(int n, double *x, double *w)
{
const double pi = 3.141592653589793238462643383279502884197;
const double eps = 3e-14;
int m = (n+1)>>1;
double t0 = 1 - (1-1./n) / (8.*n*n);
double t1 = 1./(4.*n+2.);
#pragma omp parallel
{
int i;
#pragma omp for schedule(dynamic,100)
for (i=1; i<=m; ++i)
{
double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
int dobreak=0;
int j=0;
double dpdx;
while(1)
{
double P_1 = 1.0;
double P0 = x0;
double dx, x1;
for (int k=2; k<=n; k++)
{
double P_2 = P_1;
P_1 = P0;
// P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
}
dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
/* Newton step */
x1 = x0 - P0/dpdx;
dx = x0-x1;
x0 = x1;
if (dobreak) break;
if (fabs(dx)<=eps) dobreak=1;
UTIL_ASSERT(++j<100,"convergence problem");
}
x[i-1] = -x0;
x[n-i] = x0;
w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
}
} // end of parallel region
}
void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
int stride_lon, int stride_lat, sharp_geom_info **geom_info)
{
@ -181,7 +119,7 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
int *stride_=RALLOC(int,nrings);
gauss_legendre_tbl(nrings,theta,weight);
sharp_legendre_roots(nrings,theta,weight);
for (int m=0; m<nrings; ++m)
{
theta[m] = acos(-theta[m]);

1319
libsharp/sharp_legendre.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,176 @@
/*
NOTE NOTE NOTE
This file is edited in sharp_legendre.c.in which is then preprocessed.
Do not make manual modifications to sharp_legendre.c.
NOTE NOTE NOTE
*/
/*
* This file is part of libsharp.
*
* Redistribution and use in source and binary forms, with or without
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file sharp_legendre.c.in
*
* Copyright (C) 2015 University of Oslo
* \author Dag Sverre Seljebotn
*/
#ifndef NO_LEGENDRE
#if (VLEN==8)
#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
/* ...or test it (it probably works) and remove this check */
#endif
#ifndef SHARP_LEGENDRE_CS
#define SHARP_LEGENDRE_CS 4
#endif
#define MAX_CS 6
#if (SHARP_LEGENDRE_CS > MAX_CS)
#error (SHARP_LEGENDRE_CS > MAX_CS)
#endif
#include "sharp_legendre.h"
#include "sharp_vecsupport.h"
#include <malloc.h>
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
/*{ for cs in range(1, 7) }*/
static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
{{scalar}} xarr[({{cs}}) * VLEN{{T}}],
{{scalar}} out[({{cs}}) * VLEN{{T}}]) {
/*{ for i in range(cs) }*/
Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
/*{ endfor }*/
Tv{{T}} W1, W2, b, R;
ptrdiff_t l;
/*{ for i in range(cs) }*/
x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
Pm1_{{i}} = vload{{T}}(1.0);
P_{{i}} = x{{i}};
b = vload{{T}}(*bl);
y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
/*{ endfor }*/
b = vload{{T}}(*(bl + 1));
/*{ for i in range(cs) }*/
vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
/*{ endfor }*/
for (l = 2; l <= lmax; ++l) {
b = vload{{T}}(*(bl + l));
R = vload{{T}}(*(recfacs + l));
/*
P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
*/
/*{ for i in range(cs) }*/
Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
W2 = W1;
W2 = vsub{{T}}(W2, Pm2_{{i}});
P_{{i}} = W1;
vfmaeq{{T}}(P_{{i}}, W2, R);
vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
/*{ endfor }*/
}
/*{ for i in range(cs) }*/
vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
/*{ endfor }*/
}
/*{ endfor }*/
/*{ endfor }*/
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
/* (l - 1) / l, for l >= 2 */
ptrdiff_t l;
r[0] = 0;
r[1] = 1;
for (l = 2; l <= lmax; ++l) {
r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
}
}
/*{ endfor }*/
/*
Compute sum_l b_l P_l(x_i) for all i.
*/
#define LEN (SHARP_LEGENDRE_CS * VLEN)
#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
void sharp_legendre_transform{{T}}({{scalar}} *bl,
{{scalar}} *recfac,
ptrdiff_t lmax,
{{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
{{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
int compute_recfac;
ptrdiff_t i, j, len;
compute_recfac = (recfac == NULL);
if (compute_recfac) {
recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
sharp_legendre_transform_recfac{{T}}(recfac, lmax);
}
for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;
for (i = 0; i < nx; i += LEN{{T}}) {
len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
case 1:
case 0:
legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
}
for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
}
if (compute_recfac) {
free(recfac);
}
}
/*{ endfor }*/
#endif

62
libsharp/sharp_legendre.h Normal file
View file

@ -0,0 +1,62 @@
/*
* This file is part of libsharp.
*
* Redistribution and use in source and binary forms, with or without
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file sharp_legendre.h
* Interface for the Legendre transform parts of the spherical transform library.
*
* Copyright (C) 2015 University of Oslo
* \author Dag Sverre Seljebotn
*/
#ifndef SHARP_LEGENDRE_H
#define SHARP_LEGENDRE_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef NO_LEGENDRE
void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
double *out, ptrdiff_t nx);
void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
float *out, ptrdiff_t nx);
void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
#endif
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,67 @@
/* Function adapted from GNU GSL file glfixed.c
Original author: Pavel Holoborodko (http://www.holoborodko.com)
Adjustments by M. Reinecke
- adjusted interface (keep epsilon internal, return full number of points)
- removed precomputed tables
- tweaked Newton iteration to obtain higher accuracy */
#include <math.h>
#include "sharp_legendre_roots.h"
#include "c_utils.h"
static inline double one_minus_x2 (double x)
{ return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
void sharp_legendre_roots(int n, double *x, double *w)
{
const double pi = 3.141592653589793238462643383279502884197;
const double eps = 3e-14;
int m = (n+1)>>1;
double t0 = 1 - (1-1./n) / (8.*n*n);
double t1 = 1./(4.*n+2.);
#pragma omp parallel
{
int i;
#pragma omp for schedule(dynamic,100)
for (i=1; i<=m; ++i)
{
double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
int dobreak=0;
int j=0;
double dpdx;
while(1)
{
double P_1 = 1.0;
double P0 = x0;
double dx, x1;
for (int k=2; k<=n; k++)
{
double P_2 = P_1;
P_1 = P0;
// P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
}
dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
/* Newton step */
x1 = x0 - P0/dpdx;
dx = x0-x1;
x0 = x1;
if (dobreak) break;
if (fabs(dx)<=eps) dobreak=1;
UTIL_ASSERT(++j<100,"convergence problem");
}
x[i-1] = -x0;
x[n-i] = x0;
w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
}
} // end of parallel region
}

View file

@ -0,0 +1,50 @@
/*
* This file is part of libsharp.
*
* libsharp is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* libsharp is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with libsharp; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
* and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
* (DLR).
*/
/*! \file sharp_legendre_roots.h
*
* Copyright (C) 2006-2012 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef SHARP_LEGENDRE_ROOTS_H
#define SHARP_LEGENDRE_ROOTS_H
#ifdef __cplusplus
extern "C" {
#endif
/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
of degree \a n.
\param n Order of Legendre polynomial
\param x Array of length \a n for output (root position)
\param w Array of length \a w for output (weight for Gaussian quadrature)
*/
void sharp_legendre_roots(int n, double *x, double *w);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -40,23 +40,31 @@ typedef double Ts;
#if (VLEN==1)
typedef double Tv;
typedef float Tv_s;
typedef int Tm;
#define vadd(a,b) ((a)+(b))
#define vadd_s(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b))
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsub(a,b) ((a)-(b))
#define vsub_s(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b))
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmul(a,b) ((a)*(b))
#define vmul_s(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b))
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a))
#define vload(a) (a)
#define vload_s(a) (a)
#define vloadu(p) (*(p))
#define vloadu_s(p) (*(p))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
@ -64,6 +72,8 @@ typedef int Tm;
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vstoreu(p, a) (*(p)=a)
#define vstoreu_s(p, a) (*(p)=a)
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
@ -87,6 +97,7 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#endif
typedef __m128d Tv;
typedef __m128 Tv_s;
typedef __m128d Tm;
#if defined(__SSE4_1__)
@ -99,15 +110,19 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vone _mm_set1_pd(1.)
#define vadd(a,b) _mm_add_pd(a,b)
#define vadd_s(a,b) _mm_add_ps(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm_sub_pd(a,b)
#define vsub_s(a,b) _mm_sub_ps(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm_mul_pd(a,b)
#define vmul_s(a,b) _mm_mul_ps(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
@ -115,6 +130,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
#define vload(a) _mm_set1_pd(a)
#define vload_s(a) _mm_set1_ps(a)
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
@ -126,17 +142,22 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
#define vloadu(p) _mm_loadu_pd(p)
#define vloadu_s(p) _mm_loadu_ps(p)
#define vstoreu(p, v) _mm_storeu_pd(p, v)
#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
#endif
#if (VLEN==4)
#include <immintrin.h>
#ifdef __FMA4__
#if (USE_FMA4)
#include <x86intrin.h>
#endif
typedef __m256d Tv;
typedef __m256 Tv_s;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
@ -144,21 +165,26 @@ typedef __m256d Tm;
#define vone _mm256_set1_pd(1.)
#define vadd(a,b) _mm256_add_pd(a,b)
#define vadd_s(a,b) _mm256_add_ps(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm256_sub_pd(a,b)
#define vsub_s(a,b) _mm256_sub_ps(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm256_mul_pd(a,b)
#define vmul_s(a,b) _mm256_mul_ps(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
#ifdef __FMA4__
#if (USE_FMA4)
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@ -167,6 +193,7 @@ typedef __m256d Tm;
#endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a)
#define vload_s(a) _mm256_set1_ps(a)
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@ -179,6 +206,11 @@ typedef __m256d Tm;
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
#define vloadu(p) _mm256_loadu_pd(p)
#define vloadu_s(p) _mm256_loadu_ps(p)
#define vstoreu(p, v) _mm256_storeu_pd(p, v)
#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
#endif
#if (VLEN==8)

View file

@ -32,6 +32,8 @@
#ifndef SHARP_VECUTIL_H
#define SHARP_VECUTIL_H
#ifndef VLEN
#if (defined (__MIC__))
#define VLEN 8
#elif (defined (__AVX__))
@ -43,3 +45,19 @@
#endif
#endif
#if (VLEN==1)
#define VLEN_s 1
#else
#define VLEN_s (2*VLEN)
#endif
#ifndef USE_FMA4
#ifdef __FMA4__
#define USE_FMA4 1
#else
#define USE_FMA4 0
#endif
#endif
#endif