borg_public/libLSS/tools/fused_assign.hpp
2023-05-29 10:41:03 +02:00

187 lines
6.1 KiB
C++

/*+
ARES/HADES/BORG Package -- ./libLSS/tools/fused_assign.hpp
Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
Additional contributions from:
Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
+*/
#ifndef __LIBLSS_FUSED_ASSIGNMENT_HPP
#define __LIBLSS_FUSED_ASSIGNMENT_HPP
#include <cstddef>
#include <iostream>
#include <boost/type_traits/has_trivial_constructor.hpp>
#include "libLSS/tools/console.hpp"
// When we can get rid of AssignFunctor
//#include "libLSS/tools/phoenix_vars.hpp"
//#include <boost/phoenix/operator.hpp>
namespace LibLSS {
namespace FUSE_details {
/**
 * Primary template of the element-wise assignment driver.
 *
 * It is deliberately empty: the traversal code lives in the partial
 * specializations, which are selected by the number of dimensions and
 * by the boolean flag.
 *
 * @tparam Rank      number of array dimensions still to be traversed
 * @tparam BinaryOp  functor type; the specializations call it with a
 *                   (destination, source) pair
 * @tparam UseOpenMP selects between the `true` and `false` families of
 *                   specializations (the `true` ones carry OpenMP pragmas
 *                   where a loop exists)
 */
template <std::size_t Rank, typename BinaryOp, bool UseOpenMP>
struct OperatorAssignment {
};
template<std::size_t N, typename BiFunctor>
struct OperatorAssignment<N,BiFunctor,false> {
template<typename A, typename B>
static inline void apply(BiFunctor f, A&& a, const B& b) {
std::size_t s = a.index_bases()[0], e = a.shape()[0];
for (std::size_t i = s; i < s+e; i++) {
OperatorAssignment<N-1,BiFunctor,false> op;
op.apply(f, a[i], b[i]);
}
}
};
/**
 * Serial traversal specialized for three dimensions.
 *
 * Instead of recursing over slices, the three loops are written out and
 * both arrays are addressed through operator() with a 3-element index
 * array. The last index varies fastest.
 */
template <typename BiFunctor>
struct OperatorAssignment<3, BiFunctor, false> {
  /**
   * @param f functor called as f(a(i), b(i)) for every index triple
   * @param a destination; must provide index_bases(), shape() and
   *          operator() taking the index array
   * @param b source; addressed with the same index array as `a`
   */
  template <typename A, typename B>
  static inline void apply(BiFunctor f, A &&a, const B &b) {
    auto ib = a.index_bases();
    auto sh = a.shape();

    // The bounds use the same signed type as the loop indices. With
    // unsigned bounds a negative index base wraps around and the
    // signed/unsigned comparison silently skips the whole loop when the
    // range crosses zero.
    const ssize_t s0 = ib[0], e0 = s0 + static_cast<ssize_t>(sh[0]);
    const ssize_t s1 = ib[1], e1 = s1 + static_cast<ssize_t>(sh[1]);
    const ssize_t s2 = ib[2], e2 = s2 + static_cast<ssize_t>(sh[2]);

    boost::array<ssize_t, 3> i;
    for (i[0] = s0; i[0] < e0; i[0]++) {
      for (i[1] = s1; i[1] < e1; i[1]++) {
        for (i[2] = s2; i[2] < e2; i[2]++) {
          f(a(i), b(i));
        }
      }
    }
  }
};
template<std::size_t N, typename BiFunctor>
struct OperatorAssignment<N,BiFunctor,true> {
template<typename A, typename B>
static inline void apply(BiFunctor f, A&& a, const B& b) {
std::size_t s = a.index_bases()[0], e = a.shape()[0];
typename boost::remove_reference<A>::type *a_ptr = &a;
const B *b_ptr = &b;
#pragma omp parallel for schedule(static)
for (std::size_t i = s; i < s+e; i++) {
OperatorAssignment<N-1,BiFunctor,false> op;
op.apply(f, (*a_ptr)[i], (*b_ptr)[i]);
}
}
};
/**
 * OpenMP traversal specialized for three dimensions.
 *
 * The three loops are written out explicitly and collapsed into a single
 * OpenMP iteration space. A debug-level message with the loop bounds is
 * emitted on every call.
 */
template <typename BiFunctor>
struct OperatorAssignment<3, BiFunctor, true> {
  /**
   * @param f functor called as f(a[i][j][k], b[i][j][k])
   * @param a destination; must provide index_bases(), shape() and
   *          chained operator[]
   * @param b source; indexed with the same indices as `a`
   */
  template <typename A, typename B>
  static inline void apply(BiFunctor f, A &&a, const B &b) {
    auto ib = a.index_bases();
    auto sh = a.shape();

    // Signed bounds: with unsigned arithmetic a negative index base wraps
    // around and the loops below would not execute at all when the index
    // range crosses zero.
    const std::ptrdiff_t s0 = ib[0], e0 = s0 + static_cast<std::ptrdiff_t>(sh[0]);
    const std::ptrdiff_t s1 = ib[1], e1 = s1 + static_cast<std::ptrdiff_t>(sh[1]);
    const std::ptrdiff_t s2 = ib[2], e2 = s2 + static_cast<std::ptrdiff_t>(sh[2]);

    Console::instance().format<LOG_DEBUG>("Using optimized 3-loop collapsed omp, %dx%dx%d -- %dx%dx%d", s0,s1,s2,e0,e1,e2);

    // The loops are kept perfectly nested (no statement or extra block
    // between them) so that they form a valid nest for collapse(3).
#pragma omp parallel for collapse(3)
    for (std::ptrdiff_t i = s0; i < e0; i++) {
      for (std::ptrdiff_t j = s1; j < e1; j++) {
        for (std::ptrdiff_t k = s2; k < e2; k++) {
          f(a[i][j][k], b[i][j][k]);
        }
      }
    }
  }
};
/**
 * OpenMP traversal specialized for two dimensions.
 *
 * Rows (first index) are distributed over threads with a static
 * schedule; each thread walks the second index of its rows serially.
 */
template <typename BiFunctor>
struct OperatorAssignment<2, BiFunctor, true> {
  /**
   * @param f functor called once per element pair of a row
   * @param a destination; must provide index_bases(), shape() and operator[]
   * @param b source; indexed with the same indices as `a`
   */
  template <typename A, typename B>
  static inline void apply(BiFunctor f, A &&a, const B &b) {
    // Signed bounds: a negative index base stored in an unsigned variable
    // wraps around and the loop test then rejects every index when the
    // range crosses zero.
    const std::ptrdiff_t s0 = a.index_bases()[0];
    const std::ptrdiff_t e0 = s0 + static_cast<std::ptrdiff_t>(a.shape()[0]);
    const std::ptrdiff_t s1 = a.index_bases()[1];
    const std::ptrdiff_t e1 = s1 + static_cast<std::ptrdiff_t>(a.shape()[1]);

#pragma omp parallel for collapse(1) schedule(static)
    for (std::ptrdiff_t i = s0; i < e0; i++) {
      // Factorize memory access for the last index: the row lookup is done
      // once per row and reused by the inner loop. `auto&&` binds to
      // whatever operator[] returns (proxy object or reference) without
      // ever copying a row.
      auto &&row_a = a[i];
      auto &&row_b = b[i];
      for (std::ptrdiff_t j = s1; j < e1; j++) {
        f(row_a[j], row_b[j]);
      }
    }
  }
};
/**
 * End of the recursion for the serial family: no dimension is left, so
 * the functor is applied directly to the pair of elements.
 */
template <typename BiFunctor>
struct OperatorAssignment<0, BiFunctor, false> {
  /**
   * @param op  functor to invoke
   * @param dst element passed as first argument to `op`
   * @param src element passed as second argument to `op`
   */
  template <typename Dst, typename Src>
  static inline void apply(BiFunctor op, Dst &dst, const Src &src) {
    op(dst, src);
  }
};
/**
 * Zero-dimensional case of the `true` family. There is nothing to loop
 * over, so the functor is simply applied to the pair of elements.
 */
template <typename BiFunctor>
struct OperatorAssignment<0, BiFunctor, true> {
  /**
   * @param op  functor to invoke
   * @param dst element passed as first argument to `op`
   * @param src element passed as second argument to `op`
   */
  template <typename Dst, typename Src>
  static inline void apply(BiFunctor op, Dst &dst, const Src &src) {
    op(dst, src);
  }
};
/**
 * Binary functor performing a plain assignment of its second argument
 * to its first one.
 */
struct AssignFunctor {
  /**
   * @param target object that receives the value
   * @param value  value assigned to `target` through operator=
   */
  template <typename Target, typename Value>
  inline void operator()(Target &target, const Value &value) {
    target = value;
  }
};
template<typename OutArray, typename BiFunctor, typename InArray>
inline void apply_array(BiFunctor f, OutArray A, const InArray& B, bool openmp = true) {
typedef typename boost::remove_reference<OutArray>::type PureArray;
if (openmp) {
OperatorAssignment<PureArray::dimensionality,BiFunctor,true> op;
op.template apply(f, A, B);
} else {
OperatorAssignment<PureArray::dimensionality,BiFunctor,false> op;
op.template apply(f, A, B);
}
}
template<typename OutArray, typename InArray>
inline void copy_array_rv(OutArray A, const InArray& B, bool openmp=true) {
// GCC is not yet sufficiently clever for that one. The code is suboptimal (test_fused_array timing)
// auto assigner = (boost::phoenix::ref(_p1)=boost::phoenix::cref(_p2));
AssignFunctor assigner;
apply_array<OutArray, decltype(assigner), InArray>(assigner, A, B, openmp);
}
/**
 * Copy every element of `B` into the array `A` given by reference.
 *
 * @param A      destination array, modified in place
 * @param B      source array
 * @param openmp forwarded to copy_array_rv
 */
template <typename OutArray, typename InArray>
inline void copy_array(OutArray &A, const InArray &B, bool openmp = true) {
  // Instantiate the by-value variant with a reference type so that the
  // caller's array, not a copy, is written to.
  typedef OutArray &OutArrayRef;
  copy_array_rv<OutArrayRef>(A, B, openmp);
}
}
using FUSE_details::apply_array;
using FUSE_details::copy_array;
using FUSE_details::copy_array_rv;
};
#endif