/*+
    ARES/HADES/BORG Package -- ./libLSS/tools/fused_assign.hpp
    Copyright (C) 2014-2020 Guilhem Lavaux
    Copyright (C) 2009-2020 Jens Jasche

    Additional contributions from:
       Guilhem Lavaux (2023)

+*/
#ifndef __LIBLSS_FUSED_ASSIGNMENT_HPP
#define __LIBLSS_FUSED_ASSIGNMENT_HPP

#include <boost/type_traits/remove_reference.hpp>
#include <boost/array.hpp>
#include "libLSS/tools/console.hpp"

// When we can get rid of AssignFunctor
//#include "libLSS/tools/phoenix_vars.hpp"
//#include <boost/phoenix/operator.hpp>

// Fused element-wise assignment over multi-dimensional arrays: apply a binary
// functor f(a, b) to every pair of elements, with optional OpenMP
// parallelization of the outer (or collapsed) loop nest.

namespace LibLSS {

  namespace FUSE_details {

    // Dispatch on the array rank N and on whether the loops are
    // OpenMP-parallelized. The specializations below peel one dimension at a
    // time, with hand-optimized 2-d and 3-d cases.
    template <std::size_t N, typename BiFunctor, bool parallel>
    struct OperatorAssignment {};

    // Generic serial case: iterate over the outermost dimension and recurse.
    template <std::size_t N, typename BiFunctor>
    struct OperatorAssignment<N, BiFunctor, false> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &&a, const B &b) {
        std::size_t s = a.index_bases()[0], e = a.shape()[0];
        for (std::size_t i = s; i < s + e; i++) {
          OperatorAssignment<N - 1, BiFunctor, false> op;
          op.apply(f, a[i], b[i]);
        }
      }
    };

    // Serial 3-d case: one explicit triple loop over an index tuple.
    template <typename BiFunctor>
    struct OperatorAssignment<3, BiFunctor, false> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &&a, const B &b) {
        auto ib = a.index_bases();
        auto sh = a.shape();
        std::size_t s0 = ib[0], e0 = s0 + sh[0];
        std::size_t s1 = ib[1], e1 = s1 + sh[1];
        std::size_t s2 = ib[2], e2 = s2 + sh[2];
        boost::array<std::size_t, 3> i;
        for (i[0] = s0; i[0] < e0; i[0]++) {
          for (i[1] = s1; i[1] < e1; i[1]++) {
            for (i[2] = s2; i[2] < e2; i[2]++) {
              f(a(i), b(i));
            }
          }
        }
      }
    };

    // Generic parallel case: parallelize the outermost loop, recurse serially
    // over the remaining dimensions.
    template <std::size_t N, typename BiFunctor>
    struct OperatorAssignment<N, BiFunctor, true> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &&a, const B &b) {
        std::size_t s = a.index_bases()[0], e = a.shape()[0];
        typename boost::remove_reference<A>::type *a_ptr = &a;
        const B *b_ptr = &b;
#pragma omp parallel for schedule(static)
        for (std::size_t i = s; i < s + e; i++) {
          OperatorAssignment<N - 1, BiFunctor, false> op;
          op.apply(f, (*a_ptr)[i], (*b_ptr)[i]);
        }
      }
    };

    // Parallel 3-d case: collapse the three loops into a single OpenMP loop.
    template <typename BiFunctor>
    struct OperatorAssignment<3, BiFunctor, true> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &&a, const B &b) {
        auto ib = a.index_bases();
        auto sh = a.shape();
        std::size_t s0 = ib[0], e0 = s0 + sh[0];
        std::size_t s1 = ib[1], e1 = s1 + sh[1];
        std::size_t s2 = ib[2], e2 = s2 + sh[2];
        //        Console::instance().print<LOG_DEBUG>("Using optimized 3-loop collapsed omp");
        Console::instance().format<LOG_DEBUG>(
            "Using optimized 3-loop collapsed omp, %dx%dx%d -- %dx%dx%d",
            s0, s1, s2, e0, e1, e2);
#pragma omp parallel for collapse(3)
        for (size_t i = s0; i < e0; i++) {
          for (size_t j = s1; j < e1; j++) {
            {
              /*  boost::array<std::size_t, 3> idx;
                  idx[0] = i;
                  idx[1] = j;
                  for (idx[2] = s2; idx[2] < e2; idx[2]++) {
                    f(a(idx), b(idx));
                  }  */
              for (size_t k = s2; k < e2; k++) {
                f(a[i][j][k], b[i][j][k]);
              }
            }
          }
        }
      }
    };

    // Parallel 2-d case.
    template <typename BiFunctor>
    struct OperatorAssignment<2, BiFunctor, true> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &&a, const B &b) {
        std::size_t s0 = a.index_bases()[0], e0 = a.shape()[0];
        std::size_t s1 = a.index_bases()[1], e1 = a.shape()[1];
#pragma omp parallel for collapse(1) schedule(static)
        for (std::size_t i = s0; i < s0 + e0; i++) {
          // Factorize memory access for the last index.
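          // Hoisting the row views a[i] / b[i] out of the inner loop means the
          // inner iteration only indexes the last dimension instead of
          // recomputing the full multi-dimensional offset for every element.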
          auto stripe_a = a[i];
          auto stripe_b = b[i];
          for (std::size_t j = s1; j < s1 + e1; j++) {
            f(stripe_a[j], stripe_b[j]);
          }
        }
      }
    };

    // Rank-0 terminal cases: apply the functor to the two scalars.
    template <typename BiFunctor>
    struct OperatorAssignment<0, BiFunctor, false> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &a, const B &b) {
        f(a, b);
      }
    };

    template <typename BiFunctor>
    struct OperatorAssignment<0, BiFunctor, true> {
      template <typename A, typename B>
      static inline void apply(BiFunctor f, A &a, const B &b) {
        f(a, b);
      }
    };

    // Plain element-wise assignment, used by copy_array / copy_array_rv.
    struct AssignFunctor {
      template <typename T0, typename T1>
      inline void operator()(T0 &a, const T1 &b) {
        a = b;
      }
    };

    // Apply the binary functor f element-wise over the arrays A and B,
    // selecting the parallel or serial loop nest at runtime.
    template <typename BiFunctor, typename OutArray, typename InArray>
    inline void
    apply_array(BiFunctor f, OutArray A, const InArray &B, bool openmp = true) {
      typedef typename boost::remove_reference<OutArray>::type PureArray;

      if (openmp) {
        OperatorAssignment<PureArray::dimensionality, BiFunctor, true> op;
        op.template apply(f, A, B);
      } else {
        OperatorAssignment<PureArray::dimensionality, BiFunctor, false> op;
        op.template apply(f, A, B);
      }
    }

    // Copy variant taking the output by value, intended for temporary
    // views (rvalues).
    template <typename OutArray, typename InArray>
    inline void copy_array_rv(OutArray A, const InArray &B, bool openmp = true) {
      // GCC is not yet sufficiently clever for that one. The code is
      // suboptimal (test_fused_array timing).
      //   auto assigner = (boost::phoenix::ref(_p1) = boost::phoenix::cref(_p2));
      AssignFunctor assigner;
      apply_array(assigner, A, B, openmp);
    }

    // Element-wise copy of B into A.
    template <typename OutArray, typename InArray>
    inline void copy_array(OutArray &A, const InArray &B, bool openmp = true) {
      copy_array_rv(A, B, openmp);
    }

  } // namespace FUSE_details

  using FUSE_details::apply_array;
  using FUSE_details::copy_array;
  using FUSE_details::copy_array_rv;

}; // namespace LibLSS

#endif
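
/*
  Usage sketch (illustrative only): element-wise copy and element-wise
  application of a custom functor. The buffer sizes, the lambda, and the use
  of boost::multi_array_ref are illustrative assumptions; the one structural
  point is that apply_array / copy_array_rv take the output array by value, so
  the output should be a ref/view type whose copy aliases the caller's data.

    #include <vector>
    #include <boost/multi_array.hpp>
    #include "libLSS/tools/fused_assign.hpp"

    void example() {
      std::vector<double> buf_a(8 * 8 * 8), buf_b(8 * 8 * 8, 1.0);
      boost::multi_array_ref<double, 3> a(buf_a.data(), boost::extents[8][8][8]);
      boost::multi_array_ref<double, 3> b(buf_b.data(), boost::extents[8][8][8]);

      // Element-wise copy b -> a, using the collapsed OpenMP 3-d loop nest.
      LibLSS::copy_array(a, b);

      // Element-wise custom functor, run serially (openmp == false).
      LibLSS::apply_array(
          [](double &x, double y) { x = 2 * y; }, a, b, false);
    }
*/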