doxygen/deal.II/include_2deal_8II_2base_2parallel_8h_source.html

 // ---------------------------------------------------------------------
 //
 // Copyright (C) 2008 - 2018 by the deal.II authors
 //
 // This file is part of the deal.II library.
 //
 // The deal.II library is free software; you can use it, redistribute
 // it, and/or modify it under the terms of the GNU Lesser General
 // Public License as published by the Free Software Foundation; either
 // version 2.1 of the License, or (at your option) any later version.
 // The full text of the license can be found in the file LICENSE.md at
 // the top level directory of deal.II.
 //
 // ---------------------------------------------------------------------

 #ifndef dealii_parallel_h
 #define dealii_parallel_h


 #include <deal.II/base/config.h>

 #include <deal.II/base/exceptions.h>
 #include <deal.II/base/synchronous_iterator.h>
 #include <deal.II/base/template_constraints.h>
 #include <deal.II/base/thread_management.h>

 #include <cstddef>
 #include <functional>
 #include <memory>
 #include <tuple>

 #ifdef DEAL_II_WITH_THREADS
 #  include <tbb/blocked_range.h>
 #  include <tbb/parallel_for.h>
 #  include <tbb/parallel_reduce.h>
 #  include <tbb/partitioner.h>
 #endif


 // TODO[WB]: allow calling functions to pass along a tbb::affinity_partitioner
 // object to ensure that subsequent calls use the same cache lines

 DEAL_II_NAMESPACE_OPEN

 namespace parallel
 {
   namespace internal
   {
     /**
      * Compile-time switch keyed on the scalar type @p Number. In this
      * primary template the @p value member is <code>true</code>.
      *
      * NOTE(review): judging by the name, the flag is meant to gate the use of
      * OpenMP SIMD loops for the given number type; the code that consults it
      * is not part of this header -- confirm at the point of use.
      */
     template <typename Number>
     struct EnableOpenMPSimdFor
     {
       static const bool value = true;
     };

 #ifdef __INTEL_COMPILER
     // Disable long double SIMD instructions on ICC. This is to work around a
     // bug that generates wrong code at least up to intel 15 (see
     // tests/lac/vector-vector, tests/lac/intel-15-bug, and the discussion at
     // https://github.com/dealii/dealii/issues/598).
     template <>
     struct EnableOpenMPSimdFor<long double>
     {
       // Overrides the primary template's 'true' for long double only; this
       // specialization exists only when __INTEL_COMPILER is defined.
       static const bool value = false;
     };
 #endif


     /**
      * Function object that walks over a range whose elements are tuples of
      * iterators. For every tuple it evaluates the stored callable on the
      * dereferenced leading iterators and assigns the result through the
      * last iterator of the tuple.
      *
      * Objects of this type are created by make_body() and handed to
      * tbb::parallel_for() as the loop body by the transform() functions.
      */
     template <typename F>
     struct Body
     {
       /**
        * Constructor. Keeps a copy of the callable @p f.
        */
       Body(const F &f)
         : callable(f)
       {}

       /**
        * Visit every iterator tuple of @p range and let the apply() overload
        * matching the tuple size do the evaluation and the assignment.
        */
       template <typename Range>
       void
       operator()(const Range &range) const
       {
         for (const auto &iterators : range)
           apply(callable, iterators);
       }

     private:
       /**
        * Copy of the callable passed to the constructor.
        */
       const F callable;

       /**
        * One input, one output: <code>*out = op(*in)</code>.
        */
       template <typename In, typename Out>
       static void
       apply(const F &op, const std::tuple<In, Out> &iterators)
       {
         *std::get<1>(iterators) = op(*std::get<0>(iterators));
       }

       /**
        * Two inputs, one output: <code>*out = op(*in1, *in2)</code>.
        */
       template <typename In1, typename In2, typename Out>
       static void
       apply(const F &op, const std::tuple<In1, In2, Out> &iterators)
       {
         *std::get<2>(iterators) =
           op(*std::get<0>(iterators), *std::get<1>(iterators));
       }

       /**
        * Three inputs, one output: <code>*out = op(*in1, *in2, *in3)</code>.
        */
       template <typename In1, typename In2, typename In3, typename Out>
       static void
       apply(const F &op, const std::tuple<In1, In2, In3, Out> &iterators)
       {
         *std::get<3>(iterators) = op(*std::get<0>(iterators),
                                      *std::get<1>(iterators),
                                      *std::get<2>(iterators));
       }
     };


     /**
      * Wrap the callable @p f in a Body object. Going through this function
      * lets the template argument of Body be deduced from @p f instead of
      * having to be written out at the call site.
      */
     template <typename F>
     Body<F>
     make_body(const F &f)
     {
       Body<F> wrapped(f);
       return wrapped;
     }
   } // namespace internal

   /**
    * Apply @p predicate to each element of the input range
    * [@p begin_in, @p end_in) and write the results through @p out, in the
    * manner of std::transform.
    *
    * Without DEAL_II_WITH_THREADS the elements are processed one after the
    * other. With it, input and output iterators are bundled into
    * SynchronousIterators and the work is handed to tbb::parallel_for().
    *
    * @param begin_in  Start of the input range.
    * @param end_in    One past the end of the input range.
    * @param out       Start of the output range; it must have room for as
    *                  many elements as the input range contains.
    * @param predicate Callable taking one input element and returning the
    *                  value to store. It is taken by non-const reference, so
    *                  an lvalue is required.
    * @param grainsize Grain size given to the tbb::blocked_range; ignored in
    *                  the sequential branch.
    */
   template <typename InputIterator, typename OutputIterator, typename Predicate>
   void
   transform(const InputIterator &begin_in,
             const InputIterator &end_in,
             OutputIterator       out,
             Predicate &          predicate,
             const unsigned int   grainsize)
   {
 #ifndef DEAL_II_WITH_THREADS
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)grainsize;

     // The loop variable runs over the *input* range and therefore has to be
     // an InputIterator; declaring it as OutputIterator only compiles when
     // both iterator types happen to coincide.
     for (InputIterator in = begin_in; in != end_in;)
       *out++ = predicate(*in++);
 #else
     using Iterators     = std::tuple<InputIterator, OutputIterator>;
     using SyncIterators = SynchronousIterators<Iterators>;
     Iterators x_begin(begin_in, out);
     // Only the end of the input range is known; the output slot of the end
     // tuple is a default-constructed placeholder.
     Iterators x_end(end_in, OutputIterator());
     tbb::parallel_for(tbb::blocked_range<SyncIterators>(x_begin,
                                                         x_end,
                                                         grainsize),
                       internal::make_body(predicate),
                       tbb::auto_partitioner());
 #endif
   }


   /**
    * Apply the binary @p predicate to corresponding elements of two input
    * ranges and write the results through @p out. The first range is
    * [@p begin_in1, @p end_in1); the second starts at @p in2 and is read for
    * as many elements as the first range contains.
    *
    * Without DEAL_II_WITH_THREADS the elements are processed one after the
    * other. With it, all iterators are bundled into SynchronousIterators and
    * the work is handed to tbb::parallel_for().
    *
    * @param begin_in1 Start of the first input range.
    * @param end_in1   One past the end of the first input range.
    * @param in2       Start of the second input range.
    * @param out       Start of the output range.
    * @param predicate Callable taking one element of each input range and
    *                  returning the value to store. It is taken by non-const
    *                  reference, so an lvalue is required.
    * @param grainsize Grain size given to the tbb::blocked_range; ignored in
    *                  the sequential branch.
    */
   template <typename InputIterator1,
             typename InputIterator2,
             typename OutputIterator,
             typename Predicate>
   void
   transform(const InputIterator1 &begin_in1,
             const InputIterator1 &end_in1,
             InputIterator2        in2,
             OutputIterator        out,
             Predicate &           predicate,
             const unsigned int    grainsize)
   {
 #ifndef DEAL_II_WITH_THREADS
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)grainsize;

     // The loop variable runs over the first *input* range and therefore has
     // to be an InputIterator1; declaring it as OutputIterator only compiles
     // when both iterator types happen to coincide.
     for (InputIterator1 in1 = begin_in1; in1 != end_in1;)
       *out++ = predicate(*in1++, *in2++);
 #else
     using Iterators =
       std::tuple<InputIterator1, InputIterator2, OutputIterator>;
     using SyncIterators = SynchronousIterators<Iterators>;
     Iterators x_begin(begin_in1, in2, out);
     // Only the end of the first input range is known; the remaining slots of
     // the end tuple are default-constructed placeholders.
     Iterators x_end(end_in1, InputIterator2(), OutputIterator());
     tbb::parallel_for(tbb::blocked_range<SyncIterators>(x_begin,
                                                         x_end,
                                                         grainsize),
                       internal::make_body(predicate),
                       tbb::auto_partitioner());
 #endif
   }


   /**
    * Apply the ternary @p predicate to corresponding elements of three input
    * ranges and write the results through @p out. The first range is
    * [@p begin_in1, @p end_in1); the second and third start at @p in2 and
    * @p in3 and are read for as many elements as the first range contains.
    *
    * Without DEAL_II_WITH_THREADS the elements are processed one after the
    * other. With it, all iterators are bundled into SynchronousIterators and
    * the work is handed to tbb::parallel_for().
    *
    * @param begin_in1 Start of the first input range.
    * @param end_in1   One past the end of the first input range.
    * @param in2       Start of the second input range.
    * @param in3       Start of the third input range.
    * @param out       Start of the output range.
    * @param predicate Callable taking one element of each input range and
    *                  returning the value to store. It is taken by non-const
    *                  reference, so an lvalue is required.
    * @param grainsize Grain size given to the tbb::blocked_range; ignored in
    *                  the sequential branch.
    */
   template <typename InputIterator1,
             typename InputIterator2,
             typename InputIterator3,
             typename OutputIterator,
             typename Predicate>
   void
   transform(const InputIterator1 &begin_in1,
             const InputIterator1 &end_in1,
             InputIterator2        in2,
             InputIterator3        in3,
             OutputIterator        out,
             Predicate &           predicate,
             const unsigned int    grainsize)
   {
 #ifndef DEAL_II_WITH_THREADS
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)grainsize;

     // The loop variable runs over the first *input* range and therefore has
     // to be an InputIterator1; declaring it as OutputIterator only compiles
     // when both iterator types happen to coincide.
     for (InputIterator1 in1 = begin_in1; in1 != end_in1;)
       *out++ = predicate(*in1++, *in2++, *in3++);
 #else
     using Iterators = std::
       tuple<InputIterator1, InputIterator2, InputIterator3, OutputIterator>;
     using SyncIterators = SynchronousIterators<Iterators>;
     Iterators x_begin(begin_in1, in2, in3, out);
     // Only the end of the first input range is known; the remaining slots of
     // the end tuple are default-constructed placeholders.
     Iterators x_end(end_in1,
                     InputIterator2(),
                     InputIterator3(),
                     OutputIterator());
     tbb::parallel_for(tbb::blocked_range<SyncIterators>(x_begin,
                                                         x_end,
                                                         grainsize),
                       internal::make_body(predicate),
                       tbb::auto_partitioner());
 #endif
   }


   namespace internal
   {
 #ifdef DEAL_II_WITH_THREADS

     /**
      * Adapter that unpacks a tbb::blocked_range into its two end points and
      * calls @p f with them, i.e. <code>f(range.begin(), range.end())</code>.
      * The public apply_to_subranges() function invokes this helper from the
      * body object it hands to tbb::parallel_for().
      *
      * @param range Sub-range to be processed.
      * @param f     Callable invoked with the begin and end of @p range.
      */
     template <typename RangeType, typename Function>
     void
     apply_to_subranges(const tbb::blocked_range<RangeType> &range,
                        const Function &                     f)
     {
       f(range.begin(), range.end());
     }
 #endif
   } // namespace internal


   /**
    * Call @p f on the range [@p begin, @p end).
    *
    * With DEAL_II_WITH_THREADS the range is wrapped in a tbb::blocked_range
    * with grain size @p grainsize and given to tbb::parallel_for(), which
    * calls @p f with the begin and end of each sub-range it produces.
    * Without threads, @p f is called exactly once with @p begin and @p end,
    * and @p grainsize is ignored.
    *
    * @param begin     First element of the range.
    * @param end       One past the last element of the range.
    * @param f         Callable accepting two arguments of type RangeType.
    * @param grainsize Grain size for the tbb::blocked_range.
    */
   template <typename RangeType, typename Function>
   void
   apply_to_subranges(const RangeType &                         begin,
                      const typename identity<RangeType>::type &end,
                      const Function &                          f,
                      const unsigned int                        grainsize)
   {
 #ifdef DEAL_II_WITH_THREADS
     const tbb::blocked_range<RangeType> whole_range(begin, end, grainsize);

     // the loop body only forwards each sub-range, together with f, to the
     // internal helper of the same name
     tbb::parallel_for(
       whole_range,
       [&f](const tbb::blocked_range<RangeType> &sub_range) {
         internal::apply_to_subranges<RangeType, Function>(sub_range, f);
       },
       tbb::auto_partitioner());
 #else
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)grainsize;

 #  ifdef DEAL_II_BIND_NO_CONST_OP_PARENTHESES
     // work around a problem with MS VC++ where there is no const
     // operator() in 'Function' if 'Function' is the result of std::bind
     Function ff = f;
     ff(begin, end);
 #  else
     f(begin, end);
 #  endif
 #endif
   }


   /**
    * Base class for work that is carried out on a range of integer indices.
    * Derived classes implement apply_to_subrange(); callers start the work
    * through apply_parallel().
    */
   struct ParallelForInteger
   {
     /**
      * Virtual destructor, since the class has a pure virtual function and is
      * meant to be derived from.
      */
     virtual ~ParallelForInteger() = default;

     /**
      * Run apply_to_subrange() on the index range [@p begin, @p end). With
      * DEAL_II_WITH_THREADS the range is split by tbb::parallel_for() using
      * @p minimum_parallel_grain_size as the grain size; otherwise
      * apply_to_subrange() is called once for the whole range.
      */
     void
     apply_parallel(const std::size_t begin,
                    const std::size_t end,
                    const std::size_t minimum_parallel_grain_size) const;

     /**
      * Do the work for the indices in [@p begin, @p end). To be implemented
      * by derived classes.
      */
     virtual void
     apply_to_subrange(const std::size_t begin,
                       const std::size_t end) const = 0;
   };


   namespace internal
   {
 #ifdef DEAL_II_WITH_THREADS

     /**
      * Body object handed to tbb::parallel_reduce() by
      * accumulate_from_subranges(). It evaluates a function on sub-ranges
      * and folds the values returned into #result with a binary reduction
      * operation.
      */
     template <typename ResultType, typename Function>
     struct ReductionOnSubranges
     {
       /**
        * The value accumulated so far. It starts out as the neutral element
        * and is read by the caller once the reduction has finished.
        */
       ResultType result;

       /**
        * Constructor.
        *
        * @param f               Function called with the begin and end of
        *                        each sub-range; a copy is stored.
        * @param reductor        Binary operation combining two partial
        *                        results; stored inside a std::function.
        * @param neutral_element Initial value of #result; defaults to a
        *                        value-initialized ResultType.
        */
       template <typename Reductor>
       ReductionOnSubranges(const Function & f,
                            const Reductor & reductor,
                            const ResultType neutral_element = ResultType())
         : result(neutral_element)
         , f(f)
         , neutral_element(neutral_element)
         , reductor(reductor)
       {}

       /**
        * Constructor taking the tbb::split tag. The new object copies the
        * function, neutral element and reduction operation of @p r, but its
        * #result starts again from the neutral element instead of from
        * whatever @p r has accumulated.
        */
       ReductionOnSubranges(const ReductionOnSubranges &r, tbb::split)
         : result(r.neutral_element)
         , f(r.f)
         , neutral_element(r.neutral_element)
         , reductor(r.reductor)
       {}

       /**
        * Combine the partial result of @p r into the one stored here, using
        * the reduction operation.
        */
       void
       join(const ReductionOnSubranges &r)
       {
         result = reductor(result, r.result);
       }

       /**
        * Evaluate the function on the end points of @p range and fold the
        * value it returns into #result.
        */
       template <typename RangeType>
       void
       operator()(const tbb::blocked_range<RangeType> &range)
       {
         result = reductor(result, f(range.begin(), range.end()));
       }

     private:
       // Copy of the function evaluated on each sub-range.
       const Function f;

       // Value every freshly created or split-off object starts from.
       const ResultType neutral_element;

       // Type-erased binary operation used to combine partial results.
       const std::function<ResultType(ResultType, ResultType)> reductor;
     };
 #endif
   } // namespace internal


   /**
    * Evaluate @p f on the range [@p begin, @p end) and return the
    * accumulated result.
    *
    * With DEAL_II_WITH_THREADS the range is wrapped in a tbb::blocked_range
    * with grain size @p grainsize and reduced by tbb::parallel_reduce(): @p f
    * is evaluated on sub-ranges and the values it returns are added up with
    * std::plus, starting from 0. Without threads, the return value is simply
    * <code>f(begin, end)</code> and @p grainsize is ignored.
    *
    * @param f         Callable taking two arguments of type RangeType and
    *                  returning something convertible to ResultType.
    * @param begin     First element of the range.
    * @param end       One past the last element of the range.
    * @param grainsize Grain size for the tbb::blocked_range.
    */
   template <typename ResultType, typename RangeType, typename Function>
   ResultType
   accumulate_from_subranges(const Function &                          f,
                             const RangeType &                         begin,
                             const typename identity<RangeType>::type &end,
                             const unsigned int                        grainsize)
   {
 #ifdef DEAL_II_WITH_THREADS
     const tbb::blocked_range<RangeType> whole_range(begin, end, grainsize);

     // sums up what f returns on the individual sub-ranges
     internal::ReductionOnSubranges<ResultType, Function> partial_sums(
       f, std::plus<ResultType>(), 0);
     tbb::parallel_reduce(whole_range, partial_sums, tbb::auto_partitioner());

     return partial_sums.result;
 #else
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)grainsize;

 #  ifdef DEAL_II_BIND_NO_CONST_OP_PARENTHESES
     // work around a problem with MS VC++ where there is no const
     // operator() in 'Function' if 'Function' is the result of std::bind
     Function ff = f;
     return ff(begin, end);
 #  else
     return f(begin, end);
 #  endif
 #endif
   }


   // --------------------- for loop affinity partitioner -----------------------

   namespace internal
   {
     /**
      * Holder for a single tbb::affinity_partitioner that can be handed out
      * and given back. Only the declarations are visible in this header; the
      * member functions are defined elsewhere.
      *
      * Without DEAL_II_WITH_THREADS the class consists of the constructor
      * only.
      */
     class TBBPartitioner
     {
     public:
       /**
        * Constructor. Declared whether or not threading is enabled.
        */
       TBBPartitioner();

 #ifdef DEAL_II_WITH_THREADS

       /**
        * Destructor.
        */
       ~TBBPartitioner();

       /**
        * Return a shared pointer to an affinity partitioner.
        * NOTE(review): presumably this hands out #my_partitioner when it is
        * not in use -- confirm against the implementation.
        */
       std::shared_ptr<tbb::affinity_partitioner>
       acquire_one_partitioner();

       /**
        * Give back a partitioner previously obtained from
        * acquire_one_partitioner(). The pointer is taken by non-const
        * reference.
        */
       void
       release_one_partitioner(std::shared_ptr<tbb::affinity_partitioner> &p);

     private:
       // The partitioner managed by this object.
       std::shared_ptr<tbb::affinity_partitioner> my_partitioner;

       // Flag recording whether the partitioner is currently handed out
       // (going by the name; the code setting it is not in this header).
       bool in_use;

       // NOTE(review): presumably guards the members above against
       // concurrent access -- confirm against the implementation. Also
       // confirm that '::Threads' resolves when DEAL_II_NAMESPACE_OPEN
       // opens a named namespace.
       ::Threads::Mutex mutex;
 #endif
     };
   } // namespace internal
 } // namespace parallel


 // Declarations of grain size variables that are defined in other
 // translation units.
 namespace internal
 {
   namespace VectorImplementation
   {
     // Declaration only. NOTE(review): judging by the name, this is the
     // minimum chunk size used when vector operations are split up for
     // parallel execution -- confirm against the definition.
     extern unsigned int minimum_parallel_grain_size;
   } // namespace VectorImplementation


   namespace SparseMatrixImplementation
   {
     // Declaration only. NOTE(review): judging by the name, this is the
     // minimum chunk size used when sparse matrix operations are split up
     // for parallel execution -- confirm against the definition.
     extern unsigned int minimum_parallel_grain_size;
   } // namespace SparseMatrixImplementation

 } // end of namespace internal


 /* --------------------------- inline functions ------------------------- */

 namespace parallel
 {
 #ifdef DEAL_II_WITH_THREADS

   namespace internal
   {
     /**
      * Body object for tbb::parallel_for() that forwards every sub-range it
      * is called with to the apply_to_subrange() function of a
      * ParallelForInteger object.
      */
     struct ParallelForWrapper
     {
       /**
        * Constructor. Only a reference to @p worker is stored, so @p worker
        * has to stay alive for as long as this object is used.
        */
       ParallelForWrapper(const parallel::ParallelForInteger &worker)
         : worker_(worker)
       {}

       /**
        * Pass the end points of @p range on to the worker object.
        */
       void
       operator()(const tbb::blocked_range<std::size_t> &range) const
       {
         const std::size_t first = range.begin();
         const std::size_t last  = range.end();

         worker_.apply_to_subrange(first, last);
       }

       /**
        * The object that does the actual work.
        */
       const parallel::ParallelForInteger &worker_;
     };
   } // namespace internal

 #endif


   /**
    * Run apply_to_subrange() on the index range [@p begin, @p end).
    *
    * With DEAL_II_WITH_THREADS the range is given to tbb::parallel_for() as
    * a tbb::blocked_range with grain size @p minimum_parallel_grain_size, and
    * apply_to_subrange() is called for each sub-range produced. Without
    * threads, apply_to_subrange() is called once with @p begin and @p end,
    * and the grain size is ignored.
    */
   inline void
   ParallelForInteger::apply_parallel(
     const std::size_t begin,
     const std::size_t end,
     const std::size_t minimum_parallel_grain_size) const
   {
 #ifdef DEAL_II_WITH_THREADS
     const tbb::blocked_range<std::size_t> index_range(
       begin, end, minimum_parallel_grain_size);

     // forwards each sub-range to apply_to_subrange() of this object
     const internal::ParallelForWrapper body(*this);

     tbb::parallel_for(index_range, body, tbb::auto_partitioner());
 #else
     // make sure we don't get compiler
     // warnings about unused arguments
     (void)minimum_parallel_grain_size;

     apply_to_subrange(begin, end);
 #endif
   }

 } // end of namespace parallel

 DEAL_II_NAMESPACE_CLOSE

 #endif
parallel::internal::make_body
Body< F > make_body(const F &f)
Definition: parallel.h:142

parallel::internal::ReductionOnSubranges::f
const Function f
Definition: parallel.h:572

parallel::ParallelForInteger::apply_parallel
void apply_parallel(const std::size_t begin, const std::size_t end, const std::size_t minimum_parallel_grain_size) const
Definition: parallel.h:816

Function
Definition: function.h:148

parallel::internal::EnableOpenMPSimdFor
Definition: parallel.h:54

parallel::internal::ReductionOnSubranges::join
void join(const ReductionOnSubranges &r)
Definition: parallel.h:553

parallel::internal::Body::apply
static void apply(const F &f, const std::tuple< I1, I2, I3 > &p)
Definition: parallel.h:117

parallel::internal::Body::Body
Body(const F &f)
Definition: parallel.h:83

SynchronousIterators
Definition: synchronous_iterator.h:52

parallel::internal::Body
Definition: parallel.h:78

parallel::internal::ParallelForWrapper
Definition: parallel.h:796

parallel::internal::ReductionOnSubranges::operator()
void operator()(const tbb::blocked_range< RangeType > &range)
Definition: parallel.h:563

parallel
Definition: distributed.h:419

parallel::internal::ReductionOnSubranges::ReductionOnSubranges
ReductionOnSubranges(const Function &f, const Reductor &reductor, const ResultType neutral_element=ResultType())
Definition: parallel.h:529

parallel::internal::Body::apply
static void apply(const F &f, const std::tuple< I1, I2, I3, I4 > &p)
Definition: parallel.h:127

parallel::internal::apply_to_subranges
void apply_to_subranges(const tbb::blocked_range< RangeType > &range, const Function &f)
Definition: parallel.h:330

parallel::internal::ReductionOnSubranges::ReductionOnSubranges
ReductionOnSubranges(const ReductionOnSubranges &r, tbb::split)
Definition: parallel.h:541

parallel::internal::Body::f
const F f
Definition: parallel.h:100

parallel::internal::ReductionOnSubranges::reductor
const std::function< ResultType(ResultType, ResultType)> reductor
Definition: parallel.h:585

parallel::internal::Body::apply
static void apply(const F &f, const std::tuple< I1, I2 > &p)
Definition: parallel.h:107

parallel::accumulate_from_subranges
ResultType accumulate_from_subranges(const Function &f, const RangeType &begin, const typename identity< RangeType >::type &end, const unsigned int grainsize)
Definition: parallel.h:652

parallel::internal::ReductionOnSubranges::neutral_element
const ResultType neutral_element
Definition: parallel.h:579

internal
Definition: aligned_vector.h:345

parallel::transform
void transform(const InputIterator &begin_in, const InputIterator &end_in, OutputIterator out, Predicate &predicate, const unsigned int grainsize)
Definition: parallel.h:173

Threads::Mutex
Definition: thread_management.h:266

parallel::internal::ReductionOnSubranges::result
ResultType result
Definition: parallel.h:517

parallel::ParallelForInteger
Definition: parallel.h:469

parallel::internal::ReductionOnSubranges
Definition: parallel.h:512