17 #ifndef dealii_vector_operations_internal_h 18 #define dealii_vector_operations_internal_h 20 #include <deal.II/base/multithread_info.h> 21 #include <deal.II/base/parallel.h> 22 #include <deal.II/base/thread_management.h> 23 #include <deal.II/base/vectorization.h> 28 DEAL_II_NAMESPACE_OPEN
32 namespace VectorOperations
38 is_non_negative(
const T &t)
46 is_non_negative(
const std::complex<T> &)
56 print(
const T &t,
const char *format)
58 if (format !=
nullptr)
59 std::printf(format, t);
61 std::printf(
" %5.2f",
double(t));
68 print(
const std::complex<T> &t,
const char *format)
70 if (format !=
nullptr)
71 std::printf(format, t.real(), t.imag());
73 std::printf(
" %5.2f+%5.2fi",
double(t.real()),
double(t.imag()));
80 template <
typename T,
typename U>
82 copy(
const T *begin,
const T *end, U *dest)
84 std::copy(begin, end, dest);
87 template <
typename T,
typename U>
89 copy(
const std::complex<T> *begin,
90 const std::complex<T> *end,
91 std::complex<U> * dest)
93 std::copy(begin, end, dest);
96 template <
typename T,
typename U>
98 copy(
const std::complex<T> *,
const std::complex<T> *, U *)
101 ExcMessage(
"Can't convert a vector of complex numbers " 102 "into a vector of reals/doubles"));
107 #ifdef DEAL_II_WITH_THREADS 116 template <
typename Functor>
120 const size_type start,
126 const size_type vec_size = end - start;
128 const unsigned int gs =
129 internal::VectorImplementation::minimum_parallel_grain_size;
133 chunk_size = vec_size / n_chunks;
139 if (chunk_size > 512)
140 chunk_size = ((chunk_size + 511) / 512) * 512;
141 n_chunks = (vec_size + chunk_size - 1) / chunk_size;
147 operator()(
const tbb::blocked_range<size_type> &range)
const 149 const size_type r_begin = start + range.begin() * chunk_size;
150 const size_type r_end = std::min(start + range.end() * chunk_size, end);
151 functor(r_begin, r_end);
155 const size_type start;
157 unsigned int n_chunks;
158 size_type chunk_size;
162 template <
typename Functor>
168 std::shared_ptr<parallel::internal::TBBPartitioner> &partitioner)
170 #ifdef DEAL_II_WITH_THREADS 171 size_type vec_size = end - start;
175 4 * internal::VectorImplementation::minimum_parallel_grain_size &&
178 Assert(partitioner.get() !=
nullptr,
180 "Unexpected initialization of Vector that does " 181 "not set the TBB partitioner to a usable state."));
182 std::shared_ptr<tbb::affinity_partitioner> tbb_partitioner =
183 partitioner->acquire_one_partitioner();
187 tbb::blocked_range<size_type>(0, generic_functor.n_chunks, 1),
190 partitioner->release_one_partitioner(tbb_partitioner);
192 else if (vec_size > 0)
204 template <
typename Number>
207 Vector_set(Number value, Number *dst)
215 operator()(
const size_type begin,
const size_type end)
const 219 if (value == Number())
221 #ifdef DEAL_II_WITH_CXX17 222 if constexpr (std::is_trivial<Number>::value)
224 if (std::is_trivial<Number>::value)
227 std::memset(dst + begin, 0,
sizeof(Number) * (end - begin));
231 std::fill(dst + begin, dst + end, value);
238 template <
typename Number,
typename OtherNumber>
241 Vector_copy(
const OtherNumber *src, Number *dst)
250 operator()(
const size_type begin,
const size_type end)
const 254 #if __GNUG__ && __GNUC__ < 5 255 if (__has_trivial_copy(Number) &&
256 std::is_same<Number, OtherNumber>::value)
258 # ifdef DEAL_II_WITH_CXX17 259 if constexpr (std::is_trivially_copyable<Number>() &&
260 std::is_same<Number, OtherNumber>::value)
262 if (std::is_trivially_copyable<Number>() &&
263 std::is_same<Number, OtherNumber>::value)
266 std::memcpy(dst + begin, src + begin, (end - begin) *
sizeof(Number));
269 DEAL_II_OPENMP_SIMD_PRAGMA
270 for (size_type i = begin; i < end; ++i)
275 const OtherNumber *
const src;
279 template <
typename Number>
280 struct Vectorization_multiply_factor
282 Vectorization_multiply_factor(Number *val, Number factor)
288 operator()(
const size_type begin,
const size_type end)
const 292 DEAL_II_OPENMP_SIMD_PRAGMA
293 for (size_type i = begin; i < end; ++i)
298 for (size_type i = begin; i < end; ++i)
307 template <
typename Number>
308 struct Vectorization_add_av
310 Vectorization_add_av(Number *val, Number *v_val, Number factor)
317 operator()(
const size_type begin,
const size_type end)
const 321 DEAL_II_OPENMP_SIMD_PRAGMA
322 for (size_type i = begin; i < end; ++i)
323 val[i] += factor * v_val[i];
327 for (size_type i = begin; i < end; ++i)
328 val[i] += factor * v_val[i];
337 template <
typename Number>
338 struct Vectorization_sadd_xav
340 Vectorization_sadd_xav(Number *val, Number *v_val, Number a, Number x)
348 operator()(
const size_type begin,
const size_type end)
const 352 DEAL_II_OPENMP_SIMD_PRAGMA
353 for (size_type i = begin; i < end; ++i)
354 val[i] = x * val[i] + a * v_val[i];
358 for (size_type i = begin; i < end; ++i)
359 val[i] = x * val[i] + a * v_val[i];
369 template <
typename Number>
370 struct Vectorization_subtract_v
372 Vectorization_subtract_v(Number *val, Number *v_val)
378 operator()(
const size_type begin,
const size_type end)
const 382 DEAL_II_OPENMP_SIMD_PRAGMA
383 for (size_type i = begin; i < end; ++i)
388 for (size_type i = begin; i < end; ++i)
397 template <
typename Number>
398 struct Vectorization_add_factor
400 Vectorization_add_factor(Number *val, Number factor)
406 operator()(
const size_type begin,
const size_type end)
const 410 DEAL_II_OPENMP_SIMD_PRAGMA
411 for (size_type i = begin; i < end; ++i)
416 for (size_type i = begin; i < end; ++i)
425 template <
typename Number>
426 struct Vectorization_add_v
428 Vectorization_add_v(Number *val, Number *v_val)
434 operator()(
const size_type begin,
const size_type end)
const 438 DEAL_II_OPENMP_SIMD_PRAGMA
439 for (size_type i = begin; i < end; ++i)
444 for (size_type i = begin; i < end; ++i)
453 template <
typename Number>
454 struct Vectorization_add_avpbw
456 Vectorization_add_avpbw(Number *val,
469 operator()(
const size_type begin,
const size_type end)
const 473 DEAL_II_OPENMP_SIMD_PRAGMA
474 for (size_type i = begin; i < end; ++i)
475 val[i] = val[i] + a * v_val[i] + b * w_val[i];
479 for (size_type i = begin; i < end; ++i)
480 val[i] = val[i] + a * v_val[i] + b * w_val[i];
491 template <
typename Number>
492 struct Vectorization_sadd_xv
494 Vectorization_sadd_xv(Number *val, Number *v_val, Number x)
501 operator()(
const size_type begin,
const size_type end)
const 505 DEAL_II_OPENMP_SIMD_PRAGMA
506 for (size_type i = begin; i < end; ++i)
507 val[i] = x * val[i] + v_val[i];
511 for (size_type i = begin; i < end; ++i)
512 val[i] = x * val[i] + v_val[i];
521 template <
typename Number>
522 struct Vectorization_sadd_xavbw
524 Vectorization_sadd_xavbw(Number *val,
539 operator()(
const size_type begin,
const size_type end)
const 543 DEAL_II_OPENMP_SIMD_PRAGMA
544 for (size_type i = begin; i < end; ++i)
545 val[i] = x * val[i] + a * v_val[i] + b * w_val[i];
549 for (size_type i = begin; i < end; ++i)
550 val[i] = x * val[i] + a * v_val[i] + b * w_val[i];
562 template <
typename Number>
563 struct Vectorization_scale
565 Vectorization_scale(Number *val, Number *v_val)
571 operator()(
const size_type begin,
const size_type end)
const 575 DEAL_II_OPENMP_SIMD_PRAGMA
576 for (size_type i = begin; i < end; ++i)
581 for (size_type i = begin; i < end; ++i)
590 template <
typename Number>
591 struct Vectorization_equ_au
593 Vectorization_equ_au(Number *val, Number *u_val, Number a)
600 operator()(
const size_type begin,
const size_type end)
const 604 DEAL_II_OPENMP_SIMD_PRAGMA
605 for (size_type i = begin; i < end; ++i)
606 val[i] = a * u_val[i];
610 for (size_type i = begin; i < end; ++i)
611 val[i] = a * u_val[i];
620 template <
typename Number>
621 struct Vectorization_equ_aubv
623 Vectorization_equ_aubv(Number *val,
636 operator()(
const size_type begin,
const size_type end)
const 640 DEAL_II_OPENMP_SIMD_PRAGMA
641 for (size_type i = begin; i < end; ++i)
642 val[i] = a * u_val[i] + b * v_val[i];
646 for (size_type i = begin; i < end; ++i)
647 val[i] = a * u_val[i] + b * v_val[i];
658 template <
typename Number>
659 struct Vectorization_equ_aubvcw
661 Vectorization_equ_aubvcw(Number *val,
678 operator()(
const size_type begin,
const size_type end)
const 682 DEAL_II_OPENMP_SIMD_PRAGMA
683 for (size_type i = begin; i < end; ++i)
684 val[i] = a * u_val[i] + b * v_val[i] + c * w_val[i];
688 for (size_type i = begin; i < end; ++i)
689 val[i] = a * u_val[i] + b * v_val[i] + c * w_val[i];
702 template <
typename Number>
703 struct Vectorization_ratio
705 Vectorization_ratio(Number *val, Number *a_val, Number *b_val)
712 operator()(
const size_type begin,
const size_type end)
const 716 DEAL_II_OPENMP_SIMD_PRAGMA
717 for (size_type i = begin; i < end; ++i)
718 val[i] = a_val[i] / b_val[i];
722 for (size_type i = begin; i < end; ++i)
723 val[i] = a_val[i] / b_val[i];
739 template <
typename Number,
typename Number2>
742 static const bool vectorizes =
743 std::is_same<Number, Number2>::value &&
746 Dot(
const Number *X,
const Number2 *Y)
752 operator()(
const size_type i)
const 758 do_vectorized(
const size_type i)
const 772 "This operation is not correctly implemented for " 773 "complex-valued objects.");
781 template <
typename Number,
typename RealType>
784 static const bool vectorizes =
787 Norm2(
const Number *X)
792 operator()(
const size_type i)
const 798 do_vectorized(
const size_type i)
const 808 template <
typename Number,
typename RealType>
811 static const bool vectorizes =
814 Norm1(
const Number *X)
819 operator()(
const size_type i)
const 825 do_vectorized(
const size_type i)
const 835 template <
typename Number,
typename RealType>
838 static const bool vectorizes =
841 NormP(
const Number *X, RealType p)
847 operator()(
const size_type i)
const 853 do_vectorized(
const size_type i)
const 857 return std::pow(std::abs(x), p);
864 template <
typename Number>
867 static const bool vectorizes =
870 MeanValue(
const Number *X)
875 operator()(
const size_type i)
const 881 do_vectorized(
const size_type i)
const 891 template <
typename Number>
894 static const bool vectorizes =
897 AddAndDot(Number *X,
const Number *V,
const Number *W, Number a)
905 operator()(
const size_type i)
const 912 do_vectorized(
const size_type i)
const 931 "This operation is not correctly implemented for " 932 "complex-valued objects.");
983 const unsigned int vector_accumulation_recursion_threshold = 128;
985 template <
typename Operation,
typename ResultType>
987 accumulate_recursive(
const Operation &op,
988 const size_type first,
989 const size_type last,
992 const size_type vec_size = last - first;
993 if (vec_size <= vector_accumulation_recursion_threshold * 32)
998 size_type index = first;
999 ResultType outer_results[vector_accumulation_recursion_threshold];
1003 outer_results[0] = ResultType();
1010 size_type n_chunks = vec_size / 32;
1011 const size_type remainder = vec_size % 32;
1013 n_chunks < vector_accumulation_recursion_threshold,
1025 std::integral_constant<bool, Operation::vectorizes>());
1036 vector_accumulation_recursion_threshold + 1);
1041 const size_type inner_chunks = remainder / 8;
1043 const size_type remainder_inner = remainder % 8;
1044 ResultType r0 = ResultType(), r1 = ResultType(),
1046 switch (inner_chunks)
1050 for (size_type j = 1; j < 8; ++j)
1052 DEAL_II_FALLTHROUGH;
1055 for (size_type j = 1; j < 8; ++j)
1058 DEAL_II_FALLTHROUGH;
1061 for (size_type j = 1; j < 8; ++j)
1063 DEAL_II_FALLTHROUGH;
1065 for (size_type j = 0; j < remainder_inner; ++j)
1069 if (n_chunks == vector_accumulation_recursion_threshold)
1070 outer_results[vector_accumulation_recursion_threshold -
1074 outer_results[n_chunks] = r0;
1085 while (n_chunks > 1)
1087 if (n_chunks % 2 == 1)
1088 outer_results[n_chunks++] = ResultType();
1089 for (size_type i = 0; i < n_chunks; i += 2)
1090 outer_results[i / 2] = outer_results[i] + outer_results[i + 1];
1093 result = outer_results[0];
1100 const size_type new_size =
1101 (vec_size / (vector_accumulation_recursion_threshold * 32)) *
1102 vector_accumulation_recursion_threshold * 8;
1104 ResultType r0, r1, r2, r3;
1105 accumulate_recursive(op, first, first + new_size, r0);
1106 accumulate_recursive(op, first + new_size, first + 2 * new_size, r1);
1107 accumulate_recursive(op,
1108 first + 2 * new_size,
1109 first + 3 * new_size,
1111 accumulate_recursive(op, first + 3 * new_size, last, r3);
1123 template <
typename Operation,
typename ResultType>
1126 const Operation &op,
1127 size_type & n_chunks,
1129 ResultType (&outer_results)[vector_accumulation_recursion_threshold],
1130 std::integral_constant<bool, false>)
1134 for (size_type i = 0; i < n_chunks; ++i)
1136 ResultType r0 = op(index);
1137 ResultType r1 = op(index + 1);
1138 ResultType r2 = op(index + 2);
1139 ResultType r3 = op(index + 3);
1141 for (size_type j = 1; j < 8; ++j, index += 4)
1144 r1 += op(index + 1);
1145 r2 += op(index + 2);
1146 r3 += op(index + 3);
1150 outer_results[i] = r0 + r2;
1161 template <
typename Operation,
typename Number>
1164 const Operation &op,
1165 size_type & n_chunks,
1167 Number (&outer_results)[vector_accumulation_recursion_threshold],
1168 std::integral_constant<bool, true>)
1177 const size_type regular_chunks = n_chunks / nvecs;
1178 for (size_type i = 0; i < regular_chunks; ++i)
1185 for (size_type j = 1; j < 8; ++j, index += nvecs * 4)
1187 r0 += op.do_vectorized(index);
1188 r1 += op.do_vectorized(index + nvecs);
1189 r2 += op.do_vectorized(index + 2 * nvecs);
1190 r3 += op.do_vectorized(index + 3 * nvecs);
1211 const size_type start_irreg = regular_chunks * nvecs;
1212 for (size_type c = start_irreg; c < n_chunks; ++c)
1213 for (size_type j = 0; j < 32; j += 2 * nvecs, index += 2 * nvecs)
1215 r0 += op.do_vectorized(index);
1216 r1 += op.do_vectorized(index + nvecs);
1219 r0.
store(&outer_results[start_irreg]);
1228 #ifdef DEAL_II_WITH_THREADS 1257 template <
typename Operation,
typename ResultType>
1260 static const unsigned int threshold_array_allocate = 512;
1263 const size_type start,
1264 const size_type end)
1269 const size_type vec_size = end - start;
1271 const unsigned int gs =
1272 internal::VectorImplementation::minimum_parallel_grain_size;
1276 chunk_size = vec_size / n_chunks;
1282 if (chunk_size > 512)
1283 chunk_size = ((chunk_size + 511) / 512) * 512;
1284 n_chunks = (vec_size + chunk_size - 1) / chunk_size;
1288 if (n_chunks > threshold_array_allocate)
1292 large_array.resize(2 * ((n_chunks + 1) / 2));
1293 array_ptr = large_array.data();
1296 array_ptr = &small_array[0];
1306 for (size_type i = range.begin(); i < range.end(); ++i)
1307 accumulate_recursive(op,
1308 start + i * chunk_size,
1309 std::min(start + (i + 1) * chunk_size, end),
1316 while (n_chunks > 1)
1318 if (n_chunks % 2 == 1)
1319 array_ptr[n_chunks++] = ResultType();
1320 for (size_type i = 0; i < n_chunks; i += 2)
1321 array_ptr[i / 2] = array_ptr[i] + array_ptr[i + 1];
1324 return array_ptr[0];
1327 const Operation &op;
1328 const size_type start;
1329 const size_type end;
1331 mutable unsigned int n_chunks;
1332 unsigned int chunk_size;
1333 ResultType small_array[threshold_array_allocate];
1334 std::vector<ResultType> large_array;
1337 mutable ResultType *array_ptr;
1347 template <
typename Operation,
typename ResultType>
1350 const Operation & op,
1351 const size_type start,
1352 const size_type end,
1353 ResultType & result,
1354 std::shared_ptr<parallel::internal::TBBPartitioner> &partitioner)
1356 #ifdef DEAL_II_WITH_THREADS 1357 size_type vec_size = end - start;
1361 4 * internal::VectorImplementation::minimum_parallel_grain_size &&
1364 Assert(partitioner.get() !=
nullptr,
1366 "Unexpected initialization of Vector that does " 1367 "not set the TBB partitioner to a usable state."));
1368 std::shared_ptr<tbb::affinity_partitioner> tbb_partitioner =
1369 partitioner->acquire_one_partitioner();
1375 tbb::blocked_range<size_type>(0, generic_functor.n_chunks, 1),
1378 partitioner->release_one_partitioner(tbb_partitioner);
1379 result = generic_functor.do_sum();
1382 accumulate_recursive(op, start, end, result);
1384 accumulate_recursive(op, start, end, result);
1391 DEAL_II_NAMESPACE_CLOSE
#define AssertDimension(dim1, dim2)
#define AssertIndexRange(index, range)
static real_type abs(const number &x)
unsigned long long int global_dof_index
static::ExceptionBase & ExcMessage(std::string arg1)
void store(Number *ptr) const
static real_type abs_square(const number &x)
#define Assert(cond, exc)
types::global_dof_index size_type
void load(const Number *ptr)
static unsigned int n_threads()
void operator()(const tbb::blocked_range< size_type > &range) const
static::ExceptionBase & ExcInternalError()