vectorization.h
1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2011 - 2018 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE.md at
12 // the top level directory of deal.II.
13 //
14 // ---------------------------------------------------------------------
15 
16 
17 #ifndef dealii_vectorization_h
18 #define dealii_vectorization_h
19 
20 #include <deal.II/base/config.h>
21 
22 #include <deal.II/base/exceptions.h>
23 #include <deal.II/base/template_constraints.h>
24 
25 #include <cmath>
26 
27 // Note:
28 // The flag DEAL_II_COMPILER_VECTORIZATION_LEVEL is essentially constructed
29 // according to the following scheme
30 // #ifdef __AVX512F__
31 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 3
32 // #elif defined (__AVX__)
33 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 2
34 // #elif defined (__SSE2__)
35 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 1
36 // #else
37 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 0
38 // #endif
39 // In addition to checking the flags __AVX__ and __SSE2__, a CMake test,
40 // 'check_01_cpu_features.cmake', ensures that these features are not only
41 // present in the compilation unit but also working properly.
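//
// As an illustration (not part of this header), downstream code can branch on
// the resulting value in the same spirit; the thresholds mirror the scheme
// above:
// #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3
//   // AVX-512 code path: 8 doubles / 16 floats per register
// #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2
//   // AVX code path: 4 doubles / 8 floats per register
// #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1
//   // SSE2 code path: 2 doubles / 4 floats per register
// #else
//   // scalar fallback
// #endif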
42 
43 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && !defined(__AVX__)
44 # error \
45  "Mismatch in vectorization capabilities: AVX was detected during configuration of deal.II and switched on, but it is apparently not available for the file you are trying to compile at the moment. Check compilation flags controlling the instruction set, such as -march=native."
46 #endif
47 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && !defined(__AVX512F__)
48 # error \
49  "Mismatch in vectorization capabilities: AVX-512F was detected during configuration of deal.II and switched on, but it is apparently not available for the file you are trying to compile at the moment. Check compilation flags controlling the instruction set, such as -march=native."
50 #endif
51 
52 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 // AVX, AVX-512
53 # include <immintrin.h>
54 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2
55 # include <emmintrin.h>
56 #endif
57 
58 
59 DEAL_II_NAMESPACE_OPEN
60 
61 
62 namespace internal
63 {
77  template <typename T>
78  struct NumberType<VectorizedArray<T>>
79  {
80  static const VectorizedArray<T> &
81  value(const VectorizedArray<T> &t)
82  {
83  return t;
84  }
85 
86  static VectorizedArray<T>
87  value(const T &t)
88  {
89  VectorizedArray<T> tmp;
90  tmp = t;
91  return tmp;
92  }
93  };
94 } // namespace internal
95 
96 
97 // Enable the EnableIfScalar type trait for VectorizedArray<Number> such
98 // that it can be used as a Number type in Tensor<rank,dim,Number>, etc.
99 
100 template <typename Number>
101 struct EnableIfScalar<VectorizedArray<Number>>
102 {
103  using type = VectorizedArray<typename EnableIfScalar<Number>::type>;
104 };
105 
106 
107 
158 template <typename Number>
159 class VectorizedArray
160 {
161 public:
165  static const unsigned int n_array_elements = 1;
166 
167  // POD means that there should be no user-defined constructors, destructors,
168  // and copy functions (the standard is somewhat relaxed in C++11, though).
169 
173  DEAL_II_ALWAYS_INLINE
174  VectorizedArray &
175  operator=(const Number scalar)
176  {
177  data = scalar;
178  return *this;
179  }
180 
184  DEAL_II_ALWAYS_INLINE
185  Number &operator[](const unsigned int comp)
186  {
187  (void)comp;
188  AssertIndexRange(comp, 1);
189  return data;
190  }
191 
195  DEAL_II_ALWAYS_INLINE
196  const Number &operator[](const unsigned int comp) const
197  {
198  (void)comp;
199  AssertIndexRange(comp, 1);
200  return data;
201  }
202 
206  DEAL_II_ALWAYS_INLINE
207  VectorizedArray &
208  operator+=(const VectorizedArray<Number> &vec)
209  {
210  data += vec.data;
211  return *this;
212  }
213 
217  DEAL_II_ALWAYS_INLINE
218  VectorizedArray &
219  operator-=(const VectorizedArray<Number> &vec)
220  {
221  data -= vec.data;
222  return *this;
223  }
224 
228  DEAL_II_ALWAYS_INLINE
229  VectorizedArray &
230  operator*=(const VectorizedArray<Number> &vec)
231  {
232  data *= vec.data;
233  return *this;
234  }
235 
239  DEAL_II_ALWAYS_INLINE
240  VectorizedArray &
241  operator/=(const VectorizedArray<Number> &vec)
242  {
243  data /= vec.data;
244  return *this;
245  }
246 
253  DEAL_II_ALWAYS_INLINE
254  void
255  load(const Number *ptr)
256  {
257  data = *ptr;
258  }
259 
266  DEAL_II_ALWAYS_INLINE
267  void
268  store(Number *ptr) const
269  {
270  *ptr = data;
271  }
272 
317  DEAL_II_ALWAYS_INLINE
318  void
319  streaming_store(Number *ptr) const
320  {
321  *ptr = data;
322  }
323 
336  DEAL_II_ALWAYS_INLINE
337  void
338  gather(const Number *base_ptr, const unsigned int *offsets)
339  {
340  data = base_ptr[offsets[0]];
341  }
342 
355  DEAL_II_ALWAYS_INLINE
356  void
357  scatter(const unsigned int *offsets, Number *base_ptr) const
358  {
359  base_ptr[offsets[0]] = data;
360  }
361 
366  Number data;
367 
368 private:
373  DEAL_II_ALWAYS_INLINE
374  VectorizedArray
375  get_sqrt() const
376  {
377  VectorizedArray res;
378  res.data = std::sqrt(data);
379  return res;
380  }
381 
386  DEAL_II_ALWAYS_INLINE
387  VectorizedArray
388  get_abs() const
389  {
390  VectorizedArray res;
391  res.data = std::fabs(data);
392  return res;
393  }
394 
399  DEAL_II_ALWAYS_INLINE
400  VectorizedArray
401  get_max(const VectorizedArray &other) const
402  {
403  VectorizedArray res;
404  res.data = std::max(data, other.data);
405  return res;
406  }
407 
412  DEAL_II_ALWAYS_INLINE
413  VectorizedArray
414  get_min(const VectorizedArray &other) const
415  {
416  VectorizedArray res;
417  res.data = std::min(data, other.data);
418  return res;
419  }
420 
424  template <typename Number2>
425  friend VectorizedArray<Number2>
426  std::sqrt(const VectorizedArray<Number2> &);
427  template <typename Number2>
428  friend VectorizedArray<Number2>
429  std::abs(const VectorizedArray<Number2> &);
430  template <typename Number2>
431  friend VectorizedArray<Number2>
432  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
433  template <typename Number2>
434  friend VectorizedArray<Number2>
435  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
436 };
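/*
 * Illustrative usage sketch (not part of the library header; all variable
 * names are local to the example): the generic class above acts like a
 * single Number with a one-element array interface, so scalar code and
 * vectorized code can share the same source.
 *
 *   VectorizedArray<double> a, b;
 *   a = 2.0;                  // broadcast a scalar to all lanes (one lane here)
 *   b = 3.0;
 *   a += b;                   // lane-wise arithmetic; a[0] == 5.0
 *   double buffer[VectorizedArray<double>::n_array_elements];
 *   a.store(buffer);          // write all lanes to memory
 *   b.load(buffer);           // read them back
 */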
437 
438 
439 
446 template <typename Number>
447 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
448  make_vectorized_array(const Number &u)
449 {
450  VectorizedArray<Number> result;
451  result = u;
452  return result;
453 }
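/*
 * Illustrative sketch: make_vectorized_array() broadcasts a scalar into all
 * lanes, whatever the lane count of the current architecture is, e.g.
 *
 *   const VectorizedArray<float> v = make_vectorized_array(1.5f);
 *   // v[k] == 1.5f for k = 0, ..., VectorizedArray<float>::n_array_elements - 1
 */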
454 
455 
456 
482 template <typename Number>
483 inline void
484 vectorized_load_and_transpose(const unsigned int n_entries,
485  const Number * in,
486  const unsigned int * offsets,
487  VectorizedArray<Number> *    out)
488 {
489  for (unsigned int i = 0; i < n_entries; ++i)
490  for (unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements; ++v)
491  out[i][v] = in[offsets[v] + i];
492 }
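/*
 * Worked example (illustrative, assuming 4 lanes): with
 * offsets = {0, 12, 24, 36} the call
 *
 *   vectorized_load_and_transpose(3, in, offsets, out);
 *
 * performs out[i][v] = in[offsets[v] + i], i.e.
 *
 *   out[0] = {in[0], in[12], in[24], in[36]}
 *   out[1] = {in[1], in[13], in[25], in[37]}
 *   out[2] = {in[2], in[14], in[26], in[38]}
 *
 * so lane v of every output entry walks through the v-th input slot.
 */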
493 
494 
495 
534 template <typename Number>
535 inline void
536 vectorized_transpose_and_store(const bool add_into,
537  const unsigned int n_entries,
538  const VectorizedArray<Number> *in,
539  const unsigned int * offsets,
540  Number * out)
541 {
542  if (add_into)
543  for (unsigned int i = 0; i < n_entries; ++i)
544  for (unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements;
545  ++v)
546  out[offsets[v] + i] += in[i][v];
547  else
548  for (unsigned int i = 0; i < n_entries; ++i)
549  for (unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements;
550  ++v)
551  out[offsets[v] + i] = in[i][v];
552 }
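/*
 * Illustrative sketch: this function is the inverse of
 * vectorized_load_and_transpose(). With add_into == false it writes
 * out[offsets[v] + i] = in[i][v], with add_into == true it accumulates:
 *
 *   vectorized_transpose_and_store(false, 3, in, offsets, out); // overwrite
 *   vectorized_transpose_and_store(true,  3, in, offsets, out); // add on top
 */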
553 
554 
555 
556 // for safety, also check that __AVX512F__ is defined in case the user manually
557 // set some conflicting compile flags which prevent compilation
558 
559 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && defined(__AVX512F__)
560 
564 template <>
565 class VectorizedArray<double>
566 {
567 public:
571  static const unsigned int n_array_elements = 8;
572 
576  DEAL_II_ALWAYS_INLINE
577  VectorizedArray &
578  operator=(const double x)
579  {
580  data = _mm512_set1_pd(x);
581  return *this;
582  }
583 
587  DEAL_II_ALWAYS_INLINE
588  double &operator[](const unsigned int comp)
589  {
590  AssertIndexRange(comp, 8);
591  return *(reinterpret_cast<double *>(&data) + comp);
592  }
593 
597  DEAL_II_ALWAYS_INLINE
598  const double &operator[](const unsigned int comp) const
599  {
600  AssertIndexRange(comp, 8);
601  return *(reinterpret_cast<const double *>(&data) + comp);
602  }
603 
607  DEAL_II_ALWAYS_INLINE
608  VectorizedArray &
609  operator+=(const VectorizedArray &vec)
610  {
611  // if the compiler supports vector arithmetic, we can simply use the +=
612  // operator on the given data type. This allows the compiler to combine
613  // additions with multiplication (fused multiply-add) if those
614  // instructions are available. Otherwise, we need to use the built-in
615  // intrinsic for __m512d.
616 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
617  data += vec.data;
618 # else
619  data = _mm512_add_pd(data, vec.data);
620 # endif
621  return *this;
622  }
623 
627  DEAL_II_ALWAYS_INLINE
628  VectorizedArray &
629  operator-=(const VectorizedArray &vec)
630  {
631 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
632  data -= vec.data;
633 # else
634  data = _mm512_sub_pd(data, vec.data);
635 # endif
636  return *this;
637  }
641  DEAL_II_ALWAYS_INLINE
642  VectorizedArray &
643  operator*=(const VectorizedArray &vec)
644  {
645 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
646  data *= vec.data;
647 # else
648  data = _mm512_mul_pd(data, vec.data);
649 # endif
650  return *this;
651  }
652 
656  DEAL_II_ALWAYS_INLINE
657  VectorizedArray &
658  operator/=(const VectorizedArray &vec)
659  {
660 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
661  data /= vec.data;
662 # else
663  data = _mm512_div_pd(data, vec.data);
664 # endif
665  return *this;
666  }
667 
673  DEAL_II_ALWAYS_INLINE
674  void
675  load(const double *ptr)
676  {
677  data = _mm512_loadu_pd(ptr);
678  }
679 
686  DEAL_II_ALWAYS_INLINE
687  void
688  store(double *ptr) const
689  {
690  _mm512_storeu_pd(ptr, data);
691  }
692 
696  DEAL_II_ALWAYS_INLINE
697  void
698  streaming_store(double *ptr) const
699  {
700  Assert(reinterpret_cast<std::size_t>(ptr) % 64 == 0,
701  ExcMessage("Memory not aligned"));
702  _mm512_stream_pd(ptr, data);
703  }
704 
717  DEAL_II_ALWAYS_INLINE
718  void
719  gather(const double *base_ptr, const unsigned int *offsets)
720  {
721  // unfortunately, there does not appear to be a 256 bit integer load, so
722  // do it by some reinterpret casts here. this is allowed because the Intel
723  // API allows aliasing between different vector types.
724  const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
725  const __m256i index = *((__m256i *)(&index_val));
726  data = _mm512_i32gather_pd(index, base_ptr, 8);
727  }
728 
741  DEAL_II_ALWAYS_INLINE
742  void
743  scatter(const unsigned int *offsets, double *base_ptr) const
744  {
745  for (unsigned int i = 0; i < 8; ++i)
746  for (unsigned int j = i + 1; j < 8; ++j)
747  Assert(offsets[i] != offsets[j],
748  ExcMessage("Result of scatter undefined if two offset elements"
749  " point to the same position"));
750 
751  // unfortunately, there does not appear to be a 256 bit integer load, so
752  // do it by some reinterpret casts here. this is allowed because the Intel
753  // API allows aliasing between different vector types.
754  const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
755  const __m256i index = *((__m256i *)(&index_val));
756  _mm512_i32scatter_pd(base_ptr, index, data, 8);
757  }
758 
763  __m512d data;
764 
765 private:
770  DEAL_II_ALWAYS_INLINE
771  VectorizedArray
772  get_sqrt() const
773  {
774  VectorizedArray res;
775  res.data = _mm512_sqrt_pd(data);
776  return res;
777  }
778 
783  DEAL_II_ALWAYS_INLINE
784  VectorizedArray
785  get_abs() const
786  {
787  // to compute the absolute value, perform bitwise andnot with -0. This
788  // will leave all value and exponent bits unchanged but force the sign
789  // value to +. Since there is no andnot for AVX512, we interpret the data
790  // as 64 bit integers and do the andnot on those types (note that andnot
791  // is a bitwise operation so the data type does not matter)
792  __m512d mask = _mm512_set1_pd(-0.);
793  VectorizedArray res;
794  res.data = (__m512d)_mm512_andnot_epi64((__m512i)mask, (__m512i)data);
795  return res;
796  }
797 
802  DEAL_II_ALWAYS_INLINE
803  VectorizedArray
804  get_max(const VectorizedArray &other) const
805  {
806  VectorizedArray res;
807  res.data = _mm512_max_pd(data, other.data);
808  return res;
809  }
810 
815  DEAL_II_ALWAYS_INLINE
816  VectorizedArray
817  get_min(const VectorizedArray &other) const
818  {
819  VectorizedArray res;
820  res.data = _mm512_min_pd(data, other.data);
821  return res;
822  }
823 
827  template <typename Number2>
828  friend VectorizedArray<Number2>
829  std::sqrt(const VectorizedArray<Number2> &);
830  template <typename Number2>
831  friend VectorizedArray<Number2>
832  std::abs(const VectorizedArray<Number2> &);
833  template <typename Number2>
834  friend VectorizedArray<Number2>
835  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
836  template <typename Number2>
837  friend VectorizedArray<Number2>
838  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
839 };
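/*
 * Illustrative sketch (assuming the AVX-512 specialization above is active,
 * i.e. 8 lanes; all names are local to the example): gather eight doubles
 * that lie 5 entries apart and scatter the doubled values back.
 *
 *   double       storage[40] = {};
 *   unsigned int offsets[8];
 *   for (unsigned int v = 0; v < 8; ++v)
 *     offsets[v] = 5 * v;
 *
 *   VectorizedArray<double> x;
 *   x.gather(storage, offsets);   // x[v] = storage[5 * v]
 *   x += x;
 *   x.scatter(offsets, storage);  // storage[5 * v] = x[v] (offsets are unique)
 */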
840 
841 
842 
846 template <>
847 inline void
848 vectorized_load_and_transpose(const unsigned int n_entries,
849  const double * in,
850  const unsigned int * offsets,
851  VectorizedArray<double> *    out)
852 {
853  const unsigned int n_chunks = n_entries / 4;
854  for (unsigned int outer = 0; outer < 8; outer += 4)
855  {
856  const double *in0 = in + offsets[0 + outer];
857  const double *in1 = in + offsets[1 + outer];
858  const double *in2 = in + offsets[2 + outer];
859  const double *in3 = in + offsets[3 + outer];
860 
861  for (unsigned int i = 0; i < n_chunks; ++i)
862  {
863  __m256d u0 = _mm256_loadu_pd(in0 + 4 * i);
864  __m256d u1 = _mm256_loadu_pd(in1 + 4 * i);
865  __m256d u2 = _mm256_loadu_pd(in2 + 4 * i);
866  __m256d u3 = _mm256_loadu_pd(in3 + 4 * i);
867  __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
868  __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
869  __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
870  __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
871  *(__m256d *)((double *)(&out[4 * i + 0].data) + outer) =
872  _mm256_unpacklo_pd(t0, t1);
873  *(__m256d *)((double *)(&out[4 * i + 1].data) + outer) =
874  _mm256_unpackhi_pd(t0, t1);
875  *(__m256d *)((double *)(&out[4 * i + 2].data) + outer) =
876  _mm256_unpacklo_pd(t2, t3);
877  *(__m256d *)((double *)(&out[4 * i + 3].data) + outer) =
878  _mm256_unpackhi_pd(t2, t3);
879  }
880  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
881  for (unsigned int v = 0; v < 4; ++v)
882  out[i][outer + v] = in[offsets[v + outer] + i];
883  }
884 }
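/*
 * Scalar equivalent of the intrinsic block above (illustrative): for each
 * half 'outer' and each chunk i, the permute/unpack sequence amounts to a
 * 4x4 transpose,
 *
 *   for (unsigned int row = 0; row < 4; ++row)
 *     for (unsigned int col = 0; col < 4; ++col)
 *       out[4 * i + row][outer + col] = in[offsets[outer + col] + 4 * i + row];
 *
 * which matches the remainder loop that handles the last n_entries % 4 rows.
 */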
885 
886 
887 
891 template <>
892 inline void
893 vectorized_transpose_and_store(const bool add_into,
894  const unsigned int n_entries,
895  const VectorizedArray<double> *in,
896  const unsigned int * offsets,
897  double * out)
898 {
899  const unsigned int n_chunks = n_entries / 4;
900  // do not do a full transpose because the code would be too long and would
901  // most likely not pay off. Rather, do the transposition on the smaller
902  // vectorized arrays of half the size, __m256d.
903  for (unsigned int outer = 0; outer < 8; outer += 4)
904  {
905  double *out0 = out + offsets[0 + outer];
906  double *out1 = out + offsets[1 + outer];
907  double *out2 = out + offsets[2 + outer];
908  double *out3 = out + offsets[3 + outer];
909  for (unsigned int i = 0; i < n_chunks; ++i)
910  {
911  __m256d u0 =
912  *(const __m256d *)((const double *)(&in[4 * i + 0].data) + outer);
913  __m256d u1 =
914  *(const __m256d *)((const double *)(&in[4 * i + 1].data) + outer);
915  __m256d u2 =
916  *(const __m256d *)((const double *)(&in[4 * i + 2].data) + outer);
917  __m256d u3 =
918  *(const __m256d *)((const double *)(&in[4 * i + 3].data) + outer);
919  __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
920  __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
921  __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
922  __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
923  __m256d res0 = _mm256_unpacklo_pd(t0, t1);
924  __m256d res1 = _mm256_unpackhi_pd(t0, t1);
925  __m256d res2 = _mm256_unpacklo_pd(t2, t3);
926  __m256d res3 = _mm256_unpackhi_pd(t2, t3);
927 
928  // Cannot use the same store instructions in both paths of the 'if'
929  // because the compiler cannot know that there is no aliasing between
930  // pointers
931  if (add_into)
932  {
933  res0 = _mm256_add_pd(_mm256_loadu_pd(out0 + 4 * i), res0);
934  _mm256_storeu_pd(out0 + 4 * i, res0);
935  res1 = _mm256_add_pd(_mm256_loadu_pd(out1 + 4 * i), res1);
936  _mm256_storeu_pd(out1 + 4 * i, res1);
937  res2 = _mm256_add_pd(_mm256_loadu_pd(out2 + 4 * i), res2);
938  _mm256_storeu_pd(out2 + 4 * i, res2);
939  res3 = _mm256_add_pd(_mm256_loadu_pd(out3 + 4 * i), res3);
940  _mm256_storeu_pd(out3 + 4 * i, res3);
941  }
942  else
943  {
944  _mm256_storeu_pd(out0 + 4 * i, res0);
945  _mm256_storeu_pd(out1 + 4 * i, res1);
946  _mm256_storeu_pd(out2 + 4 * i, res2);
947  _mm256_storeu_pd(out3 + 4 * i, res3);
948  }
949  }
950  if (add_into)
951  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
952  for (unsigned int v = 0; v < 4; ++v)
953  out[offsets[v + outer] + i] += in[i][v + outer];
954  else
955  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
956  for (unsigned int v = 0; v < 4; ++v)
957  out[offsets[v + outer] + i] = in[i][v + outer];
958  }
959 }
960 
961 
962 
966 template <>
967 class VectorizedArray<float>
968 {
969 public:
973  static const unsigned int n_array_elements = 16;
974 
978  DEAL_II_ALWAYS_INLINE
979  VectorizedArray &
980  operator=(const float x)
981  {
982  data = _mm512_set1_ps(x);
983  return *this;
984  }
985 
989  DEAL_II_ALWAYS_INLINE
990  float &operator[](const unsigned int comp)
991  {
992  AssertIndexRange(comp, 16);
993  return *(reinterpret_cast<float *>(&data) + comp);
994  }
995 
999  DEAL_II_ALWAYS_INLINE
1000  const float &operator[](const unsigned int comp) const
1001  {
1002  AssertIndexRange(comp, 16);
1003  return *(reinterpret_cast<const float *>(&data) + comp);
1004  }
1005 
1009  DEAL_II_ALWAYS_INLINE
1010  VectorizedArray &
1011  operator+=(const VectorizedArray &vec)
1012  {
1013  // if the compiler supports vector arithmetic, we can simply use the +=
1014  // operator on the given data type. This allows the compiler to combine
1015  // additions with multiplication (fused multiply-add) if those
1016  // instructions are available. Otherwise, we need to use the built-in
1017  // intrinsic for __m512.
1018 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1019  data += vec.data;
1020 # else
1021  data = _mm512_add_ps(data, vec.data);
1022 # endif
1023  return *this;
1024  }
1025 
1029  DEAL_II_ALWAYS_INLINE
1030  VectorizedArray &
1031  operator-=(const VectorizedArray &vec)
1032  {
1033 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1034  data -= vec.data;
1035 # else
1036  data = _mm512_sub_ps(data, vec.data);
1037 # endif
1038  return *this;
1039  }
1043  DEAL_II_ALWAYS_INLINE
1044  VectorizedArray &
1045  operator*=(const VectorizedArray &vec)
1046  {
1047 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1048  data *= vec.data;
1049 # else
1050  data = _mm512_mul_ps(data, vec.data);
1051 # endif
1052  return *this;
1053  }
1054 
1058  DEAL_II_ALWAYS_INLINE
1059  VectorizedArray &
1060  operator/=(const VectorizedArray &vec)
1061  {
1062 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1063  data /= vec.data;
1064 # else
1065  data = _mm512_div_ps(data, vec.data);
1066 # endif
1067  return *this;
1068  }
1069 
1075  DEAL_II_ALWAYS_INLINE
1076  void
1077  load(const float *ptr)
1078  {
1079  data = _mm512_loadu_ps(ptr);
1080  }
1081 
1088  DEAL_II_ALWAYS_INLINE
1089  void
1090  store(float *ptr) const
1091  {
1092  _mm512_storeu_ps(ptr, data);
1093  }
1094 
1098  DEAL_II_ALWAYS_INLINE
1099  void
1100  streaming_store(float *ptr) const
1101  {
1102  Assert(reinterpret_cast<std::size_t>(ptr) % 64 == 0,
1103  ExcMessage("Memory not aligned"));
1104  _mm512_stream_ps(ptr, data);
1105  }
1106 
1119  DEAL_II_ALWAYS_INLINE
1120  void
1121  gather(const float *base_ptr, const unsigned int *offsets)
1122  {
1123  // unfortunately, there does not appear to be a 512 bit integer load, so
1124  // do it by some reinterpret casts here. this is allowed because the Intel
1125  // API allows aliasing between different vector types.
1126  const __m512 index_val = _mm512_loadu_ps((const float *)offsets);
1127  const __m512i index = *((__m512i *)(&index_val));
1128  data = _mm512_i32gather_ps(index, base_ptr, 4);
1129  }
1130 
1143  DEAL_II_ALWAYS_INLINE
1144  void
1145  scatter(const unsigned int *offsets, float *base_ptr) const
1146  {
1147  for (unsigned int i = 0; i < 16; ++i)
1148  for (unsigned int j = i + 1; j < 16; ++j)
1149  Assert(offsets[i] != offsets[j],
1150  ExcMessage("Result of scatter undefined if two offset elements"
1151  " point to the same position"));
1152 
1153  // unfortunately, there does not appear to be a 512 bit integer load, so
1154  // do it by some reinterpret casts here. this is allowed because the Intel
1155  // API allows aliasing between different vector types.
1156  const __m512 index_val = _mm512_loadu_ps((const float *)offsets);
1157  const __m512i index = *((__m512i *)(&index_val));
1158  _mm512_i32scatter_ps(base_ptr, index, data, 4);
1159  }
1160 
1165  __m512 data;
1166 
1167 private:
1172  DEAL_II_ALWAYS_INLINE
1173  VectorizedArray
1174  get_sqrt() const
1175  {
1176  VectorizedArray res;
1177  res.data = _mm512_sqrt_ps(data);
1178  return res;
1179  }
1180 
1185  DEAL_II_ALWAYS_INLINE
1186  VectorizedArray
1187  get_abs() const
1188  {
1189  // to compute the absolute value, perform bitwise andnot with -0. This
1190  // will leave all value and exponent bits unchanged but force the sign
1191  // value to +. Since there is no andnot for AVX512, we interpret the data
1192  // as 32 bit integers and do the andnot on those types (note that andnot
1193  // is a bitwise operation so the data type does not matter)
1194  __m512 mask = _mm512_set1_ps(-0.f);
1195  VectorizedArray res;
1196  res.data = (__m512)_mm512_andnot_epi32((__m512i)mask, (__m512i)data);
1197  return res;
1198  }
1199 
1204  DEAL_II_ALWAYS_INLINE
1205  VectorizedArray
1206  get_max(const VectorizedArray &other) const
1207  {
1208  VectorizedArray res;
1209  res.data = _mm512_max_ps(data, other.data);
1210  return res;
1211  }
1212 
1217  DEAL_II_ALWAYS_INLINE
1218  VectorizedArray
1219  get_min(const VectorizedArray &other) const
1220  {
1221  VectorizedArray res;
1222  res.data = _mm512_min_ps(data, other.data);
1223  return res;
1224  }
1225 
1229  template <typename Number2>
1230  friend VectorizedArray<Number2>
1231  std::sqrt(const VectorizedArray<Number2> &);
1232  template <typename Number2>
1233  friend VectorizedArray<Number2>
1234  std::abs(const VectorizedArray<Number2> &);
1235  template <typename Number2>
1236  friend VectorizedArray<Number2>
1237  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1238  template <typename Number2>
1239  friend VectorizedArray<Number2>
1240  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1241 };
1242 
1243 
1244 
1248 template <>
1249 inline void
1250 vectorized_load_and_transpose(const unsigned int n_entries,
1251  const float * in,
1252  const unsigned int * offsets,
1253  VectorizedArray<float> *     out)
1254 {
1255  const unsigned int n_chunks = n_entries / 4;
1256  for (unsigned int outer = 0; outer < 16; outer += 8)
1257  {
1258  for (unsigned int i = 0; i < n_chunks; ++i)
1259  {
1260  __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0 + outer]);
1261  __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1 + outer]);
1262  __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2 + outer]);
1263  __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3 + outer]);
1264  __m128 u4 = _mm_loadu_ps(in + 4 * i + offsets[4 + outer]);
1265  __m128 u5 = _mm_loadu_ps(in + 4 * i + offsets[5 + outer]);
1266  __m128 u6 = _mm_loadu_ps(in + 4 * i + offsets[6 + outer]);
1267  __m128 u7 = _mm_loadu_ps(in + 4 * i + offsets[7 + outer]);
1268  // To avoid warnings about uninitialized variables, we need to initialize
1269  // one variable with zero before using it.
1270  __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
1271  t0 = _mm256_insertf128_ps(t3, u0, 0);
1272  t0 = _mm256_insertf128_ps(t0, u4, 1);
1273  t1 = _mm256_insertf128_ps(t3, u1, 0);
1274  t1 = _mm256_insertf128_ps(t1, u5, 1);
1275  t2 = _mm256_insertf128_ps(t3, u2, 0);
1276  t2 = _mm256_insertf128_ps(t2, u6, 1);
1277  t3 = _mm256_insertf128_ps(t3, u3, 0);
1278  t3 = _mm256_insertf128_ps(t3, u7, 1);
1279  __m256 v0 = _mm256_shuffle_ps(t0, t1, 0x44);
1280  __m256 v1 = _mm256_shuffle_ps(t0, t1, 0xee);
1281  __m256 v2 = _mm256_shuffle_ps(t2, t3, 0x44);
1282  __m256 v3 = _mm256_shuffle_ps(t2, t3, 0xee);
1283  *(__m256 *)((float *)(&out[4 * i + 0].data) + outer) =
1284  _mm256_shuffle_ps(v0, v2, 0x88);
1285  *(__m256 *)((float *)(&out[4 * i + 1].data) + outer) =
1286  _mm256_shuffle_ps(v0, v2, 0xdd);
1287  *(__m256 *)((float *)(&out[4 * i + 2].data) + outer) =
1288  _mm256_shuffle_ps(v1, v3, 0x88);
1289  *(__m256 *)((float *)(&out[4 * i + 3].data) + outer) =
1290  _mm256_shuffle_ps(v1, v3, 0xdd);
1291  }
1292  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1293  for (unsigned int v = 0; v < 8; ++v)
1294  out[i][v + outer] = in[offsets[v + outer] + i];
1295  }
1296 }
1297 
1298 
1299 
1303 template <>
1304 inline void
1305 vectorized_transpose_and_store(const bool add_into,
1306  const unsigned int n_entries,
1307  const VectorizedArray<float> *in,
1308  const unsigned int * offsets,
1309  float * out)
1310 {
1311  const unsigned int n_chunks = n_entries / 4;
1312  for (unsigned int outer = 0; outer < 16; outer += 8)
1313  {
1314  for (unsigned int i = 0; i < n_chunks; ++i)
1315  {
1316  __m256 u0 =
1317  *(const __m256 *)((const float *)(&in[4 * i + 0].data) + outer);
1318  __m256 u1 =
1319  *(const __m256 *)((const float *)(&in[4 * i + 1].data) + outer);
1320  __m256 u2 =
1321  *(const __m256 *)((const float *)(&in[4 * i + 2].data) + outer);
1322  __m256 u3 =
1323  *(const __m256 *)((const float *)(&in[4 * i + 3].data) + outer);
1324  __m256 t0 = _mm256_shuffle_ps(u0, u1, 0x44);
1325  __m256 t1 = _mm256_shuffle_ps(u0, u1, 0xee);
1326  __m256 t2 = _mm256_shuffle_ps(u2, u3, 0x44);
1327  __m256 t3 = _mm256_shuffle_ps(u2, u3, 0xee);
1328  u0 = _mm256_shuffle_ps(t0, t2, 0x88);
1329  u1 = _mm256_shuffle_ps(t0, t2, 0xdd);
1330  u2 = _mm256_shuffle_ps(t1, t3, 0x88);
1331  u3 = _mm256_shuffle_ps(t1, t3, 0xdd);
1332  __m128 res0 = _mm256_extractf128_ps(u0, 0);
1333  __m128 res4 = _mm256_extractf128_ps(u0, 1);
1334  __m128 res1 = _mm256_extractf128_ps(u1, 0);
1335  __m128 res5 = _mm256_extractf128_ps(u1, 1);
1336  __m128 res2 = _mm256_extractf128_ps(u2, 0);
1337  __m128 res6 = _mm256_extractf128_ps(u2, 1);
1338  __m128 res3 = _mm256_extractf128_ps(u3, 0);
1339  __m128 res7 = _mm256_extractf128_ps(u3, 1);
1340 
1341  // Cannot use the same store instructions in both paths of the 'if'
1342  // because the compiler cannot know that there is no aliasing between
1343  // pointers
1344  if (add_into)
1345  {
1346  res0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0 + outer]),
1347  res0);
1348  _mm_storeu_ps(out + 4 * i + offsets[0 + outer], res0);
1349  res1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1 + outer]),
1350  res1);
1351  _mm_storeu_ps(out + 4 * i + offsets[1 + outer], res1);
1352  res2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2 + outer]),
1353  res2);
1354  _mm_storeu_ps(out + 4 * i + offsets[2 + outer], res2);
1355  res3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3 + outer]),
1356  res3);
1357  _mm_storeu_ps(out + 4 * i + offsets[3 + outer], res3);
1358  res4 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[4 + outer]),
1359  res4);
1360  _mm_storeu_ps(out + 4 * i + offsets[4 + outer], res4);
1361  res5 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[5 + outer]),
1362  res5);
1363  _mm_storeu_ps(out + 4 * i + offsets[5 + outer], res5);
1364  res6 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[6 + outer]),
1365  res6);
1366  _mm_storeu_ps(out + 4 * i + offsets[6 + outer], res6);
1367  res7 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[7 + outer]),
1368  res7);
1369  _mm_storeu_ps(out + 4 * i + offsets[7 + outer], res7);
1370  }
1371  else
1372  {
1373  _mm_storeu_ps(out + 4 * i + offsets[0 + outer], res0);
1374  _mm_storeu_ps(out + 4 * i + offsets[1 + outer], res1);
1375  _mm_storeu_ps(out + 4 * i + offsets[2 + outer], res2);
1376  _mm_storeu_ps(out + 4 * i + offsets[3 + outer], res3);
1377  _mm_storeu_ps(out + 4 * i + offsets[4 + outer], res4);
1378  _mm_storeu_ps(out + 4 * i + offsets[5 + outer], res5);
1379  _mm_storeu_ps(out + 4 * i + offsets[6 + outer], res6);
1380  _mm_storeu_ps(out + 4 * i + offsets[7 + outer], res7);
1381  }
1382  }
1383  if (add_into)
1384  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1385  for (unsigned int v = 0; v < 8; ++v)
1386  out[offsets[v + outer] + i] += in[i][v + outer];
1387  else
1388  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1389  for (unsigned int v = 0; v < 8; ++v)
1390  out[offsets[v + outer] + i] = in[i][v + outer];
1391  }
1392 }
1393 
1394 
1395 
1396 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && defined(__AVX__)
1397 
1401 template <>
1402 class VectorizedArray<double>
1403 {
1404 public:
1408  static const unsigned int n_array_elements = 4;
1409 
1413  DEAL_II_ALWAYS_INLINE
1414  VectorizedArray &
1415  operator=(const double x)
1416  {
1417  data = _mm256_set1_pd(x);
1418  return *this;
1419  }
1420 
1424  DEAL_II_ALWAYS_INLINE
1425  double &operator[](const unsigned int comp)
1426  {
1427  AssertIndexRange(comp, 4);
1428  return *(reinterpret_cast<double *>(&data) + comp);
1429  }
1430 
1434  DEAL_II_ALWAYS_INLINE
1435  const double &operator[](const unsigned int comp) const
1436  {
1437  AssertIndexRange(comp, 4);
1438  return *(reinterpret_cast<const double *>(&data) + comp);
1439  }
1440 
1444  DEAL_II_ALWAYS_INLINE
1445  VectorizedArray &
1446  operator+=(const VectorizedArray &vec)
1447  {
1448  // if the compiler supports vector arithmetic, we can simply use the +=
1449  // operator on the given data type. This allows the compiler to combine
1450  // additions with multiplication (fused multiply-add) if those
1451  // instructions are available. Otherwise, we need to use the built-in
1452  // intrinsic for __m256d.
1453 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1454  data += vec.data;
1455 # else
1456  data = _mm256_add_pd(data, vec.data);
1457 # endif
1458  return *this;
1459  }
1460 
1464  DEAL_II_ALWAYS_INLINE
1465  VectorizedArray &
1466  operator-=(const VectorizedArray &vec)
1467  {
1468 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1469  data -= vec.data;
1470 # else
1471  data = _mm256_sub_pd(data, vec.data);
1472 # endif
1473  return *this;
1474  }
1478  DEAL_II_ALWAYS_INLINE
1479  VectorizedArray &
1480  operator*=(const VectorizedArray &vec)
1481  {
1482 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1483  data *= vec.data;
1484 # else
1485  data = _mm256_mul_pd(data, vec.data);
1486 # endif
1487  return *this;
1488  }
1489 
1493  DEAL_II_ALWAYS_INLINE
1494  VectorizedArray &
1495  operator/=(const VectorizedArray &vec)
1496  {
1497 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1498  data /= vec.data;
1499 # else
1500  data = _mm256_div_pd(data, vec.data);
1501 # endif
1502  return *this;
1503  }
1504 
1510  DEAL_II_ALWAYS_INLINE
1511  void
1512  load(const double *ptr)
1513  {
1514  data = _mm256_loadu_pd(ptr);
1515  }
1516 
1523  DEAL_II_ALWAYS_INLINE
1524  void
1525  store(double *ptr) const
1526  {
1527  _mm256_storeu_pd(ptr, data);
1528  }
1529 
1533  DEAL_II_ALWAYS_INLINE
1534  void
1535  streaming_store(double *ptr) const
1536  {
1537  Assert(reinterpret_cast<std::size_t>(ptr) % 32 == 0,
1538  ExcMessage("Memory not aligned"));
1539  _mm256_stream_pd(ptr, data);
1540  }
1541 
1554  DEAL_II_ALWAYS_INLINE
1555  void
1556  gather(const double *base_ptr, const unsigned int *offsets)
1557  {
1558 # ifdef __AVX2__
1559  // unfortunately, there does not appear to be a 128 bit integer load, so
1560  // do it by some reinterpret casts here. this is allowed because the Intel
1561  // API allows aliasing between different vector types.
1562  const __m128 index_val = _mm_loadu_ps((const float *)offsets);
1563  const __m128i index = *((__m128i *)(&index_val));
1564  data = _mm256_i32gather_pd(base_ptr, index, 8);
1565 # else
1566  for (unsigned int i = 0; i < 4; ++i)
1567  *(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
1568 # endif
1569  }
1570 
1583  DEAL_II_ALWAYS_INLINE
1584  void
1585  scatter(const unsigned int *offsets, double *base_ptr) const
1586  {
1587  // no scatter operation in AVX/AVX2
1588  for (unsigned int i = 0; i < 4; ++i)
1589  base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
1590  }
1591 
1596  __m256d data;
1597 
1598 private:
1603  DEAL_II_ALWAYS_INLINE
1604  VectorizedArray
1605  get_sqrt() const
1606  {
1607  VectorizedArray res;
1608  res.data = _mm256_sqrt_pd(data);
1609  return res;
1610  }
1611 
1616  DEAL_II_ALWAYS_INLINE
1617  VectorizedArray
1618  get_abs() const
1619  {
1620  // to compute the absolute value, perform bitwise andnot with -0. This
1621  // will leave all value and exponent bits unchanged but force the sign
1622  // value to +.
1623  __m256d mask = _mm256_set1_pd(-0.);
1624  VectorizedArray res;
1625  res.data = _mm256_andnot_pd(mask, data);
1626  return res;
1627  }
1628 
1633  DEAL_II_ALWAYS_INLINE
1634  VectorizedArray
1635  get_max(const VectorizedArray &other) const
1636  {
1637  VectorizedArray res;
1638  res.data = _mm256_max_pd(data, other.data);
1639  return res;
1640  }
1641 
1646  DEAL_II_ALWAYS_INLINE
1647  VectorizedArray
1648  get_min(const VectorizedArray &other) const
1649  {
1650  VectorizedArray res;
1651  res.data = _mm256_min_pd(data, other.data);
1652  return res;
1653  }
1654 
1658  template <typename Number2>
1659  friend VectorizedArray<Number2>
1660  std::sqrt(const VectorizedArray<Number2> &);
1661  template <typename Number2>
1662  friend VectorizedArray<Number2>
1663  std::abs(const VectorizedArray<Number2> &);
1664  template <typename Number2>
1665  friend VectorizedArray<Number2>
1666  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1667  template <typename Number2>
1668  friend VectorizedArray<Number2>
1669  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1670 };
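/*
 * Worked example for the andnot trick in get_abs() above (illustrative):
 * -0.0 carries only the sign bit, so andnot(-0.0, x) clears exactly that bit
 * and leaves exponent and mantissa untouched. For one double lane:
 *
 *   x    = -1.5  ->  0xBFF8000000000000
 *   mask = -0.0  ->  0x8000000000000000
 *   ~mask & x    ->  0x3FF8000000000000  ->  +1.5
 */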
1671 
1672 
1673 
1677 template <>
1678 inline void
1679 vectorized_load_and_transpose(const unsigned int n_entries,
1680  const double * in,
1681  const unsigned int * offsets,
1682  VectorizedArray<double> *    out)
1683 {
1684  const unsigned int n_chunks = n_entries / 4;
1685  const double * in0 = in + offsets[0];
1686  const double * in1 = in + offsets[1];
1687  const double * in2 = in + offsets[2];
1688  const double * in3 = in + offsets[3];
1689 
1690  for (unsigned int i = 0; i < n_chunks; ++i)
1691  {
1692  __m256d u0 = _mm256_loadu_pd(in0 + 4 * i);
1693  __m256d u1 = _mm256_loadu_pd(in1 + 4 * i);
1694  __m256d u2 = _mm256_loadu_pd(in2 + 4 * i);
1695  __m256d u3 = _mm256_loadu_pd(in3 + 4 * i);
1696  __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
1697  __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
1698  __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
1699  __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
1700  out[4 * i + 0].data = _mm256_unpacklo_pd(t0, t1);
1701  out[4 * i + 1].data = _mm256_unpackhi_pd(t0, t1);
1702  out[4 * i + 2].data = _mm256_unpacklo_pd(t2, t3);
1703  out[4 * i + 3].data = _mm256_unpackhi_pd(t2, t3);
1704  }
1705  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1706  for (unsigned int v = 0; v < 4; ++v)
1707  out[i][v] = in[offsets[v] + i];
1708 }
1709 
1710 
1711 
1715 template <>
1716 inline void
1717 vectorized_transpose_and_store(const bool add_into,
1718  const unsigned int n_entries,
1719  const VectorizedArray<double> *in,
1720  const unsigned int * offsets,
1721  double * out)
1722 {
1723  const unsigned int n_chunks = n_entries / 4;
1724  double * out0 = out + offsets[0];
1725  double * out1 = out + offsets[1];
1726  double * out2 = out + offsets[2];
1727  double * out3 = out + offsets[3];
1728  for (unsigned int i = 0; i < n_chunks; ++i)
1729  {
1730  __m256d u0 = in[4 * i + 0].data;
1731  __m256d u1 = in[4 * i + 1].data;
1732  __m256d u2 = in[4 * i + 2].data;
1733  __m256d u3 = in[4 * i + 3].data;
1734  __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
1735  __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
1736  __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
1737  __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
1738  __m256d res0 = _mm256_unpacklo_pd(t0, t1);
1739  __m256d res1 = _mm256_unpackhi_pd(t0, t1);
1740  __m256d res2 = _mm256_unpacklo_pd(t2, t3);
1741  __m256d res3 = _mm256_unpackhi_pd(t2, t3);
1742 
1743  // Cannot use the same store instructions in both paths of the 'if'
1744  // because the compiler cannot know that there is no aliasing between
1745  // pointers
1746  if (add_into)
1747  {
1748  res0 = _mm256_add_pd(_mm256_loadu_pd(out0 + 4 * i), res0);
1749  _mm256_storeu_pd(out0 + 4 * i, res0);
1750  res1 = _mm256_add_pd(_mm256_loadu_pd(out1 + 4 * i), res1);
1751  _mm256_storeu_pd(out1 + 4 * i, res1);
1752  res2 = _mm256_add_pd(_mm256_loadu_pd(out2 + 4 * i), res2);
1753  _mm256_storeu_pd(out2 + 4 * i, res2);
1754  res3 = _mm256_add_pd(_mm256_loadu_pd(out3 + 4 * i), res3);
1755  _mm256_storeu_pd(out3 + 4 * i, res3);
1756  }
1757  else
1758  {
1759  _mm256_storeu_pd(out0 + 4 * i, res0);
1760  _mm256_storeu_pd(out1 + 4 * i, res1);
1761  _mm256_storeu_pd(out2 + 4 * i, res2);
1762  _mm256_storeu_pd(out3 + 4 * i, res3);
1763  }
1764  }
1765  if (add_into)
1766  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1767  for (unsigned int v = 0; v < 4; ++v)
1768  out[offsets[v] + i] += in[i][v];
1769  else
1770  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1771  for (unsigned int v = 0; v < 4; ++v)
1772  out[offsets[v] + i] = in[i][v];
1773 }
1774 
1775 
1776 
1780 template <>
1781 class VectorizedArray<float>
1782 {
1783 public:
1787  static const unsigned int n_array_elements = 8;
1788 
1792  DEAL_II_ALWAYS_INLINE
1793  VectorizedArray &
1794  operator=(const float x)
1795  {
1796  data = _mm256_set1_ps(x);
1797  return *this;
1798  }
1799 
1803  DEAL_II_ALWAYS_INLINE
1804  float &operator[](const unsigned int comp)
1805  {
1806  AssertIndexRange(comp, 8);
1807  return *(reinterpret_cast<float *>(&data) + comp);
1808  }
1809 
1813  DEAL_II_ALWAYS_INLINE
1814  const float &operator[](const unsigned int comp) const
1815  {
1816  AssertIndexRange(comp, 8);
1817  return *(reinterpret_cast<const float *>(&data) + comp);
1818  }
1819 
1823  DEAL_II_ALWAYS_INLINE
1824  VectorizedArray &
1825  operator+=(const VectorizedArray &vec)
1826  {
1827  // if the compiler supports vector arithmetic, we can simply use the +=
1828  // operator on the given data type. This allows the compiler to combine
1829  // additions with multiplication (fused multiply-add) if those
1830  // instructions are available. Otherwise, we need to use the built-in
1831  // intrinsic for __m256.
1832 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1833  data += vec.data;
1834 # else
1835  data = _mm256_add_ps(data, vec.data);
1836 # endif
1837  return *this;
1838  }
1839 
1843  DEAL_II_ALWAYS_INLINE
1844  VectorizedArray &
1845  operator-=(const VectorizedArray &vec)
1846  {
1847 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1848  data -= vec.data;
1849 # else
1850  data = _mm256_sub_ps(data, vec.data);
1851 # endif
1852  return *this;
1853  }
1857  DEAL_II_ALWAYS_INLINE
1858  VectorizedArray &
1859  operator*=(const VectorizedArray &vec)
1860  {
1861 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1862  data *= vec.data;
1863 # else
1864  data = _mm256_mul_ps(data, vec.data);
1865 # endif
1866  return *this;
1867  }
1868 
1872  DEAL_II_ALWAYS_INLINE
1873  VectorizedArray &
1874  operator/=(const VectorizedArray &vec)
1875  {
1876 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1877  data /= vec.data;
1878 # else
1879  data = _mm256_div_ps(data, vec.data);
1880 # endif
1881  return *this;
1882  }
1883 
1889  DEAL_II_ALWAYS_INLINE
1890  void
1891  load(const float *ptr)
1892  {
1893  data = _mm256_loadu_ps(ptr);
1894  }
1895 
1902  DEAL_II_ALWAYS_INLINE
1903  void
1904  store(float *ptr) const
1905  {
1906  _mm256_storeu_ps(ptr, data);
1907  }
1908 
1912  DEAL_II_ALWAYS_INLINE
1913  void
1914  streaming_store(float *ptr) const
1915  {
1916  Assert(reinterpret_cast<std::size_t>(ptr) % 32 == 0,
1917  ExcMessage("Memory not aligned"));
1918  _mm256_stream_ps(ptr, data);
1919  }
1920 
1933  DEAL_II_ALWAYS_INLINE
1934  void
1935  gather(const float *base_ptr, const unsigned int *offsets)
1936  {
1937 # ifdef __AVX2__
1938  // unfortunately, there does not appear to be a 256 bit integer load, so
1939  // do it by some reinterpret casts here. this is allowed because the Intel
1940  // API allows aliasing between different vector types.
1941  const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
1942  const __m256i index = *((__m256i *)(&index_val));
1943  data = _mm256_i32gather_ps(base_ptr, index, 4);
1944 # else
1945  for (unsigned int i = 0; i < 8; ++i)
1946  *(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
1947 # endif
1948  }
1949 
1962  DEAL_II_ALWAYS_INLINE
1963  void
1964  scatter(const unsigned int *offsets, float *base_ptr) const
1965  {
1966  // no scatter operation in AVX/AVX2
1967  for (unsigned int i = 0; i < 8; ++i)
1968  base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
1969  }
1970 
1975  __m256 data;
1976 
1977 private:
1982  DEAL_II_ALWAYS_INLINE
1983  VectorizedArray
1984  get_sqrt() const
1985  {
1986  VectorizedArray res;
1987  res.data = _mm256_sqrt_ps(data);
1988  return res;
1989  }
1990 
1995  DEAL_II_ALWAYS_INLINE
1996  VectorizedArray
1997  get_abs() const
1998  {
1999  // to compute the absolute value, perform bitwise andnot with -0. This
2000  // will leave all value and exponent bits unchanged but force the sign
2001  // value to +.
2002  __m256 mask = _mm256_set1_ps(-0.f);
2003  VectorizedArray res;
2004  res.data = _mm256_andnot_ps(mask, data);
2005  return res;
2006  }
2007 
2012  DEAL_II_ALWAYS_INLINE
2013  VectorizedArray
2014  get_max(const VectorizedArray &other) const
2015  {
2016  VectorizedArray res;
2017  res.data = _mm256_max_ps(data, other.data);
2018  return res;
2019  }
2020 
2025  DEAL_II_ALWAYS_INLINE
2026  VectorizedArray
2027  get_min(const VectorizedArray &other) const
2028  {
2029  VectorizedArray res;
2030  res.data = _mm256_min_ps(data, other.data);
2031  return res;
2032  }
2033 
2037  template <typename Number2>
2038  friend VectorizedArray<Number2>
2039  std::sqrt(const VectorizedArray<Number2> &);
2040  template <typename Number2>
2041  friend VectorizedArray<Number2>
2042  std::abs(const VectorizedArray<Number2> &);
2043  template <typename Number2>
2044  friend VectorizedArray<Number2>
2045  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2046  template <typename Number2>
2047  friend VectorizedArray<Number2>
2048  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2049 };
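/*
 * Note on gather() above (illustrative): with AVX2 the eight element loads
 * collapse into a single _mm256_i32gather_ps, without AVX2 the fallback loop
 * fills the lanes one by one; in both cases the result is
 *
 *   data[v] = base_ptr[offsets[v]],   v = 0, ..., 7.
 */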
2050 
2051 
2052 
2056 template <>
2057 inline void
2058 vectorized_load_and_transpose(const unsigned int n_entries,
2059  const float * in,
2060  const unsigned int * offsets,
2061  VectorizedArray<float> *     out)
2062 {
2063  const unsigned int n_chunks = n_entries / 4;
2064  for (unsigned int i = 0; i < n_chunks; ++i)
2065  {
2066  __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0]);
2067  __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1]);
2068  __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2]);
2069  __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3]);
2070  __m128 u4 = _mm_loadu_ps(in + 4 * i + offsets[4]);
2071  __m128 u5 = _mm_loadu_ps(in + 4 * i + offsets[5]);
2072  __m128 u6 = _mm_loadu_ps(in + 4 * i + offsets[6]);
2073  __m128 u7 = _mm_loadu_ps(in + 4 * i + offsets[7]);
2074  // To avoid warnings about uninitialized variables, we need to initialize
2075  // one variable with zero before using it.
2076  __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
2077  t0 = _mm256_insertf128_ps(t3, u0, 0);
2078  t0 = _mm256_insertf128_ps(t0, u4, 1);
2079  t1 = _mm256_insertf128_ps(t3, u1, 0);
2080  t1 = _mm256_insertf128_ps(t1, u5, 1);
2081  t2 = _mm256_insertf128_ps(t3, u2, 0);
2082  t2 = _mm256_insertf128_ps(t2, u6, 1);
2083  t3 = _mm256_insertf128_ps(t3, u3, 0);
2084  t3 = _mm256_insertf128_ps(t3, u7, 1);
2085  __m256 v0 = _mm256_shuffle_ps(t0, t1, 0x44);
2086  __m256 v1 = _mm256_shuffle_ps(t0, t1, 0xee);
2087  __m256 v2 = _mm256_shuffle_ps(t2, t3, 0x44);
2088  __m256 v3 = _mm256_shuffle_ps(t2, t3, 0xee);
2089  out[4 * i + 0].data = _mm256_shuffle_ps(v0, v2, 0x88);
2090  out[4 * i + 1].data = _mm256_shuffle_ps(v0, v2, 0xdd);
2091  out[4 * i + 2].data = _mm256_shuffle_ps(v1, v3, 0x88);
2092  out[4 * i + 3].data = _mm256_shuffle_ps(v1, v3, 0xdd);
2093  }
2094  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2095  for (unsigned int v = 0; v < 8; ++v)
2096  out[i][v] = in[offsets[v] + i];
2097 }
2098 
2099 
2100 
2104 template <>
2105 inline void
2106 vectorized_transpose_and_store(const bool add_into,
2107  const unsigned int n_entries,
2108  const VectorizedArray<float> *in,
2109  const unsigned int * offsets,
2110  float * out)
2111 {
2112  const unsigned int n_chunks = n_entries / 4;
2113  for (unsigned int i = 0; i < n_chunks; ++i)
2114  {
2115  __m256 u0 = in[4 * i + 0].data;
2116  __m256 u1 = in[4 * i + 1].data;
2117  __m256 u2 = in[4 * i + 2].data;
2118  __m256 u3 = in[4 * i + 3].data;
2119  __m256 t0 = _mm256_shuffle_ps(u0, u1, 0x44);
2120  __m256 t1 = _mm256_shuffle_ps(u0, u1, 0xee);
2121  __m256 t2 = _mm256_shuffle_ps(u2, u3, 0x44);
2122  __m256 t3 = _mm256_shuffle_ps(u2, u3, 0xee);
2123  u0 = _mm256_shuffle_ps(t0, t2, 0x88);
2124  u1 = _mm256_shuffle_ps(t0, t2, 0xdd);
2125  u2 = _mm256_shuffle_ps(t1, t3, 0x88);
2126  u3 = _mm256_shuffle_ps(t1, t3, 0xdd);
2127  __m128 res0 = _mm256_extractf128_ps(u0, 0);
2128  __m128 res4 = _mm256_extractf128_ps(u0, 1);
2129  __m128 res1 = _mm256_extractf128_ps(u1, 0);
2130  __m128 res5 = _mm256_extractf128_ps(u1, 1);
2131  __m128 res2 = _mm256_extractf128_ps(u2, 0);
2132  __m128 res6 = _mm256_extractf128_ps(u2, 1);
2133  __m128 res3 = _mm256_extractf128_ps(u3, 0);
2134  __m128 res7 = _mm256_extractf128_ps(u3, 1);
2135 
2136  // Cannot use the same store instructions in both paths of the 'if'
2137  // because the compiler cannot know that there is no aliasing between
2138  // pointers
2139  if (add_into)
2140  {
2141  res0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0]), res0);
2142  _mm_storeu_ps(out + 4 * i + offsets[0], res0);
2143  res1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1]), res1);
2144  _mm_storeu_ps(out + 4 * i + offsets[1], res1);
2145  res2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2]), res2);
2146  _mm_storeu_ps(out + 4 * i + offsets[2], res2);
2147  res3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3]), res3);
2148  _mm_storeu_ps(out + 4 * i + offsets[3], res3);
2149  res4 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[4]), res4);
2150  _mm_storeu_ps(out + 4 * i + offsets[4], res4);
2151  res5 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[5]), res5);
2152  _mm_storeu_ps(out + 4 * i + offsets[5], res5);
2153  res6 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[6]), res6);
2154  _mm_storeu_ps(out + 4 * i + offsets[6], res6);
2155  res7 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[7]), res7);
2156  _mm_storeu_ps(out + 4 * i + offsets[7], res7);
2157  }
2158  else
2159  {
2160  _mm_storeu_ps(out + 4 * i + offsets[0], res0);
2161  _mm_storeu_ps(out + 4 * i + offsets[1], res1);
2162  _mm_storeu_ps(out + 4 * i + offsets[2], res2);
2163  _mm_storeu_ps(out + 4 * i + offsets[3], res3);
2164  _mm_storeu_ps(out + 4 * i + offsets[4], res4);
2165  _mm_storeu_ps(out + 4 * i + offsets[5], res5);
2166  _mm_storeu_ps(out + 4 * i + offsets[6], res6);
2167  _mm_storeu_ps(out + 4 * i + offsets[7], res7);
2168  }
2169  }
2170  if (add_into)
2171  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2172  for (unsigned int v = 0; v < 8; ++v)
2173  out[offsets[v] + i] += in[i][v];
2174  else
2175  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2176  for (unsigned int v = 0; v < 8; ++v)
2177  out[offsets[v] + i] = in[i][v];
2178 }
2179 
2180 
2181 
2182 // for safety, also check that __SSE2__ is defined in case the user manually
2183 // set some conflicting compile flags which prevent compilation
2184 
2185 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
2186 
2190 template <>
2191 class VectorizedArray<double>
2192 {
2193 public:
2197  static const unsigned int n_array_elements = 2;
2198 
2202  DEAL_II_ALWAYS_INLINE
2203  VectorizedArray &
2204  operator=(const double x)
2205  {
2206  data = _mm_set1_pd(x);
2207  return *this;
2208  }
2209 
2213  DEAL_II_ALWAYS_INLINE
2214  double &operator[](const unsigned int comp)
2215  {
2216  AssertIndexRange(comp, 2);
2217  return *(reinterpret_cast<double *>(&data) + comp);
2218  }
2219 
2223  DEAL_II_ALWAYS_INLINE
2224  const double &operator[](const unsigned int comp) const
2225  {
2226  AssertIndexRange(comp, 2);
2227  return *(reinterpret_cast<const double *>(&data) + comp);
2228  }
2229 
2233  DEAL_II_ALWAYS_INLINE
2234  VectorizedArray &
2235  operator+=(const VectorizedArray &vec)
2236  {
2237 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2238  data += vec.data;
2239 # else
2240  data = _mm_add_pd(data, vec.data);
2241 # endif
2242  return *this;
2243  }
2244 
2248  DEAL_II_ALWAYS_INLINE
2249  VectorizedArray &
2250  operator-=(const VectorizedArray &vec)
2251  {
2252 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2253  data -= vec.data;
2254 # else
2255  data = _mm_sub_pd(data, vec.data);
2256 # endif
2257  return *this;
2258  }
2259 
2263  DEAL_II_ALWAYS_INLINE
2264  VectorizedArray &
2265  operator*=(const VectorizedArray &vec)
2266  {
2267 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2268  data *= vec.data;
2269 # else
2270  data = _mm_mul_pd(data, vec.data);
2271 # endif
2272  return *this;
2273  }
2274 
2278  DEAL_II_ALWAYS_INLINE
2279  VectorizedArray &
2280  operator/=(const VectorizedArray &vec)
2281  {
2282 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2283  data /= vec.data;
2284 # else
2285  data = _mm_div_pd(data, vec.data);
2286 # endif
2287  return *this;
2288  }
2289 
2295  DEAL_II_ALWAYS_INLINE
2296  void
2297  load(const double *ptr)
2298  {
2299  data = _mm_loadu_pd(ptr);
2300  }
2301 
2308  DEAL_II_ALWAYS_INLINE
2309  void
2310  store(double *ptr) const
2311  {
2312  _mm_storeu_pd(ptr, data);
2313  }
2314 
2318  DEAL_II_ALWAYS_INLINE
2319  void
2320  streaming_store(double *ptr) const
2321  {
2322  Assert(reinterpret_cast<std::size_t>(ptr) % 16 == 0,
2323  ExcMessage("Memory not aligned"));
2324  _mm_stream_pd(ptr, data);
2325  }
2326 
2339  DEAL_II_ALWAYS_INLINE
2340  void
2341  gather(const double *base_ptr, const unsigned int *offsets)
2342  {
2343  for (unsigned int i = 0; i < 2; ++i)
2344  *(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
2345  }
2346 
2359  DEAL_II_ALWAYS_INLINE
2360  void
2361  scatter(const unsigned int *offsets, double *base_ptr) const
2362  {
2363  for (unsigned int i = 0; i < 2; ++i)
2364  base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
2365  }
2366 
2371  __m128d data;
2372 
2373 private:
2378  DEAL_II_ALWAYS_INLINE
2379  VectorizedArray
2380  get_sqrt() const
2381  {
2382  VectorizedArray res;
2383  res.data = _mm_sqrt_pd(data);
2384  return res;
2385  }
2386 
2391  DEAL_II_ALWAYS_INLINE
2392  VectorizedArray
2393  get_abs() const
2394  {
2395  // to compute the absolute value, perform
2396  // bitwise andnot with -0. This will leave all
2397  // value and exponent bits unchanged but force
2398  // the sign value to +.
2399  __m128d mask = _mm_set1_pd(-0.);
2400  VectorizedArray res;
2401  res.data = _mm_andnot_pd(mask, data);
2402  return res;
2403  }
2404 
2409  DEAL_II_ALWAYS_INLINE
2410  VectorizedArray
2411  get_max(const VectorizedArray &other) const
2412  {
2413  VectorizedArray res;
2414  res.data = _mm_max_pd(data, other.data);
2415  return res;
2416  }
2417 
2422  DEAL_II_ALWAYS_INLINE
2423  VectorizedArray
2424  get_min(const VectorizedArray &other) const
2425  {
2426  VectorizedArray res;
2427  res.data = _mm_min_pd(data, other.data);
2428  return res;
2429  }
2430 
2434  template <typename Number2>
2435  friend VectorizedArray<Number2>
2436  std::sqrt(const VectorizedArray<Number2> &);
2437  template <typename Number2>
2438  friend VectorizedArray<Number2>
2439  std::abs(const VectorizedArray<Number2> &);
2440  template <typename Number2>
2441  friend VectorizedArray<Number2>
2442  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2443  template <typename Number2>
2444  friend VectorizedArray<Number2>
2445  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2446 };
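/*
 * Illustrative sketch for streaming_store() (all names are local to the
 * example; deal.II's AlignedVector is the usual way to obtain such memory):
 * the destination must be aligned to the register width, here 16 bytes
 * (32/64 bytes for the AVX/AVX-512 specializations above). With plain C++17:
 *
 *   double *buffer =
 *     static_cast<double *>(std::aligned_alloc(16, 16 * sizeof(double)));
 *   VectorizedArray<double> v;
 *   v = 1.0;
 *   for (unsigned int i = 0; i < 16; i += VectorizedArray<double>::n_array_elements)
 *     v.streaming_store(buffer + i);   // non-temporal store, bypasses the cache
 *   std::free(buffer);
 */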
2447 
2448 
2449 
2453 template <>
2454 inline void
2455 vectorized_load_and_transpose(const unsigned int n_entries,
2456  const double * in,
2457  const unsigned int * offsets,
2458  VectorizedArray<double> *    out)
2459 {
2460  const unsigned int n_chunks = n_entries / 2;
2461  for (unsigned int i = 0; i < n_chunks; ++i)
2462  {
2463  __m128d u0 = _mm_loadu_pd(in + 2 * i + offsets[0]);
2464  __m128d u1 = _mm_loadu_pd(in + 2 * i + offsets[1]);
2465  out[2 * i + 0].data = _mm_unpacklo_pd(u0, u1);
2466  out[2 * i + 1].data = _mm_unpackhi_pd(u0, u1);
2467  }
2468  for (unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2469  for (unsigned int v = 0; v < 2; ++v)
2470  out[i][v] = in[offsets[v] + i];
2471 }
2472 
2473 
2474 
2478 template <>
2479 inline void
2480 vectorized_transpose_and_store(const bool add_into,
2481  const unsigned int n_entries,
2482  const VectorizedArray<double> *in,
2483  const unsigned int * offsets,
2484  double * out)
2485 {
2486  const unsigned int n_chunks = n_entries / 2;
2487  if (add_into)
2488  {
2489  for (unsigned int i = 0; i < n_chunks; ++i)
2490  {
2491  __m128d u0 = in[2 * i + 0].data;
2492  __m128d u1 = in[2 * i + 1].data;
2493  __m128d res0 = _mm_unpacklo_pd(u0, u1);
2494  __m128d res1 = _mm_unpackhi_pd(u0, u1);
2495  _mm_storeu_pd(out + 2 * i + offsets[0],
2496  _mm_add_pd(_mm_loadu_pd(out + 2 * i + offsets[0]),
2497  res0));
2498  _mm_storeu_pd(out + 2 * i + offsets[1],
2499  _mm_add_pd(_mm_loadu_pd(out + 2 * i + offsets[1]),
2500  res1));
2501  }
2502  for (unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2503  for (unsigned int v = 0; v < 2; ++v)
2504  out[offsets[v] + i] += in[i][v];
2505  }
2506  else
2507  {
2508  for (unsigned int i = 0; i < n_chunks; ++i)
2509  {
2510  __m128d u0 = in[2 * i + 0].data;
2511  __m128d u1 = in[2 * i + 1].data;
2512  __m128d res0 = _mm_unpacklo_pd(u0, u1);
2513  __m128d res1 = _mm_unpackhi_pd(u0, u1);
2514  _mm_storeu_pd(out + 2 * i + offsets[0], res0);
2515  _mm_storeu_pd(out + 2 * i + offsets[1], res1);
2516  }
2517  for (unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2518  for (unsigned int v = 0; v < 2; ++v)
2519  out[offsets[v] + i] = in[i][v];
2520  }
2521 }
2522 
2523 
2524 
2528 template <>
2529 class VectorizedArray<float>
2530 {
2531 public:
2535  static const unsigned int n_array_elements = 4;
2536 
2541  DEAL_II_ALWAYS_INLINE
2542  VectorizedArray &
2543  operator=(const float x)
2544  {
2545  data = _mm_set1_ps(x);
2546  return *this;
2547  }
2548 
2552  DEAL_II_ALWAYS_INLINE
2553  float &operator[](const unsigned int comp)
2554  {
2555  AssertIndexRange(comp, 4);
2556  return *(reinterpret_cast<float *>(&data) + comp);
2557  }
2558 
2562  DEAL_II_ALWAYS_INLINE
2563  const float &operator[](const unsigned int comp) const
2564  {
2565  AssertIndexRange(comp, 4);
2566  return *(reinterpret_cast<const float *>(&data) + comp);
2567  }
2568 
2572  DEAL_II_ALWAYS_INLINE
2573  VectorizedArray &
2574  operator+=(const VectorizedArray &vec)
2575  {
2576 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2577  data += vec.data;
2578 # else
2579  data = _mm_add_ps(data, vec.data);
2580 # endif
2581  return *this;
2582  }
2583 
2587  DEAL_II_ALWAYS_INLINE
2588  VectorizedArray &
2589  operator-=(const VectorizedArray &vec)
2590  {
2591 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2592  data -= vec.data;
2593 # else
2594  data = _mm_sub_ps(data, vec.data);
2595 # endif
2596  return *this;
2597  }
2598 
2602  DEAL_II_ALWAYS_INLINE
2603  VectorizedArray &
2604  operator*=(const VectorizedArray &vec)
2605  {
2606 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2607  data *= vec.data;
2608 # else
2609  data = _mm_mul_ps(data, vec.data);
2610 # endif
2611  return *this;
2612  }
2613 
2617  DEAL_II_ALWAYS_INLINE
2618  VectorizedArray &
2619  operator/=(const VectorizedArray &vec)
2620  {
2621 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
2622  data /= vec.data;
2623 # else
2624  data = _mm_div_ps(data, vec.data);
2625 # endif
2626  return *this;
2627  }
2628 
2634  DEAL_II_ALWAYS_INLINE
2635  void
2636  load(const float *ptr)
2637  {
2638  data = _mm_loadu_ps(ptr);
2639  }
2640 
2647  DEAL_II_ALWAYS_INLINE
2648  void
2649  store(float *ptr) const
2650  {
2651  _mm_storeu_ps(ptr, data);
2652  }
2653 
2657  DEAL_II_ALWAYS_INLINE
2658  void
2659  streaming_store(float *ptr) const
2660  {
2661  Assert(reinterpret_cast<std::size_t>(ptr) % 16 == 0,
2662  ExcMessage("Memory not aligned"));
2663  _mm_stream_ps(ptr, data);
2664  }
2665 
2678  DEAL_II_ALWAYS_INLINE
2679  void
2680  gather(const float *base_ptr, const unsigned int *offsets)
2681  {
2682  for (unsigned int i = 0; i < 4; ++i)
2683  *(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
2684  }
2685 
2698  DEAL_II_ALWAYS_INLINE
2699  void
2700  scatter(const unsigned int *offsets, float *base_ptr) const
2701  {
2702  for (unsigned int i = 0; i < 4; ++i)
2703  base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
2704  }
2705 
2710  __m128 data;
2711 
2712 private:
2717  DEAL_II_ALWAYS_INLINE
2718  VectorizedArray
2719  get_sqrt() const
2720  {
2721  VectorizedArray res;
2722  res.data = _mm_sqrt_ps(data);
2723  return res;
2724  }
2725 
2730  DEAL_II_ALWAYS_INLINE
2731  VectorizedArray
2732  get_abs() const
2733  {
2734  // To compute the absolute value, perform a bitwise andnot with -0.f:
2735  // since -0.f has only the sign bit set, this leaves mantissa and exponent
2736  // bits unchanged and forces the sign to +.
2737  __m128 mask = _mm_set1_ps(-0.f);
2738  VectorizedArray res;
2739  res.data = _mm_andnot_ps(mask, data);
2740  return res;
2741  }
2742 
2747  DEAL_II_ALWAYS_INLINE
2748  VectorizedArray
2749  get_max(const VectorizedArray &other) const
2750  {
2751  VectorizedArray res;
2752  res.data = _mm_max_ps(data, other.data);
2753  return res;
2754  }
2755 
2760  DEAL_II_ALWAYS_INLINE
2761  VectorizedArray
2762  get_min(const VectorizedArray &other) const
2763  {
2764  VectorizedArray res;
2765  res.data = _mm_min_ps(data, other.data);
2766  return res;
2767  }
2768 
2772  template <typename Number2>
2773  friend VectorizedArray<Number2>
2774  std::sqrt(const VectorizedArray<Number2> &);
2775  template <typename Number2>
2776  friend VectorizedArray<Number2>
2777  std::abs(const VectorizedArray<Number2> &);
2778  template <typename Number2>
2779  friend VectorizedArray<Number2>
2780  std::max(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2781  template <typename Number2>
2782  friend VectorizedArray<Number2>
2783  std::min(const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
2784 };
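A short usage sketch of the SSE2 float specialization above (illustrative only, with made-up data; assumes the header is included and the dealii namespace is in scope):

    float buffer[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    VectorizedArray<float> a, b;
    a.load(buffer);                    // unaligned load of buffer[0..3]
    b = 2.f;                           // broadcast a scalar to all four lanes
    a *= b;                            // element-wise multiply (_mm_mul_ps)
    const unsigned int idx[4] = {0, 2, 4, 6};
    VectorizedArray<float> c;
    c.gather(buffer, idx);             // c = {buffer[0], buffer[2], buffer[4], buffer[6]}
    a.store(buffer + 4);               // write a back into buffer[4..7]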
2785 
2786 
2787 
2791 template <>
2792 inline void
2793 vectorized_load_and_transpose(const unsigned int n_entries,
2794  const float * in,
2795  const unsigned int * offsets,
2796  VectorizedArray<float> * out)
2797 {
2798  const unsigned int n_chunks = n_entries / 4;
2799  for (unsigned int i = 0; i < n_chunks; ++i)
2800  {
2801  __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0]);
2802  __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1]);
2803  __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2]);
2804  __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3]);
2805  __m128 v0 = _mm_shuffle_ps(u0, u1, 0x44);
2806  __m128 v1 = _mm_shuffle_ps(u0, u1, 0xee);
2807  __m128 v2 = _mm_shuffle_ps(u2, u3, 0x44);
2808  __m128 v3 = _mm_shuffle_ps(u2, u3, 0xee);
2809  out[4 * i + 0].data = _mm_shuffle_ps(v0, v2, 0x88);
2810  out[4 * i + 1].data = _mm_shuffle_ps(v0, v2, 0xdd);
2811  out[4 * i + 2].data = _mm_shuffle_ps(v1, v3, 0x88);
2812  out[4 * i + 3].data = _mm_shuffle_ps(v1, v3, 0xdd);
2813  }
2814  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2815  for (unsigned int v = 0; v < 4; ++v)
2816  out[i][v] = in[offsets[v] + i];
2817 }
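A sketch of the gather direction (illustrative layout only, with made-up sizes): the function reads n_entries values from each of the four scalar streams starting at in + offsets[v] and packs them so that out[i][v] == in[offsets[v] + i].

    const unsigned int     n_entries  = 8;
    float                  in[4 * 16] = {};   // four rows of 16 floats each
    const unsigned int     offsets[4] = {0, 16, 32, 48};
    VectorizedArray<float> out[8];
    vectorized_load_and_transpose(n_entries, in, offsets, out);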
2818 
2819 
2820 
2824 template <>
2825 inline void
2826 vectorized_transpose_and_store(const bool add_into,
2827  const unsigned int n_entries,
2828  const VectorizedArray<float> *in,
2829  const unsigned int * offsets,
2830  float * out)
2831 {
2832  const unsigned int n_chunks = n_entries / 4;
2833  for (unsigned int i = 0; i < n_chunks; ++i)
2834  {
2835  __m128 u0 = in[4 * i + 0].data;
2836  __m128 u1 = in[4 * i + 1].data;
2837  __m128 u2 = in[4 * i + 2].data;
2838  __m128 u3 = in[4 * i + 3].data;
2839  __m128 t0 = _mm_shuffle_ps(u0, u1, 0x44);
2840  __m128 t1 = _mm_shuffle_ps(u0, u1, 0xee);
2841  __m128 t2 = _mm_shuffle_ps(u2, u3, 0x44);
2842  __m128 t3 = _mm_shuffle_ps(u2, u3, 0xee);
2843  u0 = _mm_shuffle_ps(t0, t2, 0x88);
2844  u1 = _mm_shuffle_ps(t0, t2, 0xdd);
2845  u2 = _mm_shuffle_ps(t1, t3, 0x88);
2846  u3 = _mm_shuffle_ps(t1, t3, 0xdd);
2847 
2848  // Cannot use the same store instructions in both paths of the 'if'
2849  // because the compiler cannot know that there is no aliasing between
2850  // the 'in' and 'out' pointers
2851  if (add_into)
2852  {
2853  u0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0]), u0);
2854  _mm_storeu_ps(out + 4 * i + offsets[0], u0);
2855  u1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1]), u1);
2856  _mm_storeu_ps(out + 4 * i + offsets[1], u1);
2857  u2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2]), u2);
2858  _mm_storeu_ps(out + 4 * i + offsets[2], u2);
2859  u3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3]), u3);
2860  _mm_storeu_ps(out + 4 * i + offsets[3], u3);
2861  }
2862  else
2863  {
2864  _mm_storeu_ps(out + 4 * i + offsets[0], u0);
2865  _mm_storeu_ps(out + 4 * i + offsets[1], u1);
2866  _mm_storeu_ps(out + 4 * i + offsets[2], u2);
2867  _mm_storeu_ps(out + 4 * i + offsets[3], u3);
2868  }
2869  }
2870  if (add_into)
2871  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2872  for (unsigned int v = 0; v < 4; ++v)
2873  out[offsets[v] + i] += in[i][v];
2874  else
2875  for (unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2876  for (unsigned int v = 0; v < 4; ++v)
2877  out[offsets[v] + i] = in[i][v];
2878 }
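The inverse direction, as a sketch with the same made-up layout: with add_into == true the results are accumulated, i.e. out[offsets[v] + i] += in[i][v].

    const unsigned int     n_entries  = 8;
    VectorizedArray<float> in[8];
    for (unsigned int i = 0; i < 8; ++i)
      in[i] = float(i);                        // broadcast i to all four lanes
    const unsigned int     offsets[4] = {0, 16, 32, 48};
    float                  out[4 * 16] = {};
    vectorized_transpose_and_store(true, n_entries, in, offsets, out);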
2879 
2880 
2881 
2882 #endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
2883 
2884 
2890 template <typename Number>
2891 inline DEAL_II_ALWAYS_INLINE bool
2892 operator==(const VectorizedArray<Number> &lhs,
2893  const VectorizedArray<Number> &rhs)
2894 {
2895  for (unsigned int i = 0; i < VectorizedArray<Number>::n_array_elements; ++i)
2896  if (lhs[i] != rhs[i])
2897  return false;
2898 
2899  return true;
2900 }
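As a small illustration (made-up values), the comparison only returns true if every lane agrees:

    VectorizedArray<double> a, b;
    a = 1.0;
    b = 1.0;
    bool equal = (a == b);   // true: every lane matches
    b[0] = 2.0;              // perturb a single lane
    equal = (a == b);        // false: one lane differs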
2901 
2902 
2908 template <typename Number>
2909 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2910  operator+(const VectorizedArray<Number> &u, const VectorizedArray<Number> &v)
2911 {
2912  VectorizedArray<Number> tmp = u;
2913  return tmp += v;
2914 }
2915 
2921 template <typename Number>
2922 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2923  operator-(const VectorizedArray<Number> &u, const VectorizedArray<Number> &v)
2924 {
2925  VectorizedArray<Number> tmp = u;
2926  return tmp -= v;
2927 }
2928 
2934 template <typename Number>
2935 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2936  operator*(const VectorizedArray<Number> &u, const VectorizedArray<Number> &v)
2937 {
2938  VectorizedArray<Number> tmp = u;
2939  return tmp *= v;
2940 }
2941 
2947 template <typename Number>
2948 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2949  operator/(const VectorizedArray<Number> &u, const VectorizedArray<Number> &v)
2950 {
2951  VectorizedArray<Number> tmp = u;
2952  return tmp /= v;
2953 }
2954 
2961 template <typename Number>
2962 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2963  operator+(const Number &u, const VectorizedArray<Number> &v)
2964 {
2965  VectorizedArray<Number> tmp;
2966  tmp = u;
2967  return tmp += v;
2968 }
2969 
2978 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
2979  operator+(const double &u, const VectorizedArray<float> &v)
2980 {
2981  VectorizedArray<float> tmp;
2982  tmp = u;
2983  return tmp += v;
2984 }
2985 
2992 template <typename Number>
2993 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
2994  operator+(const VectorizedArray<Number> &v, const Number &u)
2995 {
2996  return u + v;
2997 }
2998 
3007 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3008  operator+(const VectorizedArray<float> &v, const double &u)
3009 {
3010  return u + v;
3011 }
3012 
3019 template <typename Number>
3020 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3021  operator-(const Number &u, const VectorizedArray<Number> &v)
3022 {
3023  VectorizedArray<Number> tmp;
3024  tmp = u;
3025  return tmp -= v;
3026 }
3027 
3036 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3037  operator-(const double &u, const VectorizedArray<float> &v)
3038 {
3039  VectorizedArray<float> tmp;
3040  tmp = float(u);
3041  return tmp -= v;
3042 }
3043 
3050 template <typename Number>
3051 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3052  operator-(const VectorizedArray<Number> &v, const Number &u)
3053 {
3054  VectorizedArray<Number> tmp;
3055  tmp = u;
3056  return v - tmp;
3057 }
3058 
3067 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3068  operator-(const VectorizedArray<float> &v, const double &u)
3069 {
3070  VectorizedArray<float> tmp;
3071  tmp = float(u);
3072  return v - tmp;
3073 }
3074 
3081 template <typename Number>
3082 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3083  operator*(const Number &u, const VectorizedArray<Number> &v)
3084 {
3085  VectorizedArray<Number> tmp;
3086  tmp = u;
3087  return tmp *= v;
3088 }
3089 
3098 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3099  operator*(const double &u, const VectorizedArray<float> &v)
3100 {
3101  VectorizedArray<float> tmp;
3102  tmp = float(u);
3103  return tmp *= v;
3104 }
3105 
3112 template <typename Number>
3113 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3114  operator*(const VectorizedArray<Number> &v, const Number &u)
3115 {
3116  return u * v;
3117 }
3118 
3127 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3128  operator*(const VectorizedArray<float> &v, const double &u)
3129 {
3130  return u * v;
3131 }
3132 
3139 template <typename Number>
3140 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3141  operator/(const Number &u, const VectorizedArray<Number> &v)
3142 {
3143  VectorizedArray<Number> tmp;
3144  tmp = u;
3145  return tmp /= v;
3146 }
3147 
3156 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3157  operator/(const double &u, const VectorizedArray<float> &v)
3158 {
3159  VectorizedArray<float> tmp;
3160  tmp = float(u);
3161  return tmp /= v;
3162 }
3163 
3170 template <typename Number>
3171 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3172  operator/(const VectorizedArray<Number> &v, const Number &u)
3173 {
3174  VectorizedArray<Number> tmp;
3175  tmp = u;
3176  return v / tmp;
3177 }
3178 
3187 inline DEAL_II_ALWAYS_INLINE VectorizedArray<float>
3188  operator/(const VectorizedArray<float> &v, const double &u)
3189 {
3190  VectorizedArray<float> tmp;
3191  tmp = float(u);
3192  return v / tmp;
3193 }
3194 
3200 template <typename Number>
3201 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3202  operator+(const VectorizedArray<Number> &u)
3203 {
3204  return u;
3205 }
3206 
3212 template <typename Number>
3213 inline DEAL_II_ALWAYS_INLINE VectorizedArray<Number>
3214  operator-(const VectorizedArray<Number> &u)
3215 {
3216  // to get a negative sign, subtract the input from zero (could also
3217  // multiply by -1, but this one is slightly simpler)
3218  return VectorizedArray<Number>() - u;
3219 }
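A brief sketch of the scalar/vector operators defined above (made-up values; assumes the dealii namespace is in scope). The extra overloads taking a double scalar exist so that double literals can be combined with VectorizedArray<float> without an explicit cast:

    VectorizedArray<double> u;
    u = 3.0;
    VectorizedArray<double> w = 2.0 * u + 1.0;   // scalar * vector, vector + scalar
    w = -w / 4.0;                                // unary minus, division by a scalar

    VectorizedArray<float> f;
    f = 1.5f;
    VectorizedArray<float> g = 0.5 * f;          // double scalar converted to float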
3220 
3221 
3222 DEAL_II_NAMESPACE_CLOSE
3223 
3224 
3231 namespace std
3232 {
3240  template <typename Number>
3241  inline ::VectorizedArray<Number>
3242  sin(const ::VectorizedArray<Number> &x)
3243  {
3244  // put values in an array and later read in that array with an unaligned
3245  // read. This should save some instructions as compared to directly
3246  // setting the individual elements and also circumvents a compiler
3247  // optimization bug in gcc-4.6 with SSE2 (see also deal.II developers list
3248  // from April 2014, topic "matrix_free/step-48 Test").
3249  Number values[::VectorizedArray<Number>::n_array_elements];
3250  for (unsigned int i = 0;
3251  i < ::VectorizedArray<Number>::n_array_elements;
3252  ++i)
3253  values[i] = std::sin(x[i]);
3254  ::VectorizedArray<Number> out;
3255  out.load(&values[0]);
3256  return out;
3257  }
3258 
3259 
3260 
3268  template <typename Number>
3269  inline ::VectorizedArray<Number>
3270  cos(const ::VectorizedArray<Number> &x)
3271  {
3272  Number values[::VectorizedArray<Number>::n_array_elements];
3273  for (unsigned int i = 0;
3274  i < ::VectorizedArray<Number>::n_array_elements;
3275  ++i)
3276  values[i] = std::cos(x[i]);
3277  ::VectorizedArray<Number> out;
3278  out.load(&values[0]);
3279  return out;
3280  }
3281 
3282 
3283 
3291  template <typename Number>
3292  inline ::VectorizedArray<Number>
3293  tan(const ::VectorizedArray<Number> &x)
3294  {
3295  Number values[::VectorizedArray<Number>::n_array_elements];
3296  for (unsigned int i = 0;
3297  i < ::VectorizedArray<Number>::n_array_elements;
3298  ++i)
3299  values[i] = std::tan(x[i]);
3300  ::VectorizedArray<Number> out;
3301  out.load(&values[0]);
3302  return out;
3303  }
3304 
3305 
3306 
3314  template <typename Number>
3315  inline ::VectorizedArray<Number>
3316  exp(const ::VectorizedArray<Number> &x)
3317  {
3318  Number values[::VectorizedArray<Number>::n_array_elements];
3319  for (unsigned int i = 0;
3320  i < ::VectorizedArray<Number>::n_array_elements;
3321  ++i)
3322  values[i] = std::exp(x[i]);
3323  ::VectorizedArray<Number> out;
3324  out.load(&values[0]);
3325  return out;
3326  }
3327 
3328 
3329 
3337  template <typename Number>
3338  inline ::VectorizedArray<Number>
3339  log(const ::VectorizedArray<Number> &x)
3340  {
3341  Number values[::VectorizedArray<Number>::n_array_elements];
3342  for (unsigned int i = 0;
3343  i < ::VectorizedArray<Number>::n_array_elements;
3344  ++i)
3345  values[i] = std::log(x[i]);
3346  ::VectorizedArray<Number> out;
3347  out.load(&values[0]);
3348  return out;
3349  }
3350 
3351 
3352 
3360  template <typename Number>
3361  inline ::VectorizedArray<Number>
3362  sqrt(const ::VectorizedArray<Number> &x)
3363  {
3364  return x.get_sqrt();
3365  }
3366 
3367 
3368 
3376  template <typename Number>
3377  inline ::VectorizedArray<Number>
3378  pow(const ::VectorizedArray<Number> &x, const Number p)
3379  {
3380  Number values[::VectorizedArray<Number>::n_array_elements];
3381  for (unsigned int i = 0;
3382  i < ::VectorizedArray<Number>::n_array_elements;
3383  ++i)
3384  values[i] = std::pow(x[i], p);
3385  ::VectorizedArray<Number> out;
3386  out.load(&values[0]);
3387  return out;
3388  }
3389 
3390 
3391 
3399  template <typename Number>
3400  inline ::VectorizedArray<Number>
3401  abs(const ::VectorizedArray<Number> &x)
3402  {
3403  return x.get_abs();
3404  }
3405 
3406 
3407 
3415  template <typename Number>
3416  inline ::VectorizedArray<Number>
3417  max(const ::VectorizedArray<Number> &x,
3418  const ::VectorizedArray<Number> &y)
3419  {
3420  return x.get_max(y);
3421  }
3422 
3423 
3424 
3432  template <typename Number>
3433  inline ::VectorizedArray<Number>
3434  min(const ::VectorizedArray<Number> &x,
3435  const ::VectorizedArray<Number> &y)
3436  {
3437  return x.get_min(y);
3438  }
3439 
3440 } // namespace std
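A short sketch of these overloads in use (made-up values; outside the dealii namespace, so the type is written fully qualified): each call applies the scalar function lane by lane.

    dealii::VectorizedArray<double> x;
    x = 0.25;
    dealii::VectorizedArray<double> y = std::sin(x) + std::sqrt(x);  // per-lane sin and sqrt
    dealii::VectorizedArray<double> z = std::max(x, y);              // per-lane maximum
    dealii::VectorizedArray<double> p = std::pow(x, 2.0);            // per-lane power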
3441 
3442 #endif