17 #ifndef dealii_vectorization_h 18 #define dealii_vectorization_h 20 #include <deal.II/base/config.h> 22 #include <deal.II/base/exceptions.h> 23 #include <deal.II/base/template_constraints.h> 43 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && !defined(__AVX__) 45 "Mismatch in vectorization capabilities: AVX was detected during configuration of deal.II and switched on, but it is apparently not available for the file you are trying to compile at the moment. Check compilation flags controlling the instruction set, such as -march=native." 47 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && !defined(__AVX512F__) 49 "Mismatch in vectorization capabilities: AVX-512F was detected during configuration of deal.II and switched on, but it is apparently not available for the file you are trying to compile at the moment. Check compilation flags controlling the instruction set, such as -march=native." 52 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 // AVX, AVX-512 53 # include <immintrin.h> 54 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2 55 # include <emmintrin.h> 59 DEAL_II_NAMESPACE_OPEN
100 template <
typename Number>
158 template <
typename Number>
159 class VectorizedArray
165 static const unsigned int n_array_elements = 1;
173 DEAL_II_ALWAYS_INLINE
184 DEAL_II_ALWAYS_INLINE
195 DEAL_II_ALWAYS_INLINE
206 DEAL_II_ALWAYS_INLINE
217 DEAL_II_ALWAYS_INLINE
228 DEAL_II_ALWAYS_INLINE
239 DEAL_II_ALWAYS_INLINE
253 DEAL_II_ALWAYS_INLINE
266 DEAL_II_ALWAYS_INLINE
317 DEAL_II_ALWAYS_INLINE
336 DEAL_II_ALWAYS_INLINE
338 gather(
const Number *base_ptr,
const unsigned int *offsets)
340 data = base_ptr[offsets[0]];
355 DEAL_II_ALWAYS_INLINE
357 scatter(
const unsigned int *offsets, Number *base_ptr)
const 359 base_ptr[offsets[0]] = data;
373 DEAL_II_ALWAYS_INLINE
378 res.
data = std::sqrt(data);
386 DEAL_II_ALWAYS_INLINE
391 res.
data = std::fabs(data);
399 DEAL_II_ALWAYS_INLINE
404 res.
data = std::max(data, other.
data);
412 DEAL_II_ALWAYS_INLINE
417 res.
data = std::min(data, other.
data);
424 template <
typename Number2>
427 template <
typename Number2>
430 template <
typename Number2>
433 template <
typename Number2>
// Generic vectorized_load_and_transpose: reads n_entries values from each of
// the n_array_elements rows starting at in + offsets[v] and transposes them
// into an array of VectorizedArray (array-of-struct -> struct-of-array).
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
446 template <
typename Number>
482 template <
typename Number>
486 const unsigned int * offsets,
// Plain scalar double loop — the generic fallback has no SIMD shortcut.
489 for (
unsigned int i = 0; i < n_entries; ++i)
490 for (
unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements; ++v)
491 out[i][v] = in[offsets[v] + i];
// Generic vectorized_transpose_and_store: inverse of the load/transpose above.
// Two branches: when add_into is true the values are accumulated (+=) into
// out, otherwise they are assigned (=). The `if (add_into)` itself was lost
// in extraction but both loop bodies below show the two cases.
534 template <
typename Number>
537 const unsigned int n_entries,
539 const unsigned int * offsets,
// add_into == true: accumulate.
543 for (
unsigned int i = 0; i < n_entries; ++i)
544 for (
unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements;
546 out[offsets[v] + i] += in[i][v];
// add_into == false: overwrite.
548 for (
unsigned int i = 0; i < n_entries; ++i)
549 for (
unsigned int v = 0; v < VectorizedArray<Number>::n_array_elements;
551 out[offsets[v] + i] = in[i][v];
// AVX-512 specialization of VectorizedArray for double: 8 lanes in one
// __m512d. Compiled only when vectorization level >= 3 and __AVX512F__ is
// defined. NOTE(review): garbled extraction; embedded numbers are the
// original file's line numbers, preserved verbatim.
559 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && defined(__AVX512F__) 565 class VectorizedArray<double>
571 static const unsigned int n_array_elements = 8;
576 DEAL_II_ALWAYS_INLINE
// Broadcast-assign a scalar to all 8 lanes.
578 operator=(
const double x)
580 data = _mm512_set1_pd(x);
587 DEAL_II_ALWAYS_INLINE
// Lane access by reinterpreting the register as an array of doubles.
588 double &operator[](
const unsigned int comp)
591 return *(
reinterpret_cast<double *
>(&data) + comp);
597 DEAL_II_ALWAYS_INLINE
598 const double &operator[](
const unsigned int comp)
const 601 return *(
reinterpret_cast<const double *
>(&data) + comp);
607 DEAL_II_ALWAYS_INLINE
// Compound arithmetic: intrinsic path shown; the non-
// DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS branch was lost in extraction.
609 operator+=(
const VectorizedArray &vec)
616 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 619 data = _mm512_add_pd(data, vec.
data);
627 DEAL_II_ALWAYS_INLINE
629 operator-=(
const VectorizedArray &vec)
631 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 634 data = _mm512_sub_pd(data, vec.
data);
641 DEAL_II_ALWAYS_INLINE
643 operator*=(
const VectorizedArray &vec)
645 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 648 data = _mm512_mul_pd(data, vec.
data);
656 DEAL_II_ALWAYS_INLINE
658 operator/=(
const VectorizedArray &vec)
660 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 663 data = _mm512_div_pd(data, vec.
data);
673 DEAL_II_ALWAYS_INLINE
// Unaligned load/store of all 8 lanes.
675 load(
const double *ptr)
677 data = _mm512_loadu_pd(ptr);
686 DEAL_II_ALWAYS_INLINE
688 store(
double *ptr)
const 690 _mm512_storeu_pd(ptr, data);
696 DEAL_II_ALWAYS_INLINE
// Non-temporal (cache-bypassing) store; requires 64-byte alignment,
// checked by the Assert.
698 streaming_store(
double *ptr)
const 700 Assert(reinterpret_cast<std::size_t>(ptr) % 64 == 0,
702 _mm512_stream_pd(ptr, data);
717 DEAL_II_ALWAYS_INLINE
// gather: load 8 doubles from base_ptr at 32-bit offsets, scale 8 bytes.
// The offsets are reinterpreted as a __m256i index vector via a float load.
719 gather(
const double *base_ptr,
const unsigned int *offsets)
724 const __m256 index_val = _mm256_loadu_ps((
const float *)offsets);
725 const __m256i index = *((__m256i *)(&index_val));
726 data = _mm512_i32gather_pd(index, base_ptr, 8);
741 DEAL_II_ALWAYS_INLINE
// scatter: debug-mode check that all offsets are pairwise distinct
// (duplicate targets make the scatter result undefined), then a hardware
// i32scatter with 8-byte scale.
743 scatter(
const unsigned int *offsets,
double *base_ptr)
const 745 for (
unsigned int i = 0; i < 8; ++i)
746 for (
unsigned int j = i + 1; j < 8; ++j)
747 Assert(offsets[i] != offsets[j],
748 ExcMessage(
"Result of scatter undefined if two offset elements" 749 " point to the same position"));
754 const __m256 index_val = _mm256_loadu_ps((
const float *)offsets);
755 const __m256i index = *((__m256i *)(&index_val));
756 _mm512_i32scatter_pd(base_ptr, index, data, 8);
770 DEAL_II_ALWAYS_INLINE
770 DEAL_II_ALWAYS_INLINE // (review note: sqrt helper follows)
775 res.
data = _mm512_sqrt_pd(data);
783 DEAL_II_ALWAYS_INLINE
// abs: clear the sign bit of each lane by and-not with a -0.0 mask
// (done through the integer domain since AVX-512F lacks _mm512_andnot_pd).
792 __m512d mask = _mm512_set1_pd(-0.);
794 res.
data = (__m512d)_mm512_andnot_epi64((__m512i)mask, (__m512i)data);
802 DEAL_II_ALWAYS_INLINE
804 get_max(
const VectorizedArray &other)
const 807 res.
data = _mm512_max_pd(data, other.
data);
815 DEAL_II_ALWAYS_INLINE
817 get_min(
const VectorizedArray &other)
const 820 res.
data = _mm512_min_pd(data, other.
data);
// Friend template declarations (bodies lost in extraction).
827 template <
typename Number2>
830 template <
typename Number2>
833 template <
typename Number2>
836 template <
typename Number2>
// AVX-512 double specialization of vectorized_load_and_transpose: transposes
// 4x4 double blocks with 256-bit permutes/unpacks, processing the 8 lanes as
// two halves (outer = 0 and outer = 4), then handles the remainder scalar-wise.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
848 vectorized_load_and_transpose(
const unsigned int n_entries,
850 const unsigned int * offsets,
853 const unsigned int n_chunks = n_entries / 4;
// Two passes over the 8 lanes, 4 source rows per pass.
854 for (
unsigned int outer = 0; outer < 8; outer += 4)
856 const double *in0 = in + offsets[0 + outer];
857 const double *in1 = in + offsets[1 + outer];
858 const double *in2 = in + offsets[2 + outer];
859 const double *in3 = in + offsets[3 + outer];
861 for (
unsigned int i = 0; i < n_chunks; ++i)
863 __m256d u0 = _mm256_loadu_pd(in0 + 4 * i);
864 __m256d u1 = _mm256_loadu_pd(in1 + 4 * i);
865 __m256d u2 = _mm256_loadu_pd(in2 + 4 * i);
866 __m256d u3 = _mm256_loadu_pd(in3 + 4 * i);
// 4x4 transpose: cross-lane 128-bit permutes, then 64-bit unpacks.
867 __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
868 __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
869 __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
870 __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
// Store each transposed row into the matching 256-bit half of the
// 512-bit output register (offset by `outer` lanes).
871 *(__m256d *)((
double *)(&out[4 * i + 0].
data) + outer) =
872 _mm256_unpacklo_pd(t0, t1);
873 *(__m256d *)((
double *)(&out[4 * i + 1].
data) + outer) =
874 _mm256_unpackhi_pd(t0, t1);
875 *(__m256d *)((
double *)(&out[4 * i + 2].
data) + outer) =
876 _mm256_unpacklo_pd(t2, t3);
877 *(__m256d *)((
double *)(&out[4 * i + 3].
data) + outer) =
878 _mm256_unpackhi_pd(t2, t3);
// Scalar remainder for n_entries not divisible by 4.
880 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
881 for (
unsigned int v = 0; v < 4; ++v)
882 out[i][outer + v] = in[offsets[v + outer] + i];
// AVX-512 double specialization of vectorized_transpose_and_store: inverse of
// the load/transpose above. Transposes 4x4 blocks per 256-bit half and either
// accumulates into (add_into == true) or overwrites (add_into == false) the
// output rows; scalar remainder loops handle n_entries % 4 != 0.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
893 vectorized_transpose_and_store(
const bool add_into,
894 const unsigned int n_entries,
896 const unsigned int * offsets,
899 const unsigned int n_chunks = n_entries / 4;
903 for (
unsigned int outer = 0; outer < 8; outer += 4)
905 double *out0 = out + offsets[0 + outer];
906 double *out1 = out + offsets[1 + outer];
907 double *out2 = out + offsets[2 + outer];
908 double *out3 = out + offsets[3 + outer];
909 for (
unsigned int i = 0; i < n_chunks; ++i)
// Read the 256-bit half of each input register at lane offset `outer`
// (the u0..u3 declarations were lost in extraction).
912 *(
const __m256d *)((
const double *)(&in[4 * i + 0].
data) + outer);
914 *(
const __m256d *)((
const double *)(&in[4 * i + 1].
data) + outer);
916 *(
const __m256d *)((
const double *)(&in[4 * i + 2].
data) + outer);
918 *(
const __m256d *)((
const double *)(&in[4 * i + 3].
data) + outer);
// 4x4 transpose, mirroring the load path.
919 __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
920 __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
921 __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
922 __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
923 __m256d res0 = _mm256_unpacklo_pd(t0, t1);
924 __m256d res1 = _mm256_unpackhi_pd(t0, t1);
925 __m256d res2 = _mm256_unpacklo_pd(t2, t3);
926 __m256d res3 = _mm256_unpackhi_pd(t2, t3);
// add_into branch: load, add, store.
933 res0 = _mm256_add_pd(_mm256_loadu_pd(out0 + 4 * i), res0);
934 _mm256_storeu_pd(out0 + 4 * i, res0);
935 res1 = _mm256_add_pd(_mm256_loadu_pd(out1 + 4 * i), res1);
936 _mm256_storeu_pd(out1 + 4 * i, res1);
937 res2 = _mm256_add_pd(_mm256_loadu_pd(out2 + 4 * i), res2);
938 _mm256_storeu_pd(out2 + 4 * i, res2);
939 res3 = _mm256_add_pd(_mm256_loadu_pd(out3 + 4 * i), res3);
940 _mm256_storeu_pd(out3 + 4 * i, res3);
// overwrite branch: plain stores.
944 _mm256_storeu_pd(out0 + 4 * i, res0);
945 _mm256_storeu_pd(out1 + 4 * i, res1);
946 _mm256_storeu_pd(out2 + 4 * i, res2);
947 _mm256_storeu_pd(out3 + 4 * i, res3);
// Scalar remainder, accumulate variant.
951 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
952 for (
unsigned int v = 0; v < 4; ++v)
953 out[offsets[v + outer] + i] += in[i][v + outer];
// Scalar remainder, overwrite variant.
955 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
956 for (
unsigned int v = 0; v < 4; ++v)
957 out[offsets[v + outer] + i] = in[i][v + outer];
// AVX-512 specialization of VectorizedArray for float: 16 lanes in one
// __m512. Mirrors the double specialization lane-for-lane with _ps intrinsics.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
967 class VectorizedArray<float>
973 static const unsigned int n_array_elements = 16;
978 DEAL_II_ALWAYS_INLINE
// Broadcast-assign a scalar to all 16 lanes.
980 operator=(
const float x)
982 data = _mm512_set1_ps(x);
989 DEAL_II_ALWAYS_INLINE
990 float &operator[](
const unsigned int comp)
993 return *(
reinterpret_cast<float *
>(&data) + comp);
999 DEAL_II_ALWAYS_INLINE
1000 const float &operator[](
const unsigned int comp)
const 1003 return *(
reinterpret_cast<const float *
>(&data) + comp);
1009 DEAL_II_ALWAYS_INLINE
// Compound arithmetic (intrinsic path; alternate branch lost in extraction).
1011 operator+=(
const VectorizedArray &vec)
1018 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1021 data = _mm512_add_ps(data, vec.
data);
1029 DEAL_II_ALWAYS_INLINE
1031 operator-=(
const VectorizedArray &vec)
1033 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1036 data = _mm512_sub_ps(data, vec.
data);
1043 DEAL_II_ALWAYS_INLINE
1045 operator*=(
const VectorizedArray &vec)
1047 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1050 data = _mm512_mul_ps(data, vec.
data);
1058 DEAL_II_ALWAYS_INLINE
1060 operator/=(
const VectorizedArray &vec)
1062 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1065 data = _mm512_div_ps(data, vec.
data);
1075 DEAL_II_ALWAYS_INLINE
1077 load(
const float *ptr)
1079 data = _mm512_loadu_ps(ptr);
1088 DEAL_II_ALWAYS_INLINE
1090 store(
float *ptr)
const 1092 _mm512_storeu_ps(ptr, data);
1098 DEAL_II_ALWAYS_INLINE
// Non-temporal store; 64-byte alignment required (Assert checks it).
1100 streaming_store(
float *ptr)
const 1102 Assert(reinterpret_cast<std::size_t>(ptr) % 64 == 0,
1104 _mm512_stream_ps(ptr, data);
1119 DEAL_II_ALWAYS_INLINE
// gather: 16 floats from 32-bit offsets with 4-byte scale.
1121 gather(
const float *base_ptr,
const unsigned int *offsets)
1126 const __m512 index_val = _mm512_loadu_ps((
const float *)offsets);
1127 const __m512i index = *((__m512i *)(&index_val));
1128 data = _mm512_i32gather_ps(index, base_ptr, 4);
1143 DEAL_II_ALWAYS_INLINE
// scatter: debug assert that offsets are pairwise distinct, then hardware
// i32scatter with 4-byte scale.
1145 scatter(
const unsigned int *offsets,
float *base_ptr)
const 1147 for (
unsigned int i = 0; i < 16; ++i)
1148 for (
unsigned int j = i + 1; j < 16; ++j)
1149 Assert(offsets[i] != offsets[j],
1150 ExcMessage(
"Result of scatter undefined if two offset elements" 1151 " point to the same position"));
1156 const __m512 index_val = _mm512_loadu_ps((
const float *)offsets);
1157 const __m512i index = *((__m512i *)(&index_val));
1158 _mm512_i32scatter_ps(base_ptr, index, data, 4);
1172 DEAL_II_ALWAYS_INLINE
1176 VectorizedArray res;
1177 res.
data = _mm512_sqrt_ps(data);
1185 DEAL_II_ALWAYS_INLINE
// abs via and-not with -0.0f mask in the integer domain (clears sign bits).
1194 __m512 mask = _mm512_set1_ps(-0.f);
1195 VectorizedArray res;
1196 res.
data = (__m512)_mm512_andnot_epi32((__m512i)mask, (__m512i)data);
1204 DEAL_II_ALWAYS_INLINE
1206 get_max(
const VectorizedArray &other)
const 1208 VectorizedArray res;
1209 res.
data = _mm512_max_ps(data, other.
data);
1217 DEAL_II_ALWAYS_INLINE
1219 get_min(
const VectorizedArray &other)
const 1221 VectorizedArray res;
1222 res.
data = _mm512_min_ps(data, other.
data);
// Friend template declarations (bodies lost in extraction).
1229 template <
typename Number2>
1232 template <
typename Number2>
1235 template <
typename Number2>
1238 template <
typename Number2>
// AVX-512 float specialization of vectorized_load_and_transpose: builds 8x4
// transposes from 128-bit loads assembled into 256-bit registers, processing
// the 16 lanes as two halves (outer = 0 and outer = 8); scalar remainder loop
// covers n_entries % 4 != 0.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1250 vectorized_load_and_transpose(
const unsigned int n_entries,
1252 const unsigned int * offsets,
1255 const unsigned int n_chunks = n_entries / 4;
1256 for (
unsigned int outer = 0; outer < 16; outer += 8)
1258 for (
unsigned int i = 0; i < n_chunks; ++i)
// 8 source rows, 4 floats each.
1260 __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0 + outer]);
1261 __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1 + outer]);
1262 __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2 + outer]);
1263 __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3 + outer]);
1264 __m128 u4 = _mm_loadu_ps(in + 4 * i + offsets[4 + outer]);
1265 __m128 u5 = _mm_loadu_ps(in + 4 * i + offsets[5 + outer]);
1266 __m128 u6 = _mm_loadu_ps(in + 4 * i + offsets[6 + outer]);
1267 __m128 u7 = _mm_loadu_ps(in + 4 * i + offsets[7 + outer]);
// Pair the 128-bit rows into 256-bit registers (t3 first zeroed so the
// insertf128 sequence has a defined starting value).
1270 __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
1271 t0 = _mm256_insertf128_ps(t3, u0, 0);
1272 t0 = _mm256_insertf128_ps(t0, u4, 1);
1273 t1 = _mm256_insertf128_ps(t3, u1, 0);
1274 t1 = _mm256_insertf128_ps(t1, u5, 1);
1275 t2 = _mm256_insertf128_ps(t3, u2, 0);
1276 t2 = _mm256_insertf128_ps(t2, u6, 1);
1277 t3 = _mm256_insertf128_ps(t3, u3, 0);
1278 t3 = _mm256_insertf128_ps(t3, u7, 1);
// Shuffle network completing the transpose.
1279 __m256 v0 = _mm256_shuffle_ps(t0, t1, 0x44);
1280 __m256 v1 = _mm256_shuffle_ps(t0, t1, 0xee);
1281 __m256 v2 = _mm256_shuffle_ps(t2, t3, 0x44);
1282 __m256 v3 = _mm256_shuffle_ps(t2, t3, 0xee);
// Write into the matching 256-bit half of the 512-bit output register.
1283 *(__m256 *)((
float *)(&out[4 * i + 0].
data) + outer) =
1284 _mm256_shuffle_ps(v0, v2, 0x88);
1285 *(__m256 *)((
float *)(&out[4 * i + 1].
data) + outer) =
1286 _mm256_shuffle_ps(v0, v2, 0xdd);
1287 *(__m256 *)((
float *)(&out[4 * i + 2].
data) + outer) =
1288 _mm256_shuffle_ps(v1, v3, 0x88);
1289 *(__m256 *)((
float *)(&out[4 * i + 3].
data) + outer) =
1290 _mm256_shuffle_ps(v1, v3, 0xdd);
// Scalar remainder.
1292 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1293 for (
unsigned int v = 0; v < 8; ++v)
1294 out[i][v + outer] = in[offsets[v + outer] + i];
// AVX-512 float specialization of vectorized_transpose_and_store: inverse of
// the load/transpose above; transposes via shuffles, extracts 128-bit rows,
// then either accumulates (add_into == true) or overwrites the output.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1305 vectorized_transpose_and_store(
const bool add_into,
1306 const unsigned int n_entries,
1308 const unsigned int * offsets,
1311 const unsigned int n_chunks = n_entries / 4;
1312 for (
unsigned int outer = 0; outer < 16; outer += 8)
1314 for (
unsigned int i = 0; i < n_chunks; ++i)
// Read the 256-bit half of each input register at lane offset `outer`
// (u0..u3 declarations lost in extraction).
1317 *(
const __m256 *)((
const float *)(&in[4 * i + 0].
data) + outer);
1319 *(
const __m256 *)((
const float *)(&in[4 * i + 1].
data) + outer);
1321 *(
const __m256 *)((
const float *)(&in[4 * i + 2].
data) + outer);
1323 *(
const __m256 *)((
const float *)(&in[4 * i + 3].
data) + outer);
// Transpose shuffle network, mirroring the load path.
1324 __m256 t0 = _mm256_shuffle_ps(u0, u1, 0x44);
1325 __m256 t1 = _mm256_shuffle_ps(u0, u1, 0xee);
1326 __m256 t2 = _mm256_shuffle_ps(u2, u3, 0x44);
1327 __m256 t3 = _mm256_shuffle_ps(u2, u3, 0xee);
1328 u0 = _mm256_shuffle_ps(t0, t2, 0x88);
1329 u1 = _mm256_shuffle_ps(t0, t2, 0xdd);
1330 u2 = _mm256_shuffle_ps(t1, t3, 0x88);
1331 u3 = _mm256_shuffle_ps(t1, t3, 0xdd);
// Split each 256-bit register back into two 128-bit rows.
1332 __m128 res0 = _mm256_extractf128_ps(u0, 0);
1333 __m128 res4 = _mm256_extractf128_ps(u0, 1);
1334 __m128 res1 = _mm256_extractf128_ps(u1, 0);
1335 __m128 res5 = _mm256_extractf128_ps(u1, 1);
1336 __m128 res2 = _mm256_extractf128_ps(u2, 0);
1337 __m128 res6 = _mm256_extractf128_ps(u2, 1);
1338 __m128 res3 = _mm256_extractf128_ps(u3, 0);
1339 __m128 res7 = _mm256_extractf128_ps(u3, 1);
// add_into branch: load, add, store per row.
1346 res0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0 + outer]),
1348 _mm_storeu_ps(out + 4 * i + offsets[0 + outer], res0);
1349 res1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1 + outer]),
1351 _mm_storeu_ps(out + 4 * i + offsets[1 + outer], res1);
1352 res2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2 + outer]),
1354 _mm_storeu_ps(out + 4 * i + offsets[2 + outer], res2);
1355 res3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3 + outer]),
1357 _mm_storeu_ps(out + 4 * i + offsets[3 + outer], res3);
1358 res4 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[4 + outer]),
1360 _mm_storeu_ps(out + 4 * i + offsets[4 + outer], res4);
1361 res5 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[5 + outer]),
1363 _mm_storeu_ps(out + 4 * i + offsets[5 + outer], res5);
1364 res6 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[6 + outer]),
1366 _mm_storeu_ps(out + 4 * i + offsets[6 + outer], res6);
1367 res7 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[7 + outer]),
1369 _mm_storeu_ps(out + 4 * i + offsets[7 + outer], res7);
// overwrite branch: plain stores.
1373 _mm_storeu_ps(out + 4 * i + offsets[0 + outer], res0);
1374 _mm_storeu_ps(out + 4 * i + offsets[1 + outer], res1);
1375 _mm_storeu_ps(out + 4 * i + offsets[2 + outer], res2);
1376 _mm_storeu_ps(out + 4 * i + offsets[3 + outer], res3);
1377 _mm_storeu_ps(out + 4 * i + offsets[4 + outer], res4);
1378 _mm_storeu_ps(out + 4 * i + offsets[5 + outer], res5);
1379 _mm_storeu_ps(out + 4 * i + offsets[6 + outer], res6);
1380 _mm_storeu_ps(out + 4 * i + offsets[7 + outer], res7);
// Scalar remainder, accumulate variant.
1384 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1385 for (
unsigned int v = 0; v < 8; ++v)
1386 out[offsets[v + outer] + i] += in[i][v + outer];
// Scalar remainder, overwrite variant.
1388 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1389 for (
unsigned int v = 0; v < 8; ++v)
1390 out[offsets[v + outer] + i] = in[i][v + outer];
// AVX specialization of VectorizedArray for double: 4 lanes in one __m256d.
// Compiled when vectorization level >= 2 and __AVX__ is defined (but not the
// AVX-512 branch above).
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1396 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && defined(__AVX__) 1402 class VectorizedArray<double>
1408 static const unsigned int n_array_elements = 4;
1413 DEAL_II_ALWAYS_INLINE
// Broadcast-assign a scalar to all 4 lanes.
1415 operator=(
const double x)
1417 data = _mm256_set1_pd(x);
1424 DEAL_II_ALWAYS_INLINE
1425 double &operator[](
const unsigned int comp)
1428 return *(
reinterpret_cast<double *
>(&data) + comp);
1434 DEAL_II_ALWAYS_INLINE
1435 const double &operator[](
const unsigned int comp)
const 1438 return *(
reinterpret_cast<const double *
>(&data) + comp);
1444 DEAL_II_ALWAYS_INLINE
// Compound arithmetic (intrinsic path; alternate branch lost in extraction).
1446 operator+=(
const VectorizedArray &vec)
1453 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1456 data = _mm256_add_pd(data, vec.
data);
1464 DEAL_II_ALWAYS_INLINE
1466 operator-=(
const VectorizedArray &vec)
1468 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1471 data = _mm256_sub_pd(data, vec.
data);
1478 DEAL_II_ALWAYS_INLINE
1480 operator*=(
const VectorizedArray &vec)
1482 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1485 data = _mm256_mul_pd(data, vec.
data);
1493 DEAL_II_ALWAYS_INLINE
1495 operator/=(
const VectorizedArray &vec)
1497 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1500 data = _mm256_div_pd(data, vec.
data);
1510 DEAL_II_ALWAYS_INLINE
1512 load(
const double *ptr)
1514 data = _mm256_loadu_pd(ptr);
1523 DEAL_II_ALWAYS_INLINE
1525 store(
double *ptr)
const 1527 _mm256_storeu_pd(ptr, data);
1533 DEAL_II_ALWAYS_INLINE
// Non-temporal store; 32-byte alignment required for __m256d (Assert checks).
1535 streaming_store(
double *ptr)
const 1537 Assert(reinterpret_cast<std::size_t>(ptr) % 32 == 0,
1539 _mm256_stream_pd(ptr, data);
1554 DEAL_II_ALWAYS_INLINE
// gather: hardware _mm256_i32gather_pd path (AVX2), with a scalar
// element-by-element fallback loop below for plain AVX.
1556 gather(
const double *base_ptr,
const unsigned int *offsets)
1562 const __m128 index_val = _mm_loadu_ps((
const float *)offsets);
1563 const __m128i index = *((__m128i *)(&index_val));
1564 data = _mm256_i32gather_pd(base_ptr, index, 8);
1566 for (
unsigned int i = 0; i < 4; ++i)
1567 *(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
1583 DEAL_II_ALWAYS_INLINE
// scatter: AVX has no scatter instruction — scalar loop only.
1585 scatter(
const unsigned int *offsets,
double *base_ptr)
const 1588 for (
unsigned int i = 0; i < 4; ++i)
1589 base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
1603 DEAL_II_ALWAYS_INLINE
1607 VectorizedArray res;
1608 res.
data = _mm256_sqrt_pd(data);
1616 DEAL_II_ALWAYS_INLINE
// abs via andnot with -0.0 mask: clears each lane's sign bit.
1623 __m256d mask = _mm256_set1_pd(-0.);
1624 VectorizedArray res;
1625 res.
data = _mm256_andnot_pd(mask, data);
1633 DEAL_II_ALWAYS_INLINE
1635 get_max(
const VectorizedArray &other)
const 1637 VectorizedArray res;
1638 res.
data = _mm256_max_pd(data, other.
data);
1646 DEAL_II_ALWAYS_INLINE
1648 get_min(
const VectorizedArray &other)
const 1650 VectorizedArray res;
1651 res.
data = _mm256_min_pd(data, other.
data);
// Friend template declarations (bodies lost in extraction).
1658 template <
typename Number2>
1661 template <
typename Number2>
1664 template <
typename Number2>
1667 template <
typename Number2>
// AVX double specialization of vectorized_load_and_transpose: 4x4 transpose
// with 128-bit permutes and 64-bit unpacks, writing whole __m256d output
// registers; scalar remainder for n_entries % 4 != 0.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1679 vectorized_load_and_transpose(
const unsigned int n_entries,
1681 const unsigned int * offsets,
1684 const unsigned int n_chunks = n_entries / 4;
1685 const double * in0 = in + offsets[0];
1686 const double * in1 = in + offsets[1];
1687 const double * in2 = in + offsets[2];
1688 const double * in3 = in + offsets[3];
1690 for (
unsigned int i = 0; i < n_chunks; ++i)
1692 __m256d u0 = _mm256_loadu_pd(in0 + 4 * i);
1693 __m256d u1 = _mm256_loadu_pd(in1 + 4 * i);
1694 __m256d u2 = _mm256_loadu_pd(in2 + 4 * i);
1695 __m256d u3 = _mm256_loadu_pd(in3 + 4 * i);
// 4x4 transpose: cross-lane permutes then unpacks.
1696 __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
1697 __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
1698 __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
1699 __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
1700 out[4 * i + 0].
data = _mm256_unpacklo_pd(t0, t1);
1701 out[4 * i + 1].
data = _mm256_unpackhi_pd(t0, t1);
1702 out[4 * i + 2].
data = _mm256_unpacklo_pd(t2, t3);
1703 out[4 * i + 3].
data = _mm256_unpackhi_pd(t2, t3);
// Scalar remainder.
1705 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1706 for (
unsigned int v = 0; v < 4; ++v)
1707 out[i][v] = in[offsets[v] + i];
// AVX double specialization of vectorized_transpose_and_store: inverse of the
// load/transpose above. Accumulates (add_into == true) or overwrites the four
// output rows; scalar remainder loops handle n_entries % 4 != 0.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1717 vectorized_transpose_and_store(
const bool add_into,
1718 const unsigned int n_entries,
1720 const unsigned int * offsets,
1723 const unsigned int n_chunks = n_entries / 4;
1724 double * out0 = out + offsets[0];
1725 double * out1 = out + offsets[1];
1726 double * out2 = out + offsets[2];
1727 double * out3 = out + offsets[3];
1728 for (
unsigned int i = 0; i < n_chunks; ++i)
1730 __m256d u0 = in[4 * i + 0].
data;
1731 __m256d u1 = in[4 * i + 1].
data;
1732 __m256d u2 = in[4 * i + 2].
data;
1733 __m256d u3 = in[4 * i + 3].
data;
// 4x4 transpose, mirroring the load path.
1734 __m256d t0 = _mm256_permute2f128_pd(u0, u2, 0x20);
1735 __m256d t1 = _mm256_permute2f128_pd(u1, u3, 0x20);
1736 __m256d t2 = _mm256_permute2f128_pd(u0, u2, 0x31);
1737 __m256d t3 = _mm256_permute2f128_pd(u1, u3, 0x31);
1738 __m256d res0 = _mm256_unpacklo_pd(t0, t1);
1739 __m256d res1 = _mm256_unpackhi_pd(t0, t1);
1740 __m256d res2 = _mm256_unpacklo_pd(t2, t3);
1741 __m256d res3 = _mm256_unpackhi_pd(t2, t3);
// add_into branch: load, add, store.
1748 res0 = _mm256_add_pd(_mm256_loadu_pd(out0 + 4 * i), res0);
1749 _mm256_storeu_pd(out0 + 4 * i, res0);
1750 res1 = _mm256_add_pd(_mm256_loadu_pd(out1 + 4 * i), res1);
1751 _mm256_storeu_pd(out1 + 4 * i, res1);
1752 res2 = _mm256_add_pd(_mm256_loadu_pd(out2 + 4 * i), res2);
1753 _mm256_storeu_pd(out2 + 4 * i, res2);
1754 res3 = _mm256_add_pd(_mm256_loadu_pd(out3 + 4 * i), res3);
1755 _mm256_storeu_pd(out3 + 4 * i, res3);
// overwrite branch: plain stores.
1759 _mm256_storeu_pd(out0 + 4 * i, res0);
1760 _mm256_storeu_pd(out1 + 4 * i, res1);
1761 _mm256_storeu_pd(out2 + 4 * i, res2);
1762 _mm256_storeu_pd(out3 + 4 * i, res3);
// Scalar remainder, accumulate variant.
1766 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1767 for (
unsigned int v = 0; v < 4; ++v)
1768 out[offsets[v] + i] += in[i][v];
// Scalar remainder, overwrite variant.
1770 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
1771 for (
unsigned int v = 0; v < 4; ++v)
1772 out[offsets[v] + i] = in[i][v];
// AVX specialization of VectorizedArray for float: 8 lanes in one __m256.
// Mirrors the AVX double specialization with _ps intrinsics.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
1781 class VectorizedArray<float>
1787 static const unsigned int n_array_elements = 8;
1792 DEAL_II_ALWAYS_INLINE
// Broadcast-assign a scalar to all 8 lanes.
1794 operator=(
const float x)
1796 data = _mm256_set1_ps(x);
1803 DEAL_II_ALWAYS_INLINE
1804 float &operator[](
const unsigned int comp)
1807 return *(
reinterpret_cast<float *
>(&data) + comp);
1813 DEAL_II_ALWAYS_INLINE
1814 const float &operator[](
const unsigned int comp)
const 1817 return *(
reinterpret_cast<const float *
>(&data) + comp);
1823 DEAL_II_ALWAYS_INLINE
// Compound arithmetic (intrinsic path; alternate branch lost in extraction).
1825 operator+=(
const VectorizedArray &vec)
1832 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1835 data = _mm256_add_ps(data, vec.
data);
1843 DEAL_II_ALWAYS_INLINE
1845 operator-=(
const VectorizedArray &vec)
1847 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1850 data = _mm256_sub_ps(data, vec.
data);
1857 DEAL_II_ALWAYS_INLINE
1859 operator*=(
const VectorizedArray &vec)
1861 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1864 data = _mm256_mul_ps(data, vec.
data);
1872 DEAL_II_ALWAYS_INLINE
1874 operator/=(
const VectorizedArray &vec)
1876 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 1879 data = _mm256_div_ps(data, vec.
data);
1889 DEAL_II_ALWAYS_INLINE
1891 load(
const float *ptr)
1893 data = _mm256_loadu_ps(ptr);
1902 DEAL_II_ALWAYS_INLINE
1904 store(
float *ptr)
const 1906 _mm256_storeu_ps(ptr, data);
1912 DEAL_II_ALWAYS_INLINE
// Non-temporal store; 32-byte alignment required (Assert checks it).
1914 streaming_store(
float *ptr)
const 1916 Assert(reinterpret_cast<std::size_t>(ptr) % 32 == 0,
1918 _mm256_stream_ps(ptr, data);
1933 DEAL_II_ALWAYS_INLINE
// gather: hardware _mm256_i32gather_ps path (AVX2), with a scalar
// fallback loop below for plain AVX.
1935 gather(
const float *base_ptr,
const unsigned int *offsets)
1941 const __m256 index_val = _mm256_loadu_ps((
const float *)offsets);
1942 const __m256i index = *((__m256i *)(&index_val));
1943 data = _mm256_i32gather_ps(base_ptr, index, 4);
1945 for (
unsigned int i = 0; i < 8; ++i)
1946 *(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
1962 DEAL_II_ALWAYS_INLINE
// scatter: no AVX scatter instruction — scalar loop only.
1964 scatter(
const unsigned int *offsets,
float *base_ptr)
const 1967 for (
unsigned int i = 0; i < 8; ++i)
1968 base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
1982 DEAL_II_ALWAYS_INLINE
1986 VectorizedArray res;
1987 res.
data = _mm256_sqrt_ps(data);
1995 DEAL_II_ALWAYS_INLINE
// abs via andnot with -0.0f mask: clears each lane's sign bit.
2002 __m256 mask = _mm256_set1_ps(-0.f);
2003 VectorizedArray res;
2004 res.
data = _mm256_andnot_ps(mask, data);
2012 DEAL_II_ALWAYS_INLINE
2014 get_max(
const VectorizedArray &other)
const 2016 VectorizedArray res;
2017 res.
data = _mm256_max_ps(data, other.
data);
2025 DEAL_II_ALWAYS_INLINE
2027 get_min(
const VectorizedArray &other)
const 2029 VectorizedArray res;
2030 res.
data = _mm256_min_ps(data, other.
data);
// Friend template declarations (bodies lost in extraction).
2037 template <
typename Number2>
2040 template <
typename Number2>
2043 template <
typename Number2>
2046 template <
typename Number2>
// AVX float specialization of vectorized_load_and_transpose: 8x4 transpose
// assembled from 128-bit loads into 256-bit registers plus a shuffle network;
// scalar remainder for n_entries % 4 != 0.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
2058 vectorized_load_and_transpose(
const unsigned int n_entries,
2060 const unsigned int * offsets,
2063 const unsigned int n_chunks = n_entries / 4;
2064 for (
unsigned int i = 0; i < n_chunks; ++i)
// 8 source rows, 4 floats each.
2066 __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0]);
2067 __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1]);
2068 __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2]);
2069 __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3]);
2070 __m128 u4 = _mm_loadu_ps(in + 4 * i + offsets[4]);
2071 __m128 u5 = _mm_loadu_ps(in + 4 * i + offsets[5]);
2072 __m128 u6 = _mm_loadu_ps(in + 4 * i + offsets[6]);
2073 __m128 u7 = _mm_loadu_ps(in + 4 * i + offsets[7]);
// Pair the 128-bit rows into 256-bit registers (t3 zeroed to give the
// insertf128 chain a defined starting value).
2076 __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
2077 t0 = _mm256_insertf128_ps(t3, u0, 0);
2078 t0 = _mm256_insertf128_ps(t0, u4, 1);
2079 t1 = _mm256_insertf128_ps(t3, u1, 0);
2080 t1 = _mm256_insertf128_ps(t1, u5, 1);
2081 t2 = _mm256_insertf128_ps(t3, u2, 0);
2082 t2 = _mm256_insertf128_ps(t2, u6, 1);
2083 t3 = _mm256_insertf128_ps(t3, u3, 0);
2084 t3 = _mm256_insertf128_ps(t3, u7, 1);
// Shuffle network completing the transpose into whole output registers.
2085 __m256 v0 = _mm256_shuffle_ps(t0, t1, 0x44);
2086 __m256 v1 = _mm256_shuffle_ps(t0, t1, 0xee);
2087 __m256 v2 = _mm256_shuffle_ps(t2, t3, 0x44);
2088 __m256 v3 = _mm256_shuffle_ps(t2, t3, 0xee);
2089 out[4 * i + 0].
data = _mm256_shuffle_ps(v0, v2, 0x88);
2090 out[4 * i + 1].
data = _mm256_shuffle_ps(v0, v2, 0xdd);
2091 out[4 * i + 2].
data = _mm256_shuffle_ps(v1, v3, 0x88);
2092 out[4 * i + 3].
data = _mm256_shuffle_ps(v1, v3, 0xdd);
// Scalar remainder.
2094 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2095 for (
unsigned int v = 0; v < 8; ++v)
2096 out[i][v] = in[offsets[v] + i];
// AVX float specialization of vectorized_transpose_and_store: inverse of the
// load/transpose above; shuffle-transposes, extracts 128-bit rows, then either
// accumulates (add_into == true) or overwrites the eight output rows.
// NOTE(review): garbled extraction; embedded numbers are original line numbers.
2106 vectorized_transpose_and_store(
const bool add_into,
2107 const unsigned int n_entries,
2109 const unsigned int * offsets,
2112 const unsigned int n_chunks = n_entries / 4;
2113 for (
unsigned int i = 0; i < n_chunks; ++i)
2115 __m256 u0 = in[4 * i + 0].
data;
2116 __m256 u1 = in[4 * i + 1].
data;
2117 __m256 u2 = in[4 * i + 2].
data;
2118 __m256 u3 = in[4 * i + 3].
data;
// Transpose shuffle network, mirroring the load path.
2119 __m256 t0 = _mm256_shuffle_ps(u0, u1, 0x44);
2120 __m256 t1 = _mm256_shuffle_ps(u0, u1, 0xee);
2121 __m256 t2 = _mm256_shuffle_ps(u2, u3, 0x44);
2122 __m256 t3 = _mm256_shuffle_ps(u2, u3, 0xee);
2123 u0 = _mm256_shuffle_ps(t0, t2, 0x88);
2124 u1 = _mm256_shuffle_ps(t0, t2, 0xdd);
2125 u2 = _mm256_shuffle_ps(t1, t3, 0x88);
2126 u3 = _mm256_shuffle_ps(t1, t3, 0xdd);
// Split each 256-bit register into its two 128-bit rows.
2127 __m128 res0 = _mm256_extractf128_ps(u0, 0);
2128 __m128 res4 = _mm256_extractf128_ps(u0, 1);
2129 __m128 res1 = _mm256_extractf128_ps(u1, 0);
2130 __m128 res5 = _mm256_extractf128_ps(u1, 1);
2131 __m128 res2 = _mm256_extractf128_ps(u2, 0);
2132 __m128 res6 = _mm256_extractf128_ps(u2, 1);
2133 __m128 res3 = _mm256_extractf128_ps(u3, 0);
2134 __m128 res7 = _mm256_extractf128_ps(u3, 1);
// add_into branch: load, add, store per row.
2141 res0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0]), res0);
2142 _mm_storeu_ps(out + 4 * i + offsets[0], res0);
2143 res1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1]), res1);
2144 _mm_storeu_ps(out + 4 * i + offsets[1], res1);
2145 res2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2]), res2);
2146 _mm_storeu_ps(out + 4 * i + offsets[2], res2);
2147 res3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3]), res3);
2148 _mm_storeu_ps(out + 4 * i + offsets[3], res3);
2149 res4 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[4]), res4);
2150 _mm_storeu_ps(out + 4 * i + offsets[4], res4);
2151 res5 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[5]), res5);
2152 _mm_storeu_ps(out + 4 * i + offsets[5], res5);
2153 res6 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[6]), res6);
2154 _mm_storeu_ps(out + 4 * i + offsets[6], res6);
2155 res7 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[7]), res7);
2156 _mm_storeu_ps(out + 4 * i + offsets[7], res7);
// overwrite branch: plain stores.
2160 _mm_storeu_ps(out + 4 * i + offsets[0], res0);
2161 _mm_storeu_ps(out + 4 * i + offsets[1], res1);
2162 _mm_storeu_ps(out + 4 * i + offsets[2], res2);
2163 _mm_storeu_ps(out + 4 * i + offsets[3], res3);
2164 _mm_storeu_ps(out + 4 * i + offsets[4], res4);
2165 _mm_storeu_ps(out + 4 * i + offsets[5], res5);
2166 _mm_storeu_ps(out + 4 * i + offsets[6], res6);
2167 _mm_storeu_ps(out + 4 * i + offsets[7], res7);
// Scalar remainder, accumulate variant.
2171 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2172 for (
unsigned int v = 0; v < 8; ++v)
2173 out[offsets[v] + i] += in[i][v];
// Scalar remainder, overwrite variant.
2175 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2176 for (
unsigned int v = 0; v < 8; ++v)
2177 out[offsets[v] + i] = in[i][v];
// SSE2 specialization of VectorizedArray for double: 2 lanes in one __m128d.
// Compiled when vectorization level >= 1 and no AVX/AVX-512 branch applies.
// NOTE(review): garbled extraction; embedded numbers are original line numbers,
// and several member signatures were lost (only the bodies survive).
2185 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 2191 class VectorizedArray<double>
2197 static const unsigned int n_array_elements = 2;
2202 DEAL_II_ALWAYS_INLINE
// operator=(double): broadcast to both lanes.
2206 data = _mm_set1_pd(x);
2213 DEAL_II_ALWAYS_INLINE
// operator[]: lane access via reinterpret of the register.
2217 return *(
reinterpret_cast<double *
>(&data) + comp);
2223 DEAL_II_ALWAYS_INLINE
2227 return *(
reinterpret_cast<const double *
>(&data) + comp);
2233 DEAL_II_ALWAYS_INLINE
// Compound arithmetic (intrinsic path; alternate branch lost in extraction).
2237 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2240 data = _mm_add_pd(data, vec.
data);
2248 DEAL_II_ALWAYS_INLINE
2252 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2255 data = _mm_sub_pd(data, vec.
data);
2263 DEAL_II_ALWAYS_INLINE
2267 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2270 data = _mm_mul_pd(data, vec.
data);
2278 DEAL_II_ALWAYS_INLINE
2282 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2285 data = _mm_div_pd(data, vec.
data);
2295 DEAL_II_ALWAYS_INLINE
2299 data = _mm_loadu_pd(ptr);
2308 DEAL_II_ALWAYS_INLINE
2312 _mm_storeu_pd(ptr, data);
2318 DEAL_II_ALWAYS_INLINE
// Non-temporal store; 16-byte alignment required for __m128d.
2322 Assert(reinterpret_cast<std::size_t>(ptr) % 16 == 0,
2324 _mm_stream_pd(ptr, data);
2339 DEAL_II_ALWAYS_INLINE
// gather/scatter: SSE2 has no gather/scatter instructions — scalar loops.
2341 gather(
const double *base_ptr,
const unsigned int *offsets)
2343 for (
unsigned int i = 0; i < 2; ++i)
2344 *(reinterpret_cast<double *>(&data) + i) = base_ptr[offsets[i]];
2359 DEAL_II_ALWAYS_INLINE
2361 scatter(
const unsigned int *offsets,
double *base_ptr)
const 2363 for (
unsigned int i = 0; i < 2; ++i)
2364 base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data) + i);
2378 DEAL_II_ALWAYS_INLINE
2382 VectorizedArray res;
2383 res.
data = _mm_sqrt_pd(data);
2391 DEAL_II_ALWAYS_INLINE
// abs via andnot with -0.0 mask: clears each lane's sign bit.
2399 __m128d mask = _mm_set1_pd(-0.);
2400 VectorizedArray res;
2401 res.
data = _mm_andnot_pd(mask, data);
2409 DEAL_II_ALWAYS_INLINE
2413 VectorizedArray res;
2414 res.
data = _mm_max_pd(data, other.
data);
2422 DEAL_II_ALWAYS_INLINE
2426 VectorizedArray res;
2427 res.
data = _mm_min_pd(data, other.
data);
// Friend template declarations (bodies lost in extraction).
2434 template <
typename Number2>
2437 template <
typename Number2>
2440 template <
typename Number2>
2443 template <
typename Number2>
2455 vectorized_load_and_transpose(
const unsigned int n_entries,
2457 const unsigned int * offsets,
2460 const unsigned int n_chunks = n_entries / 2;
2461 for (
unsigned int i = 0; i < n_chunks; ++i)
2463 __m128d u0 = _mm_loadu_pd(in + 2 * i + offsets[0]);
2464 __m128d u1 = _mm_loadu_pd(in + 2 * i + offsets[1]);
2465 out[2 * i + 0].
data = _mm_unpacklo_pd(u0, u1);
2466 out[2 * i + 1].
data = _mm_unpackhi_pd(u0, u1);
2468 for (
unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2469 for (
unsigned int v = 0; v < 2; ++v)
2470 out[i][v] = in[offsets[v] + i];
2480 vectorized_transpose_and_store(
const bool add_into,
2481 const unsigned int n_entries,
2483 const unsigned int * offsets,
2486 const unsigned int n_chunks = n_entries / 2;
2489 for (
unsigned int i = 0; i < n_chunks; ++i)
2491 __m128d u0 = in[2 * i + 0].
data;
2492 __m128d u1 = in[2 * i + 1].
data;
2493 __m128d res0 = _mm_unpacklo_pd(u0, u1);
2494 __m128d res1 = _mm_unpackhi_pd(u0, u1);
2495 _mm_storeu_pd(out + 2 * i + offsets[0],
2496 _mm_add_pd(_mm_loadu_pd(out + 2 * i + offsets[0]),
2498 _mm_storeu_pd(out + 2 * i + offsets[1],
2499 _mm_add_pd(_mm_loadu_pd(out + 2 * i + offsets[1]),
2502 for (
unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2503 for (
unsigned int v = 0; v < 2; ++v)
2504 out[offsets[v] + i] += in[i][v];
2508 for (
unsigned int i = 0; i < n_chunks; ++i)
2510 __m128d u0 = in[2 * i + 0].
data;
2511 __m128d u1 = in[2 * i + 1].
data;
2512 __m128d res0 = _mm_unpacklo_pd(u0, u1);
2513 __m128d res1 = _mm_unpackhi_pd(u0, u1);
2514 _mm_storeu_pd(out + 2 * i + offsets[0], res0);
2515 _mm_storeu_pd(out + 2 * i + offsets[1], res1);
2517 for (
unsigned int i = 2 * n_chunks; i < n_entries; ++i)
2518 for (
unsigned int v = 0; v < 2; ++v)
2519 out[offsets[v] + i] = in[i][v];
2529 class VectorizedArray<float>
2535 static const unsigned int n_array_elements = 4;
2541 DEAL_II_ALWAYS_INLINE
2545 data = _mm_set1_ps(x);
2552 DEAL_II_ALWAYS_INLINE
2556 return *(
reinterpret_cast<float *
>(&data) + comp);
2562 DEAL_II_ALWAYS_INLINE
2566 return *(
reinterpret_cast<const float *
>(&data) + comp);
2572 DEAL_II_ALWAYS_INLINE
2576 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2579 data = _mm_add_ps(data, vec.
data);
2587 DEAL_II_ALWAYS_INLINE
2591 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2594 data = _mm_sub_ps(data, vec.
data);
2602 DEAL_II_ALWAYS_INLINE
2606 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2609 data = _mm_mul_ps(data, vec.
data);
2617 DEAL_II_ALWAYS_INLINE
2621 # ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 2624 data = _mm_div_ps(data, vec.
data);
2634 DEAL_II_ALWAYS_INLINE
2638 data = _mm_loadu_ps(ptr);
2647 DEAL_II_ALWAYS_INLINE
2651 _mm_storeu_ps(ptr, data);
2657 DEAL_II_ALWAYS_INLINE
2661 Assert(reinterpret_cast<std::size_t>(ptr) % 16 == 0,
2663 _mm_stream_ps(ptr, data);
2678 DEAL_II_ALWAYS_INLINE
2680 gather(
const float *base_ptr,
const unsigned int *offsets)
2682 for (
unsigned int i = 0; i < 4; ++i)
2683 *(reinterpret_cast<float *>(&data) + i) = base_ptr[offsets[i]];
2698 DEAL_II_ALWAYS_INLINE
2700 scatter(
const unsigned int *offsets,
float *base_ptr)
const 2702 for (
unsigned int i = 0; i < 4; ++i)
2703 base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data) + i);
2717 DEAL_II_ALWAYS_INLINE
2721 VectorizedArray res;
2722 res.
data = _mm_sqrt_ps(data);
2730 DEAL_II_ALWAYS_INLINE
2737 __m128 mask = _mm_set1_ps(-0.f);
2738 VectorizedArray res;
2739 res.
data = _mm_andnot_ps(mask, data);
2747 DEAL_II_ALWAYS_INLINE
2751 VectorizedArray res;
2752 res.
data = _mm_max_ps(data, other.
data);
2760 DEAL_II_ALWAYS_INLINE
2764 VectorizedArray res;
2765 res.
data = _mm_min_ps(data, other.
data);
2772 template <
typename Number2>
2775 template <
typename Number2>
2778 template <
typename Number2>
2781 template <
typename Number2>
2793 vectorized_load_and_transpose(
const unsigned int n_entries,
2795 const unsigned int * offsets,
2798 const unsigned int n_chunks = n_entries / 4;
2799 for (
unsigned int i = 0; i < n_chunks; ++i)
2801 __m128 u0 = _mm_loadu_ps(in + 4 * i + offsets[0]);
2802 __m128 u1 = _mm_loadu_ps(in + 4 * i + offsets[1]);
2803 __m128 u2 = _mm_loadu_ps(in + 4 * i + offsets[2]);
2804 __m128 u3 = _mm_loadu_ps(in + 4 * i + offsets[3]);
2805 __m128 v0 = _mm_shuffle_ps(u0, u1, 0x44);
2806 __m128 v1 = _mm_shuffle_ps(u0, u1, 0xee);
2807 __m128 v2 = _mm_shuffle_ps(u2, u3, 0x44);
2808 __m128 v3 = _mm_shuffle_ps(u2, u3, 0xee);
2809 out[4 * i + 0].
data = _mm_shuffle_ps(v0, v2, 0x88);
2810 out[4 * i + 1].
data = _mm_shuffle_ps(v0, v2, 0xdd);
2811 out[4 * i + 2].
data = _mm_shuffle_ps(v1, v3, 0x88);
2812 out[4 * i + 3].
data = _mm_shuffle_ps(v1, v3, 0xdd);
2814 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2815 for (
unsigned int v = 0; v < 4; ++v)
2816 out[i][v] = in[offsets[v] + i];
2826 vectorized_transpose_and_store(
const bool add_into,
2827 const unsigned int n_entries,
2829 const unsigned int * offsets,
2832 const unsigned int n_chunks = n_entries / 4;
2833 for (
unsigned int i = 0; i < n_chunks; ++i)
2835 __m128 u0 = in[4 * i + 0].
data;
2836 __m128 u1 = in[4 * i + 1].
data;
2837 __m128 u2 = in[4 * i + 2].
data;
2838 __m128 u3 = in[4 * i + 3].
data;
2839 __m128 t0 = _mm_shuffle_ps(u0, u1, 0x44);
2840 __m128 t1 = _mm_shuffle_ps(u0, u1, 0xee);
2841 __m128 t2 = _mm_shuffle_ps(u2, u3, 0x44);
2842 __m128 t3 = _mm_shuffle_ps(u2, u3, 0xee);
2843 u0 = _mm_shuffle_ps(t0, t2, 0x88);
2844 u1 = _mm_shuffle_ps(t0, t2, 0xdd);
2845 u2 = _mm_shuffle_ps(t1, t3, 0x88);
2846 u3 = _mm_shuffle_ps(t1, t3, 0xdd);
2853 u0 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[0]), u0);
2854 _mm_storeu_ps(out + 4 * i + offsets[0], u0);
2855 u1 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[1]), u1);
2856 _mm_storeu_ps(out + 4 * i + offsets[1], u1);
2857 u2 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[2]), u2);
2858 _mm_storeu_ps(out + 4 * i + offsets[2], u2);
2859 u3 = _mm_add_ps(_mm_loadu_ps(out + 4 * i + offsets[3]), u3);
2860 _mm_storeu_ps(out + 4 * i + offsets[3], u3);
2864 _mm_storeu_ps(out + 4 * i + offsets[0], u0);
2865 _mm_storeu_ps(out + 4 * i + offsets[1], u1);
2866 _mm_storeu_ps(out + 4 * i + offsets[2], u2);
2867 _mm_storeu_ps(out + 4 * i + offsets[3], u3);
2871 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2872 for (
unsigned int v = 0; v < 4; ++v)
2873 out[offsets[v] + i] += in[i][v];
2875 for (
unsigned int i = 4 * n_chunks; i < n_entries; ++i)
2876 for (
unsigned int v = 0; v < 4; ++v)
2877 out[offsets[v] + i] = in[i][v];
2882 #endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0 2890 template <
typename Number>
2891 inline DEAL_II_ALWAYS_INLINE
bool 2895 for (
unsigned int i = 0; i < VectorizedArray<Number>::n_array_elements; ++i)
2896 if (lhs[i] != rhs[i])
2908 template <
typename Number>
2921 template <
typename Number>
2934 template <
typename Number>
2947 template <
typename Number>
2961 template <
typename Number>
2992 template <
typename Number>
3019 template <
typename Number>
3050 template <
typename Number>
3081 template <
typename Number>
3112 template <
typename Number>
3139 template <
typename Number>
3170 template <
typename Number>
3200 template <
typename Number>
3212 template <
typename Number>
3222 DEAL_II_NAMESPACE_CLOSE
3240 template <
typename Number>
3241 inline ::VectorizedArray<Number>
3242 sin(const ::VectorizedArray<Number> &x)
3250 for (
unsigned int i = 0;
3251 i < ::VectorizedArray<Number>::n_array_elements;
3253 values[i] = std::sin(x[i]);
3255 out.
load(&values[0]);
3268 template <
typename Number>
3269 inline ::VectorizedArray<Number>
3270 cos(const ::VectorizedArray<Number> &x)
3273 for (
unsigned int i = 0;
3274 i < ::VectorizedArray<Number>::n_array_elements;
3276 values[i] = std::cos(x[i]);
3278 out.
load(&values[0]);
3291 template <
typename Number>
3292 inline ::VectorizedArray<Number>
3293 tan(const ::VectorizedArray<Number> &x)
3296 for (
unsigned int i = 0;
3297 i < ::VectorizedArray<Number>::n_array_elements;
3299 values[i] = std::tan(x[i]);
3301 out.
load(&values[0]);
3314 template <
typename Number>
3315 inline ::VectorizedArray<Number>
3316 exp(const ::VectorizedArray<Number> &x)
3319 for (
unsigned int i = 0;
3320 i < ::VectorizedArray<Number>::n_array_elements;
3322 values[i] = std::exp(x[i]);
3324 out.
load(&values[0]);
3337 template <
typename Number>
3338 inline ::VectorizedArray<Number>
3339 log(const ::VectorizedArray<Number> &x)
3342 for (
unsigned int i = 0;
3343 i < ::VectorizedArray<Number>::n_array_elements;
3345 values[i] = std::log(x[i]);
3347 out.
load(&values[0]);
3360 template <
typename Number>
3361 inline ::VectorizedArray<Number>
3362 sqrt(const ::VectorizedArray<Number> &x)
3364 return x.get_sqrt();
3376 template <
typename Number>
3377 inline ::VectorizedArray<Number>
3378 pow(const ::VectorizedArray<Number> &x,
const Number p)
3381 for (
unsigned int i = 0;
3382 i < ::VectorizedArray<Number>::n_array_elements;
3384 values[i] = std::pow(x[i], p);
3386 out.
load(&values[0]);
3399 template <
typename Number>
3400 inline ::VectorizedArray<Number>
3401 abs(const ::VectorizedArray<Number> &x)
3415 template <
typename Number>
3416 inline ::VectorizedArray<Number>
3417 max(const ::VectorizedArray<Number> &x,
3418 const ::VectorizedArray<Number> &y)
3420 return x.get_max(y);
3432 template <
typename Number>
3433 inline ::VectorizedArray<Number>
3434 min(const ::VectorizedArray<Number> &x,
3435 const ::VectorizedArray<Number> &y)
3437 return x.get_min(y);
VectorizedArray< Number > operator/(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
VectorizedArray< Number > operator+(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
void gather(const double *base_ptr, const unsigned int *offsets)
VectorizedArray & operator-=(const VectorizedArray &vec)
double & operator[](const unsigned int comp)
VectorizedArray< Number > operator-(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
VectorizedArray< Number > log(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > operator*(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
const Number & operator[](const unsigned int comp) const
VectorizedArray & operator+=(const VectorizedArray &vec)
VectorizedArray get_sqrt() const
VectorizedArray< Number > operator-(const Number &u, const VectorizedArray< Number > &v)
VectorizedArray< float > operator-(const double &u, const VectorizedArray< float > &v)
VectorizedArray< float > operator+(const double &u, const VectorizedArray< float > &v)
VectorizedArray< Number > operator/(const Number &u, const VectorizedArray< Number > &v)
VectorizedArray< float > operator*(const double &u, const VectorizedArray< float > &v)
VectorizedArray< Number > make_vectorized_array(const Number &u)
VectorizedArray< float > operator/(const double &u, const VectorizedArray< float > &v)
VectorizedArray< Number > tan(const ::VectorizedArray< Number > &x)
#define AssertIndexRange(index, range)
void streaming_store(Number *ptr) const
float & operator[](const unsigned int comp)
VectorizedArray< Number > exp(const ::VectorizedArray< Number > &x)
VectorizedArray get_sqrt() const
bool operator==(const VectorizedArray< Number > &lhs, const VectorizedArray< Number > &rhs)
void store(double *ptr) const
void gather(const float *base_ptr, const unsigned int *offsets)
void scatter(const unsigned int *offsets, Number *base_ptr) const
void streaming_store(float *ptr) const
VectorizedArray & operator+=(const VectorizedArray< Number > &vec)
void scatter(const unsigned int *offsets, double *base_ptr) const
VectorizedArray get_min(const VectorizedArray &other) const
VectorizedArray & operator*=(const VectorizedArray &vec)
void vectorized_transpose_and_store(const bool add_into, const unsigned int n_entries, const VectorizedArray< Number > *in, const unsigned int *offsets, Number *out)
const float & operator[](const unsigned int comp) const
static::ExceptionBase & ExcMessage(std::string arg1)
Number & operator[](const unsigned int comp)
VectorizedArray< Number > operator+(const VectorizedArray< Number > &v, const Number &u)
VectorizedArray get_abs() const
VectorizedArray get_abs() const
void store(Number *ptr) const
VectorizedArray< Number > min(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
VectorizedArray< Number > operator-(const VectorizedArray< Number > &u)
void load(const double *ptr)
#define Assert(cond, exc)
void scatter(const unsigned int *offsets, float *base_ptr) const
VectorizedArray get_min(const VectorizedArray &other) const
VectorizedArray get_abs() const
void streaming_store(double *ptr) const
VectorizedArray< Number > pow(const ::VectorizedArray< Number > &x, const Number p)
VectorizedArray & operator/=(const VectorizedArray &vec)
VectorizedArray & operator=(const double x)
VectorizedArray< Number > operator+(const VectorizedArray< Number > &u)
VectorizedArray get_max(const VectorizedArray &other) const
void vectorized_load_and_transpose(const unsigned int n_entries, const Number *in, const unsigned int *offsets, VectorizedArray< Number > *out)
VectorizedArray< Number > operator+(const Number &u, const VectorizedArray< Number > &v)
VectorizedArray & operator-=(const VectorizedArray< Number > &vec)
VectorizedArray & operator+=(const VectorizedArray &vec)
VectorizedArray< Number > operator-(const VectorizedArray< Number > &v, const Number &u)
void load(const float *ptr)
VectorizedArray & operator=(const float x)
VectorizedArray< Number > sqrt(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > sin(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > operator/(const VectorizedArray< Number > &v, const Number &u)
VectorizedArray get_max(const VectorizedArray &other) const
VectorizedArray & operator/=(const VectorizedArray &vec)
const double & operator[](const unsigned int comp) const
VectorizedArray & operator-=(const VectorizedArray &vec)
VectorizedArray & operator/=(const VectorizedArray< Number > &vec)
void load(const Number *ptr)
VectorizedArray & operator*=(const VectorizedArray &vec)
VectorizedArray< Number > operator*(const Number &u, const VectorizedArray< Number > &v)
VectorizedArray< Number > operator*(const VectorizedArray< Number > &v, const Number &u)
void gather(const Number *base_ptr, const unsigned int *offsets)
VectorizedArray & operator*=(const VectorizedArray< Number > &vec)
VectorizedArray< float > operator*(const VectorizedArray< float > &v, const double &u)
VectorizedArray< float > operator/(const VectorizedArray< float > &v, const double &u)
VectorizedArray & operator=(const Number scalar)
VectorizedArray< Number > abs(const ::VectorizedArray< Number > &x)
void store(float *ptr) const
VectorizedArray get_min(const VectorizedArray &other) const
VectorizedArray< float > operator+(const VectorizedArray< float > &v, const double &u)
VectorizedArray< Number > max(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
VectorizedArray get_sqrt() const
VectorizedArray< Number > cos(const ::VectorizedArray< Number > &x)
VectorizedArray< float > operator-(const VectorizedArray< float > &v, const double &u)
VectorizedArray get_max(const VectorizedArray &other) const