Reference documentation for deal.II version 9.1.0-pre
cuda_tensor_product_kernels.h
1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2017 - 2018 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE.md at
12 // the top level directory of deal.II.
13 //
14 // ---------------------------------------------------------------------
15 
16 
17 #ifndef dealii_cuda_tensor_product_kernels_h
18 #define dealii_cuda_tensor_product_kernels_h
19 
20 #include <deal.II/base/config.h>
21 
22 
23 DEAL_II_NAMESPACE_OPEN
24 
25 
26 namespace CUDAWrappers
27 {
28  namespace internal
29  {
36  // TODO: for now only the general variant is implemented
38  {
39  evaluate_general,
40  evaluate_symmetric,
41  evaluate_evenodd
42  };
43 
44 
45 
51  template <EvaluatorVariant variant,
52  int dim,
53  int fe_degree,
54  int n_q_points_1d,
55  typename Number>
57  {};
58 
59 
60 
67  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
68  struct EvaluatorTensorProduct<evaluate_general,
69  dim,
70  fe_degree,
71  n_q_points_1d,
72  Number>
73  {
74  static constexpr unsigned int dofs_per_cell =
75  Utilities::pow(fe_degree + 1, dim);
76  static constexpr unsigned int n_q_points =
77  Utilities::pow(n_q_points_1d, dim);
78 
79  __device__
81 
86  template <int direction, bool dof_to_quad, bool add, bool in_place>
87  __device__ void
88  values(const Number *in, Number *out) const;
89 
94  template <int direction, bool dof_to_quad, bool add, bool in_place>
95  __device__ void
96  gradients(const Number *in, Number *out) const;
97 
101  template <int direction, bool dof_to_quad, bool add, bool in_place>
102  __device__ void
103  apply(Number shape_data[], const Number *in, Number *out) const;
104 
108  __device__ void
109  value_at_quad_pts(Number *u);
110 
114  __device__ void
115  integrate_value(Number *u);
116 
121  __device__ void
122  gradient_at_quad_pts(const Number *const u, Number *grad_u[dim]);
123 
128  template <bool add>
129  __device__ void
130  integrate_gradient(Number *u, Number *grad_u[dim]);
131  };
132 
133 
134 
135  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
136  __device__
137  EvaluatorTensorProduct<evaluate_general,
138  dim,
139  fe_degree,
140  n_q_points_1d,
141  Number>::EvaluatorTensorProduct()
142  {}
143 
144 
145 
146  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
147  template <int direction, bool dof_to_quad, bool add, bool in_place>
148  __device__ void
149  EvaluatorTensorProduct<evaluate_general,
150  dim,
151  fe_degree,
152  n_q_points_1d,
153  Number>::values(const Number *in, Number *out) const
154  {
155  apply<direction, dof_to_quad, add, in_place>(global_shape_values,
156  in,
157  out);
158  }
159 
160 
161 
162  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
163  template <int direction, bool dof_to_quad, bool add, bool in_place>
164  __device__ void
165  EvaluatorTensorProduct<evaluate_general,
166  dim,
167  fe_degree,
168  n_q_points_1d,
169  Number>::gradients(const Number *in,
170  Number * out) const
171  {
172  apply<direction, dof_to_quad, add, in_place>(global_shape_gradients,
173  in,
174  out);
175  }
176 
177 
178 
179  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
180  template <int direction, bool dof_to_quad, bool add, bool in_place>
181  __device__ void
182  EvaluatorTensorProduct<evaluate_general,
183  dim,
184  fe_degree,
185  n_q_points_1d,
186  Number>::apply(Number shape_data[],
187  const Number *in,
188  Number * out) const
189  {
190  const unsigned int i = (dim == 1) ? 0 : threadIdx.x % n_q_points_1d;
191  const unsigned int j = (dim == 3) ? threadIdx.y : 0;
192  const unsigned int q = (dim == 1) ?
193  (threadIdx.x % n_q_points_1d) :
194  (dim == 2) ? threadIdx.y : threadIdx.z;
195 
196  // This loop simply multiply the shape function at the quadrature point by
197  // the value finite element coefficient.
198  Number t = 0;
199  for (int k = 0; k < n_q_points_1d; ++k)
200  {
201  const unsigned int shape_idx =
202  dof_to_quad ? (q + k * n_q_points_1d) : (k + q * n_q_points_1d);
203  const unsigned int source_idx =
204  (direction == 0) ?
205  (k + n_q_points_1d * (i + n_q_points_1d * j)) :
206  (direction == 1) ? (i + n_q_points_1d * (k + n_q_points_1d * j)) :
207  (i + n_q_points_1d * (j + n_q_points_1d * k));
208  t += shape_data[shape_idx] *
209  (in_place ? out[source_idx] : in[source_idx]);
210  }
211 
212  if (in_place)
213  __syncthreads();
214 
215  const unsigned int destination_idx =
216  (direction == 0) ?
217  (q + n_q_points_1d * (i + n_q_points_1d * j)) :
218  (direction == 1) ? (i + n_q_points_1d * (q + n_q_points_1d * j)) :
219  (i + n_q_points_1d * (j + n_q_points_1d * q));
220 
221  if (add)
222  out[destination_idx] += t;
223  else
224  out[destination_idx] = t;
225  }
226 
227 
228 
229  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
230  inline __device__ void
231  EvaluatorTensorProduct<evaluate_general,
232  dim,
233  fe_degree,
234  n_q_points_1d,
235  Number>::value_at_quad_pts(Number *u)
236  {
237  switch (dim)
238  {
239  case 1:
240  {
241  values<0, true, false, true>(u, u);
242 
243  break;
244  }
245  case 2:
246  {
247  values<0, true, false, true>(u, u);
248  __syncthreads();
249  values<1, true, false, true>(u, u);
250 
251  break;
252  }
253  case 3:
254  {
255  values<0, true, false, true>(u, u);
256  __syncthreads();
257  values<1, true, false, true>(u, u);
258  __syncthreads();
259  values<2, true, false, true>(u, u);
260 
261  break;
262  }
263  default:
264  {
265  // Do nothing. We should throw but we can't from a __device__
266  // function.
267  }
268  }
269  }
270 
271 
272 
273  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
274  inline __device__ void
275  EvaluatorTensorProduct<evaluate_general,
276  dim,
277  fe_degree,
278  n_q_points_1d,
279  Number>::integrate_value(Number *u)
280  {
281  switch (dim)
282  {
283  case 1:
284  {
285  values<0, false, false, true>(u, u);
286 
287  break;
288  }
289  case 2:
290  {
291  values<0, false, false, true>(u, u);
292  __syncthreads();
293  values<1, false, false, true>(u, u);
294 
295  break;
296  }
297  case 3:
298  {
299  values<0, false, false, true>(u, u);
300  __syncthreads();
301  values<1, false, false, true>(u, u);
302  __syncthreads();
303  values<2, false, false, true>(u, u);
304 
305  break;
306  }
307  default:
308  {
309  // Do nothing. We should throw but we can't from a __device__
310  // function.
311  }
312  }
313  }
314 
315 
316 
317  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
318  inline __device__ void
319  EvaluatorTensorProduct<evaluate_general,
320  dim,
321  fe_degree,
322  n_q_points_1d,
323  Number>::gradient_at_quad_pts(const Number *const u,
324  Number *grad_u[dim])
325  {
326  switch (dim)
327  {
328  case 1:
329  {
330  gradients<0, true, false, false>(u, grad_u[0]);
331 
332  break;
333  }
334  case 2:
335  {
336  gradients<0, true, false, false>(u, grad_u[0]);
337  values<0, true, false, false>(u, grad_u[1]);
338 
339  __syncthreads();
340 
341  values<1, true, false, true>(grad_u[0], grad_u[0]);
342  gradients<1, true, false, true>(grad_u[1], grad_u[1]);
343 
344  break;
345  }
346  case 3:
347  {
348  gradients<0, true, false, false>(u, grad_u[0]);
349  values<0, true, false, false>(u, grad_u[1]);
350  values<0, true, false, false>(u, grad_u[2]);
351 
352  __syncthreads();
353 
354  values<1, true, false, true>(grad_u[0], grad_u[0]);
355  gradients<1, true, false, true>(grad_u[1], grad_u[1]);
356  values<1, true, false, true>(grad_u[2], grad_u[2]);
357 
358  __syncthreads();
359 
360  values<2, true, false, true>(grad_u[0], grad_u[0]);
361  values<2, true, false, true>(grad_u[1], grad_u[1]);
362  gradients<2, true, false, true>(grad_u[2], grad_u[2]);
363 
364  break;
365  }
366  default:
367  {
368  // Do nothing. We should throw but we can't from a __device__
369  // function.
370  }
371  }
372  }
373 
374 
375 
376  template <int dim, int fe_degree, int n_q_points_1d, typename Number>
377  template <bool add>
378  inline __device__ void
379  EvaluatorTensorProduct<evaluate_general,
380  dim,
381  fe_degree,
382  n_q_points_1d,
383  Number>::integrate_gradient(Number *u,
384  Number *grad_u[dim])
385  {
386  switch (dim)
387  {
388  case 1:
389  {
390  gradients<0, false, add, false>(grad_u[dim], u);
391 
392  break;
393  }
394  case 2:
395  {
396  gradients<0, false, false, true>(grad_u[0], grad_u[0]);
397  values<0, false, false, true>(grad_u[1], grad_u[1]);
398 
399  __syncthreads();
400 
401  values<1, false, add, false>(grad_u[0], u);
402  __syncthreads();
403  gradients<1, false, true, false>(grad_u[1], u);
404 
405  break;
406  }
407  case 3:
408  {
409  gradients<0, false, false, true>(grad_u[0], grad_u[0]);
410  values<0, false, false, true>(grad_u[1], grad_u[1]);
411  values<0, false, false, true>(grad_u[2], grad_u[2]);
412 
413  __syncthreads();
414 
415  values<1, false, false, true>(grad_u[0], grad_u[0]);
416  gradients<1, false, false, true>(grad_u[1], grad_u[1]);
417  values<1, false, false, true>(grad_u[2], grad_u[2]);
418 
419  __syncthreads();
420 
421  values<2, false, add, false>(grad_u[0], u);
422  __syncthreads();
423  values<2, false, true, false>(grad_u[1], u);
424  __syncthreads();
425  gradients<2, false, true, false>(grad_u[2], u);
426 
427  break;
428  }
429  default:
430  {
431  // Do nothing. We should throw but we can't from a __device__
432  // function.
433  }
434  }
435  }
436  } // namespace internal
437 } // namespace CUDAWrappers
438 
439 DEAL_II_NAMESPACE_CLOSE
440 
441 #endif
constexpr unsigned int pow(const unsigned int base, const unsigned int iexp)
Definition: utilities.h:353