Reference documentation for deal.II version 9.1.0-pre
cuda_fe_evaluation.h
// ---------------------------------------------------------------------
//
// Copyright (C) 2016 - 2018 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------

#ifndef dealii_cuda_fe_evaluation_h
#define dealii_cuda_fe_evaluation_h

#include <deal.II/base/tensor.h>
#include <deal.II/base/utilities.h>

#include <deal.II/lac/cuda_vector.h>

#include <deal.II/matrix_free/cuda_matrix_free.h>
#include <deal.II/matrix_free/cuda_matrix_free.templates.h>
#include <deal.II/matrix_free/cuda_tensor_product_kernels.h>

DEAL_II_NAMESPACE_OPEN

namespace CUDAWrappers
{
  namespace internal
  {
    template <int dim, int fe_degree, bool transpose, typename Number>
    __device__ void
    resolve_hanging_nodes_shmem(Number *values, const unsigned int constr)
    {
      // TODO
    }
  } // namespace internal


  /**
   * This class provides the functions needed to evaluate a finite element
   * function and its gradients at quadrature points and to perform cell
   * integrations on the device. In functionality it is similar to FEValues.
   */
  template <int dim,
            int fe_degree,
            int n_q_points_1d = fe_degree + 1,
            int n_components_ = 1,
            typename Number   = double>
  class FEEvaluation
  {
  public:
    using value_type    = Number;
    using gradient_type = Tensor<1, dim, Number>;
    using data_type     = typename MatrixFree<dim, Number>::Data;

    static constexpr unsigned int dimension    = dim;
    static constexpr unsigned int n_components = n_components_;
    static constexpr unsigned int n_q_points =
      Utilities::pow(n_q_points_1d, dim);
    static constexpr unsigned int tensor_dofs_per_cell =
      Utilities::pow(fe_degree + 1, dim);

    /**
     * Constructor.
     */
    __device__
    FEEvaluation(int cell_id,
                 const data_type *        data,
                 SharedData<dim, Number> *shdata);

    /**
     * Read the values of the source vector @p src associated with the
     * degrees of freedom of the current cell into the internal buffer.
     */
    __device__ void
    read_dof_values(const Number *src);

    /**
     * Add the values stored in the internal buffer to the destination
     * vector @p dst.
     */
    __device__ void
    distribute_local_to_global(Number *dst) const;

    /**
     * Evaluate the function values and/or gradients at the quadrature
     * points from the degree-of-freedom values read previously.
     */
    __device__ void
    evaluate(const bool evaluate_val, const bool evaluate_grad);

    /**
     * Test the submitted values and/or gradients against the shape
     * functions and sum over the quadrature points.
     */
    __device__ void
    integrate(const bool integrate_val, const bool integrate_grad);

    /**
     * Return the value at quadrature point @p q_point.
     */
    __device__ value_type
    get_value(const unsigned int q_point) const;

    /**
     * Submit a value at quadrature point @p q_point; it is multiplied by
     * the JxW factor and used by integrate().
     */
    __device__ void
    submit_value(const value_type &val_in, const unsigned int q_point);

    /**
     * Return the gradient in real coordinates at quadrature point
     * @p q_point.
     */
    __device__ gradient_type
    get_gradient(const unsigned int q_point) const;

    /**
     * Submit a gradient at quadrature point @p q_point; it is transformed
     * by the inverse Jacobian, multiplied by the JxW factor, and used by
     * integrate().
     */
    __device__ void
    submit_gradient(const gradient_type &grad_in, const unsigned int q_point);

    /**
     * Apply the functor @p func at every quadrature point of the cell.
     */
    template <typename functor>
    __device__ void
    apply_quad_point_operations(const functor &func);

  private:
    unsigned int *local_to_global;
    unsigned int  n_cells;
    unsigned int  padding_length;

    const unsigned int constraint_mask;

    Number *inv_jac;
    Number *JxW;

    // Internal buffer
    Number *values;
    Number *gradients[dim];
  };


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    FEEvaluation(int cell_id,
                 const data_type *        data,
                 SharedData<dim, Number> *shdata)
    : n_cells(data->n_cells)
    , padding_length(data->padding_length)
    , constraint_mask(data->constraint_mask[cell_id])
    , values(shdata->values)
  {
    local_to_global = data->local_to_global + padding_length * cell_id;
    inv_jac         = data->inv_jacobian + padding_length * cell_id;
    JxW             = data->JxW + padding_length * cell_id;

    for (unsigned int i = 0; i < dim; ++i)
      gradients[i] = shdata->gradients[i];
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    read_dof_values(const Number *src)
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");
    const unsigned int idx =
      (threadIdx.x % n_q_points_1d) +
      (dim > 1 ? threadIdx.y : 0) * n_q_points_1d +
      (dim > 2 ? threadIdx.z : 0) * n_q_points_1d * n_q_points_1d;

    const unsigned int src_idx = local_to_global[idx];
    // Use the read-only data cache.
    values[idx] = __ldg(&src[src_idx]);

    if (constraint_mask)
      internal::resolve_hanging_nodes_shmem<dim, fe_degree, false>(
        values, constraint_mask);

    __syncthreads();
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    distribute_local_to_global(Number *dst) const
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");
    if (constraint_mask)
      internal::resolve_hanging_nodes_shmem<dim, fe_degree, true>(
        values, constraint_mask);

    const unsigned int idx =
      (threadIdx.x % n_q_points_1d) +
      (dim > 1 ? threadIdx.y : 0) * n_q_points_1d +
      (dim > 2 ? threadIdx.z : 0) * n_q_points_1d * n_q_points_1d;
    const unsigned int destination_idx = local_to_global[idx];

    dst[destination_idx] += values[idx];
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::evaluate(
    const bool evaluate_val,
    const bool evaluate_grad)
  {
    // First evaluate the gradients because it requires values that will be
    // changed if evaluate_val is true
    internal::EvaluatorTensorProduct<
      internal::EvaluatorVariant::evaluate_general,
      dim,
      fe_degree,
      n_q_points_1d,
      Number>
      evaluator_tensor_product;
    if (evaluate_grad == true)
      {
        evaluator_tensor_product.gradient_at_quad_pts(values, gradients);
        __syncthreads();
      }

    if (evaluate_val == true)
      {
        evaluator_tensor_product.value_at_quad_pts(values);
        __syncthreads();
      }
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::integrate(
    const bool integrate_val,
    const bool integrate_grad)
  {
    internal::EvaluatorTensorProduct<
      internal::EvaluatorVariant::evaluate_general,
      dim,
      fe_degree,
      n_q_points_1d,
      Number>
      evaluator_tensor_product;
    if (integrate_val == true)
      {
        evaluator_tensor_product.integrate_value(values);
        __syncthreads();
        if (integrate_grad == true)
          {
            evaluator_tensor_product.integrate_gradient<true>(values,
                                                              gradients);
            __syncthreads();
          }
      }
    else if (integrate_grad == true)
      {
        evaluator_tensor_product.integrate_gradient<false>(values, gradients);
        __syncthreads();
      }
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ typename FEEvaluation<dim,
                                   fe_degree,
                                   n_q_points_1d,
                                   n_components_,
                                   Number>::value_type
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::get_value(
    const unsigned int q_point) const
  {
    return values[q_point];
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    submit_value(const value_type &val_in, const unsigned int q_point)
  {
    values[q_point] = val_in * JxW[q_point];
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ typename FEEvaluation<dim,
                                   fe_degree,
                                   n_q_points_1d,
                                   n_components_,
                                   Number>::gradient_type
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    get_gradient(const unsigned int q_point) const
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");
    // TODO optimize if the mesh is uniform
    const Number *inv_jacobian = &inv_jac[q_point];
    gradient_type grad;
    for (int d_1 = 0; d_1 < dim; ++d_1)
      {
        Number tmp = 0.;
        for (int d_2 = 0; d_2 < dim; ++d_2)
          tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] *
                 gradients[d_2][q_point];
        grad[d_1] = tmp;
      }

    return grad;
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    submit_gradient(const gradient_type &grad_in, const unsigned int q_point)
  {
    // TODO optimize if the mesh is uniform
    const Number *inv_jacobian = &inv_jac[q_point];
    for (int d_1 = 0; d_1 < dim; ++d_1)
      {
        Number tmp = 0.;
        for (int d_2 = 0; d_2 < dim; ++d_2)
          tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] *
                 grad_in[d_2];
        gradients[d_1][q_point] = tmp * JxW[q_point];
      }
  }


  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  template <typename functor>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    apply_quad_point_operations(const functor &func)
  {
    const unsigned int q_point =
      (dim == 1 ?
         threadIdx.x % n_q_points_1d :
         dim == 2 ?
           threadIdx.x % n_q_points_1d + n_q_points_1d * threadIdx.y :
           threadIdx.x % n_q_points_1d +
             n_q_points_1d * (threadIdx.y + n_q_points_1d * threadIdx.z));
    func(this, q_point);

    __syncthreads();
  }
} // namespace CUDAWrappers

DEAL_II_NAMESPACE_CLOSE

#endif
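
The class above is meant to be used from device code inside a cell loop of CUDAWrappers::MatrixFree. The snippet below is a minimal sketch, not part of this header, showing the intended call sequence read_dof_values() -> evaluate() -> apply_quad_point_operations() -> integrate() -> distribute_local_to_global() for a Laplace-type operator. The class names LaplaceQuad and LocalLaplaceOperator are hypothetical, and the operator() signature and static member constants expected from a cell_loop() functor are assumptions modeled on later deal.II tutorials; consult the CUDAWrappers::MatrixFree documentation of this version for the exact interface.

// usage_sketch.cu -- hypothetical example, not part of the header above.
#include <deal.II/matrix_free/cuda_fe_evaluation.h>
#include <deal.II/matrix_free/cuda_matrix_free.h>

using namespace dealii;

// Quadrature-point operation for a plain Laplace (stiffness) contribution:
// re-submit the gradient so that integrate() tests it against the gradients
// of the shape functions, including the JxW factor.
template <int dim, int fe_degree>
class LaplaceQuad
{
public:
  __device__ void
  operator()(CUDAWrappers::FEEvaluation<dim, fe_degree> *fe_eval,
             const unsigned int                          q_point) const
  {
    fe_eval->submit_gradient(fe_eval->get_gradient(q_point), q_point);
  }
};

// Cell-local operator as it would be handed to
// CUDAWrappers::MatrixFree<dim, double>::cell_loop() (assumed interface).
template <int dim, int fe_degree>
class LocalLaplaceOperator
{
public:
  static const unsigned int n_dofs_1d    = fe_degree + 1;
  static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim);
  static const unsigned int n_q_points   = Utilities::pow(fe_degree + 1, dim);

  __device__ void
  operator()(const unsigned int cell,
             const typename CUDAWrappers::MatrixFree<dim, double>::Data *gpu_data,
             CUDAWrappers::SharedData<dim, double> *shared_data,
             const double *                         src,
             double *                               dst) const
  {
    CUDAWrappers::FEEvaluation<dim, fe_degree> fe_eval(cell,
                                                       gpu_data,
                                                       shared_data);
    fe_eval.read_dof_values(src);            // gather + resolve constraints
    fe_eval.evaluate(false, true);           // only gradients are needed
    fe_eval.apply_quad_point_operations(LaplaceQuad<dim, fe_degree>());
    fe_eval.integrate(false, true);          // test against gradients
    fe_eval.distribute_local_to_global(dst); // scatter-add into dst
  }
};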