Reference documentation for deal.II version 9.1.0-pre
Public Types | Public Member Functions | Public Attributes | Static Public Attributes | Private Member Functions | List of all members
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number > Class Template Reference

#include <deal.II/matrix_free/fe_evaluation.h>

Inheritance diagram for FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >:
[legend]

Public Types

using BaseClass = FEEvaluationAccess< dim, n_components_, Number, false >
 
using number_type = Number
 
using value_type = typename BaseClass::value_type
 
using gradient_type = typename BaseClass::gradient_type
 

Public Member Functions

 FEEvaluation (const MatrixFree< dim, Number > &matrix_free, const unsigned int dof_no=0, const unsigned int quad_no=0, const unsigned int first_selected_component=0)
 
 FEEvaluation (const Mapping< dim > &mapping, const FiniteElement< dim > &fe, const Quadrature< 1 > &quadrature, const UpdateFlags update_flags, const unsigned int first_selected_component=0)
 
 FEEvaluation (const FiniteElement< dim > &fe, const Quadrature< 1 > &quadrature, const UpdateFlags update_flags, const unsigned int first_selected_component=0)
 
template<int n_components_other>
 FEEvaluation (const FiniteElement< dim > &fe, const FEEvaluationBase< dim, n_components_other, Number > &other, const unsigned int first_selected_component=0)
 
 FEEvaluation (const FEEvaluation &other)
 
FEEvaluationoperator= (const FEEvaluation &other)
 
void reinit (const unsigned int cell_batch_index)
 
template<typename DoFHandlerType , bool level_dof_access>
void reinit (const TriaIterator< DoFCellAccessor< DoFHandlerType, level_dof_access >> &cell)
 
void reinit (const typename Triangulation< dim >::cell_iterator &cell)
 
void evaluate (const bool evaluate_values, const bool evaluate_gradients, const bool evaluate_hessians=false)
 
void evaluate (const VectorizedArray< Number > *values_array, const bool evaluate_values, const bool evaluate_gradients, const bool evaluate_hessians=false)
 
template<typename VectorType >
void gather_evaluate (const VectorType &input_vector, const bool evaluate_values, const bool evaluate_gradients, const bool evaluate_hessians=false)
 
void integrate (const bool integrate_values, const bool integrate_gradients)
 
void integrate (const bool integrate_values, const bool integrate_gradients, VectorizedArray< Number > *values_array)
 
template<typename VectorType >
void integrate_scatter (const bool integrate_values, const bool integrate_gradients, VectorType &output_vector)
 
Point< dim, VectorizedArray< Number > > quadrature_point (const unsigned int q_point) const
 
- Public Member Functions inherited from FEEvaluationBase< dim, n_components_, Number, is_face >
 ~FEEvaluationBase ()
 
unsigned int get_cell_data_number () const
 
unsigned int get_mapping_data_index_offset () const
 
internal::MatrixFreeFunctions::GeometryType get_cell_type () const
 
const internal::MatrixFreeFunctions::ShapeInfo< VectorizedArray< Number > > & get_shape_info () const
 
template<typename VectorType >
void read_dof_values (const VectorType &src, const unsigned int first_index=0)
 
template<typename VectorType >
void read_dof_values_plain (const VectorType &src, const unsigned int first_index=0)
 
template<typename VectorType >
void distribute_local_to_global (VectorType &dst, const unsigned int first_index=0) const
 
template<typename VectorType >
void set_dof_values (VectorType &dst, const unsigned int first_index=0) const
 
value_type get_dof_value (const unsigned int dof) const
 
void submit_dof_value (const value_type val_in, const unsigned int dof)
 
value_type get_value (const unsigned int q_point) const
 
void submit_value (const value_type val_in, const unsigned int q_point)
 
gradient_type get_gradient (const unsigned int q_point) const
 
value_type get_normal_derivative (const unsigned int q_point) const
 
void submit_gradient (const gradient_type grad_in, const unsigned int q_point)
 
void submit_normal_derivative (const value_type grad_in, const unsigned int q_point)
 
Tensor< 1, n_components_, Tensor< 2, dim, VectorizedArray< Number > > > get_hessian (const unsigned int q_point) const
 
gradient_type get_hessian_diagonal (const unsigned int q_point) const
 
value_type get_laplacian (const unsigned int q_point) const
 
VectorizedArray< Number > get_divergence (const unsigned int q_point) const
 
SymmetricTensor< 2, dim, VectorizedArray< Number > > get_symmetric_gradient (const unsigned int q_point) const
 
Tensor< 1,(dim==2?1:dim), VectorizedArray< Number > > get_curl (const unsigned int q_point) const
 
void submit_divergence (const VectorizedArray< Number > div_in, const unsigned int q_point)
 
void submit_symmetric_gradient (const SymmetricTensor< 2, dim, VectorizedArray< Number >> grad_in, const unsigned int q_point)
 
void submit_curl (const Tensor< 1, dim==2?1:dim, VectorizedArray< Number >> curl_in, const unsigned int q_point)
 
value_type integrate_value () const
 
VectorizedArray< Number > JxW (const unsigned int q_index) const
 
void fill_JxW_values (AlignedVector< VectorizedArray< Number >> &JxW_values) const
 
Tensor< 2, dim, VectorizedArray< Number > > inverse_jacobian (const unsigned int q_index) const
 
Tensor< 1, dim, VectorizedArray< Number > > get_normal_vector (const unsigned int q_point) const
 
VectorizedArray< Number > read_cell_data (const AlignedVector< VectorizedArray< Number >> &array) const
 
const VectorizedArray< Number > * begin_dof_values () const
 
VectorizedArray< Number > * begin_dof_values ()
 
const VectorizedArray< Number > * begin_values () const
 
VectorizedArray< Number > * begin_values ()
 
const VectorizedArray< Number > * begin_gradients () const
 
VectorizedArray< Number > * begin_gradients ()
 
const VectorizedArray< Number > * begin_hessians () const
 
VectorizedArray< Number > * begin_hessians ()
 
const std::vector< unsigned int > & get_internal_dof_numbering () const
 
ArrayView< VectorizedArray< Number > > get_scratch_data () const
 

Public Attributes

const unsigned int dofs_per_component
 
const unsigned int dofs_per_cell
 
const unsigned int n_q_points
 

Static Public Attributes

static constexpr unsigned int dimension = dim
 
static constexpr unsigned int n_components = n_components_
 
static constexpr unsigned int static_n_q_points
 
static constexpr unsigned int static_dofs_per_component
 
static constexpr unsigned int tensor_dofs_per_cell
 
static constexpr unsigned int static_dofs_per_cell
 

Private Member Functions

void check_template_arguments (const unsigned int fe_no, const unsigned int first_selected_component)
 

Additional Inherited Members

- Protected Member Functions inherited from FEEvaluationAccess< dim, n_components_, Number, false >
 FEEvaluationAccess (const MatrixFree< dim, Number > &matrix_free, const unsigned int dof_no, const unsigned int first_selected_component, const unsigned int quad_no, const unsigned int fe_degree, const unsigned int n_q_points, const bool is_interior_face=true)
 
 FEEvaluationAccess (const Mapping< dim > &mapping, const FiniteElement< dim > &fe, const Quadrature< 1 > &quadrature, const UpdateFlags update_flags, const unsigned int first_selected_component, const FEEvaluationBase< dim, n_components_other, Number, is_face > *other)
 
 FEEvaluationAccess (const FEEvaluationAccess &other)
 
FEEvaluationAccessoperator= (const FEEvaluationAccess &other)
 
- Protected Member Functions inherited from FEEvaluationBase< dim, n_components_, Number, is_face >
 FEEvaluationBase (const MatrixFree< dim, Number > &matrix_free, const unsigned int dof_no, const unsigned int first_selected_component, const unsigned int quad_no, const unsigned int fe_degree, const unsigned int n_q_points, const bool is_interior_face)
 
template<int n_components_other>
 FEEvaluationBase (const Mapping< dim > &mapping, const FiniteElement< dim > &fe, const Quadrature< 1 > &quadrature, const UpdateFlags update_flags, const unsigned int first_selected_component, const FEEvaluationBase< dim, n_components_other, Number > *other)
 
 FEEvaluationBase (const FEEvaluationBase &other)
 
FEEvaluationBaseoperator= (const FEEvaluationBase &other)
 
template<typename VectorType , typename VectorOperation >
void read_write_operation (const VectorOperation &operation, VectorType *vectors[], const bool apply_constraints=true) const
 
template<typename VectorType , typename VectorOperation >
void read_write_operation_contiguous (const VectorOperation &operation, VectorType *vectors[]) const
 
template<typename VectorType , typename VectorOperation >
void read_write_operation_global (const VectorOperation &operation, VectorType *vectors[]) const
 
- Protected Attributes inherited from FEEvaluationBase< dim, n_components_, Number, is_face >
AlignedVector< VectorizedArray< Number > > * scratch_data_array
 
VectorizedArray< Number > * scratch_data
 
VectorizedArray< Number > * values_dofs [n_components]
 
VectorizedArray< Number > * values_quad [n_components]
 
VectorizedArray< Number > * gradients_quad [n_components][dim]
 
VectorizedArray< Number > * hessians_quad [n_components][(dim *(dim+1))/2]
 
const unsigned int quad_no
 
const unsigned int n_fe_components
 
const unsigned int active_fe_index
 
const unsigned int active_quad_index
 
const unsigned int n_quadrature_points
 
const MatrixFree< dim, Number > * matrix_info
 
const internal::MatrixFreeFunctions::DoFInfodof_info
 
const internal::MatrixFreeFunctions::MappingInfoStorage<(is_face?dim-1:dim), dim, Number > * mapping_data
 
const internal::MatrixFreeFunctions::ShapeInfo< VectorizedArray< Number > > * data
 
const Tensor< 2, dim, VectorizedArray< Number > > * jacobian
 
const VectorizedArray< Number > * J_value
 
const Tensor< 1, dim, VectorizedArray< Number > > * normal_vectors
 
const Tensor< 1, dim, VectorizedArray< Number > > * normal_x_jacobian
 
const Number * quadrature_weights
 
unsigned int cell
 
bool is_interior_face
 
internal::MatrixFreeFunctions::DoFInfo::DoFAccessIndex dof_access_index
 
unsigned int face_no
 
unsigned int face_orientation
 
unsigned int subface_index
 
internal::MatrixFreeFunctions::GeometryType cell_type
 
bool dof_values_initialized
 
bool values_quad_initialized
 
bool gradients_quad_initialized
 
bool hessians_quad_initialized
 
bool values_quad_submitted
 
bool gradients_quad_submitted
 
std::shared_ptr< internal::MatrixFreeFunctions::MappingDataOnTheFly< dim, Number > > mapped_geometry
 
const unsigned int first_selected_component
 
std::vector< types::global_dof_indexlocal_dof_indices
 

Detailed Description

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
class FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >

The class that provides all functions necessary to evaluate functions at quadrature points and cell integrations. In functionality, this class is similar to FEValues, however, it includes a lot of specialized functions that make it much faster (between 5 and 500, depending on the polynomial degree). For evaluation of face terms in DG, see the class FEFaceEvaluation.

Usage and initialization

Fast usage in combination with MatrixFree

The first and foremost way of usage is to initialize this class from a MatrixFree object that caches everything related to the degrees of freedom and the mapping information. This way, it is possible to use vectorization for applying a differential operator for several cells at once.

The capabilities of FEEvaluation span a large spectrum of integration tasks for weak forms. In general, there are two classes of tasks that get done. One is the evaluate path that interpolates from a solution vector to quadrature points:

FEEvaluation<dim,fe_degree> phi(matrix_free);
for (unsigned int cell_index = cell_range.first;
cell_index < cell_range.second; ++cell_index)
{
phi.reinit(cell_index);
phi.read_dof_values(vector);
phi.evaluate(true, false); // interpolate values, but not gradients
for (unsigned int q_index=0; q_index<phi.n_q_points; ++q_index)
{
VectorizedArray<double> val = phi.get_value(q_index);
// do something with val
}
}

Likewise, a gradient of the finite element solution represented by vector can be interpolated to the quadrature points by phi.get_gradient(q). The combination of read_dof_values(), evaluate() and get_value() is similar to what FEValues::get_function_values or FEValues::get_function_gradients does, but it is in general much faster because it makes use of the tensor product, see the description of the evaluation routines below, and can do this operation for several cells at once through vectorization.

The second class of tasks done by FEEvaluation are integration tasks for right hand sides. In finite element computations, these typically consist of multiplying a quantity on quadrature points (a function value, or a field interpolated by the finite element space itself) by a set of test functions and integrating over the cell through summation of the values in each quadrature point, multiplied by the quadrature weight and the Jacobian determinant of the transformation. If a generic Function object is given and we want to compute \(v_i = \int_\Omega \varphi_i f dx\), this is done by the following cell-wise integration:

FEEvaluation<dim,fe_degree> phi(matrix_free);
Function<dim> &function = ...;
for (unsigned int cell_index = cell_range.first;
cell_index < cell_range.second; ++cell_index)
{
phi.reinit(cell_index);
for (unsigned int q_index=0; q_index<phi.n_q_points; ++q_index)
{
phi.quadrature_point(q_index);
// Need to evaluate function for each component in VectorizedArray
for (unsigned int v=0;
v<VectorizedArray<double>::n_array_elements;
++v)
{
for (unsigned int d=0; d<dim; ++d)
p[d] = p_vect[d][v];
f_value[v] = function.value(p);
}
phi.submit_value(f_value, q);
}
phi.integrate(true, false);
phi.distribute_local_to_global(dst);
}

In this code, the call to phi.submit_value() prepares for the multiplication by the test function prior to the actual integration (inside the submit call, the value to be tested is also multiplied by the determinant of the Jacobian and the quadrature weight). In the integrate() call, an integral contribution tested by each basis function underlying the FEEvaluation object (e.g. the four linear shape functions of FE_Q<2>(1) in 2D) is computed, which gives the vector entries to be summed into the dst vector. Note that the above code needs to explicitly loop over the components in the vectorized array for evaluating the function, which is necessary for interfacing with a generic Function object with double arguments. Simple functions can also be implemented in VectorizedArray form directly as VectorizedArray provides the basic math operations.

For evaluating a bilinear form, the evaluation on a source vector is combined with the integration involving test functions that get written into a result vector. This setting is the context of matrix-free operator evaluation and explained in the step-37 and step-48 tutorial programs.

Note that the two vector accesses through FEEvaluation::read_dof_values and FEEvaluation::distribute_local_to_global resolve constraints on the fly, based on the AffineConstraints object specified at the MatrixFree::reinit() call. In case the values in the degrees of freedom are of interest (usually only the values in quadrature points are necessary), these can be accessed through FEEvaluation::get_dof_value(i), where i is the index of the basis function. Note that the numbering of the degrees of freedom for continuous elements in FEEvaluation is different from the ordering in FE_Q (or FEValues) because FEEvaluation needs to access them in lexicographic order, which is the ordering used in FE_DGQ, for instance. Re-indexing would be too expensive because the access inside evaluate() and integrate() is on the critical path in the tensorial evaluation parts. An alternative to filling the DoF values by read_dof_values() before an evaluate() call is to manually assign a value by a set_dof_value() call. Likewise, if the local result of integration should be further processed rather than scattered into a vector by distribute_local_to_global(), one can access it by get_dof_value() after an integrate() call. An example for using the values of an integral in a different context is fast assembly of matrices as shown in the next subsection.

For most operator evaluation tasks that repeatedly go through the mesh, the realization by MatrixFree that combines pre-computed data for the mapping (Jacobian transformations for the geometry description) with on-the-fly evaluation of basis functions is the most efficient way of doing things. In other words, the framework selects a trade-off between memory usage and initialization of objects that is suitable for replacement of matrix-vector products or explicit time integration in a matrix-free way.

Usage without pre-initialized MatrixFree object

The second form of usage is to initialize FEEvaluation from geometry information generated by FEValues. This allows to apply the integration loops on the fly without prior initialization of MatrixFree objects. This can be useful when the memory and initialization cost of MatrixFree is not acceptable, e.g. when a different number of quadrature points should be used for one single evaluation in error computation. Also, when using the routines of this class to assemble matrices the trade-off implied by the MatrixFree class may not be desired. In such a case, the cost to initialize the necessary geometry data on the fly is comparably low and thus avoiding a global object MatrixFree can be useful. When used in this way, reinit methods reminiscent from FEValues with a cell iterator are used. However, note that this model results in working on a single cell at a time, with geometry data duplicated in all components of the vectorized array. Thus, vectorization is only useful when it can apply the same operation on different data, e.g. when performing matrix assembly.

As an example, consider the following code to assemble the contributions to the Laplace matrix:

FEEvaluation<dim,fe_degree> fe_eval (mapping, finite_element,
QGauss<1>(fe_degree+1), flags);
for (cell = dof_handler.begin_active();
cell != dof_handler.end();
++cell)
{
fe_eval.reinit(cell);
for (unsigned int i=0; i<dofs_per_cell;
{
const unsigned int n_items =
(dofs_per_cell - i) :
// Set n_items unit vectors
for (unsigned int j=0; j<dofs_per_cell; ++j)
fe_eval.set_dof_value(VectorizedArray<double>(), j);
for (unsigned int v=0; v<n_items; ++v)
{
one_value[v] = 1.;
fe_eval.set_dof_value(one_value, i+v);
}
// Apply operator on unit vector to generate the next few matrix
// columns
fe_eval.evaluate(true, true);
for (unsigned int q=0; q<n_q_points; ++q)
{
fe_eval.submit_value(10.*fe_eval.get_value(q), q);
fe_eval.submit_gradient(fe_eval.get_gradient(q), q);
}
fe_eval.integrate(true, true);
// Insert computed entries in matrix
for (unsigned int v=0; v<n_items; ++v)
for (unsigned int j=0; j<dofs_per_cell; ++j)
cell_matrix(fe_eval.get_internal_dof_numbering()[j],
fe_eval.get_internal_dof_numbering()[i+v])
= fe_eval.get_dof_value(j)[v];
}
...
}

This code generates the columns of the cell matrix with the loop over i above. The way this is done is the following: FEEvaluation's routines focus on the evaluation of finite element operators, so for computing a cell matrix out of an operator evaluation it is applied to all the unit vectors on the cell. Applying the operator on a unit vector might seem inefficient but the evaluation routines used here are so quick that they still work much faster than what is possible with FEValues. In particular, the complexity is (fe_degree+1)2*dim+1 rather than (fe_degree+1)3*dim .

Due to vectorization, we can generate matrix columns for several unit vectors at a time (e.g. 4). The variable n_items make sure that we do the last iteration where the number of cell dofs is not divisible by the vectorization length correctly. Also note that we need to get the internal dof numbering applied by fe_eval because FEEvaluation internally uses a lexicographic numbering of degrees of freedom as explained above.

Internal data organization

The temporary data for holding the solution values on the local degrees of freedom as well as the interpolated values, gradients, and Hessians on quadrature points is a scratch array provided by MatrixFree::acquire_scratch_data() that is re-used between different calls to FEEvaluation. Therefore, constructing an FEEvaluation object is typically cheap and does not involve any expensive operation. Only a few dozen pointers to the actual data fields are set during construction. Therefore, no negative performance impact arises when creating an FEEvaluation several times per loop, such as at the top of a local_cell_operation operation that is split in small chunks for a parallel for loop, obviating a separate scratch data field for parallel loops as necessary in the loop of WorkStream.

When using the FEEvaluation class in multithreaded mode, the thread local storage of the scratch data in MatrixFree automatically makes sure that each thread gets it private data array. Note, however, that deal.II must be compiled with thread support also when all the thread parallelization is provided externally and not done via deal.II's routines, such as OpenMP. This is because deal.II needs to know the notation of thread local storage. The FEEvaluation kernels have been verified to work within OpenMP loops.

Vectorization scheme through VectorizedArray

This class is designed to perform all arithmetics on single-instruction multiple-data (SIMD) instructions present on modern CPUs by explicit vectorization, which are made available in deal.II through the class VectorizedArray, using the widest vector width available at configure/compile time. In order to keep programs flexible, FEEvaluation always applies vectorization over several elements. This is often the best compromise because computations on different elements are usually independent in the finite element method (except of course the process of adding an integral contribution to a global residual vector), also in more complicated scenarios: Stabilization parameter can e.g. be defined as the maximum of some quantities on all quadrature points of a cell divided by the cell's volume, but without locally mixing the results with neighbors. Using the terminology from computer architecture, the design of FEEvaluation relies on not doing any cross-lane data exchange when operating on the cell in typical integration scenarios.

When the number of cells in the problem is not a multiple of the number of array elements in the SIMD vector, the implementation of FEEvaluation fills in some dummy entries in the unused SIMD lanes and carries them around nonetheless, a choice made necessary since the length of VectorizedArray is fixed at compile time. Yet, this approach most often results in superior code as compared to an auto-vectorization setup where an alternative unvectorized code path would be necessary next to the vectorized version to be used on fully populated lanes, together with a dispatch mechanism. In read_dof_values, the empty lanes resulting from a reinit() call to an incomplete batch of cells are set to zero, whereas distribute_local_to_global or set_dof_values simply ignores the content in the empty lanes. The number of actually filled SIMD lanes can by queried by MatrixFree::n_components_filled().

Obviously, the computations performed on the artificial lanes (without real data) should never be mixed with valid results. The contract in using this class is that the user makes sure that lanes are not crossed in user code, in particular since it is not clear a priori which cells are going to be put together in vectorization. For example, results on an element should not be added to results on other elements except through the global vector access methods or by access that is masked by MatrixFree::n_components_filled(). No guarantee can be made that results on artificial lanes will always be zero that can safely be added to other results: The data on JxW or Jacobians is copied from the last valid lane in order to avoid division by zero that could trigger floating point exceptions or trouble in other situations.

Description of evaluation routines

This class contains specialized evaluation routines for elements based on tensor-product quadrature formulas and tensor-product-like shape functions, including standard FE_Q or FE_DGQ elements and quadrature points symmetric around 0.5 (like Gauss quadrature), FE_DGP elements based on truncated tensor products as well as the faster case of Gauss-Lobatto elements with Gauss-Lobatto quadrature which give diagonal mass matrices and quicker evaluation internally. The main benefit of this class is the evaluation of all shape functions in all quadrature or integration over all shape functions in dim (fe_degree+1)dim+1 operations instead of the slower (fe_degree+1)2*dim complexity in the evaluation routines of FEValues. This is done by an algorithm called sum factorization which factors out constant factors during the evaluation along a coordinate direction. This algorithm is the basis of many spectral element algorithms.

Note that many of the operations available through this class are inherited from the base class FEEvaluationBase, in particular reading from and writing to vectors. Also, the class inherits from FEEvaluationAccess that implements access to values, gradients and Hessians of the finite element function on quadrature points.

This class assumes that the shape functions of the FiniteElement under consideration do not depend on the geometry of the cells in real space. Currently, other finite elements cannot be treated with the matrix-free concept.

Degree of finite element as a compile-time parameter

The default usage model of FEEvaluation expects the polynomial degree to be given as a template parameter. This requirement guarantees maximum efficiency: The sum factorization evaluation performs a number of nested short 1D loops of length equal to the polynomial degree plus one. If the loop bounds are known at compile time, the compiler can unroll loops as deemed most efficient by its heuristics. At least the innermost loop is almost always completely unrolled, avoiding the loop overhead.

However, carrying the polynomial degree (and the number of quadrature points) as a template parameter makes things more complicated in codes where different polynomial degrees should be considered, e.g. in application codes where the polynomial degree is given through an input file. The recommended approach for good performance is to create different cell functions, possibly through different operator classes that are derived from a common base class with virtual functions to access the operator evaluation. The program then keeps only a pointer to the common base class after initializing templated derived classes with fixed polynomial degree that are selected from the detected polynomial degree. This approach requires a-priori knowledge of the range of valid degrees and can lead to rather long compile times in programs with many apply functions.

A flexible choice of the polynomial degree based on the information in the element passed to the initialization is also supported by FEEvaluation, even though it runs two to three times more slowly. For this, set the template parameter for the polynomial degree to -1 (and choose an arbitrary number for the number of quadrature points), which switches to the other code path. Internally, an evaluator object with template specialization for -1 is invoked that runs according to run-time bounds, whereas the default case with compile-time bounds given through fe_degree selects another template class without compromising efficiency.

An overview of the performance of FEEvaluation is given in the following figure. It considers the time spent per degree of freedom for evaluating the Laplacian with continuous finite elements using a code similar to the step-37 tutorial program for single-precision arithmetics. The time is based on an experiment on a single core of an Intel Xeon E5-2687W v4, running at 3.4 GHz and measured at problem sizes around 10 million. The plot lists the computational time (around 0.1 seconds) divided by the number of degrees freedom.

fe_evaluation_laplacian_time_per_dof.png

The figure shows that the templated version is between 2.5 and 3 times faster. The fastest turnaround on this setup is for polynomial degree 5 at 7.4e-9 seconds per degree of freedom or 134 million degrees of freedom per second - on a single core. The non-templated version is also fastest at polynomial degree 5 with 2.1e-9 seconds per degree of freedom or 48 million degrees of freedom per second.

Handling multi-component systems

FEEvaluation also allows for treating vector-valued problems through a template parameter on the number of components:

If used this way, the components can be gathered from several components of an std::vector<VectorType> through the call

phi.read_dof_values(src, 0);

where the 0 means that the vectors starting from the zeroth vector in the std::vector should be used, src[0], src[1], ..., src[n_components-1].

An alternative way for reading multi-component systems is possible if the DoFHandler underlying the MatrixFree data is based on an FESystem of n_components entries. In that case, a single vector is provided for the read_dof_values() and distribute_local_to_global() calls.

An important property of FEEvaluation in multi-component systems is the layout of multiple components in the get_value(), get_gradient(), or get_dof_value() calls. In this case, instead of a scalar return field VectorizedArray<double> a tensor is returned,

In a similar vein, the submit_value() and submit_gradient() calls take tensors of values. Note that there exist specializations for n_components=1 and n_components=dim, which are provided through the base class FEEvaluationAccess. In the scalar case, these provide the scalar return types described above. In the vector-valued case, the gradient is converted from Tensor<1,dim,Tensor<1,dim,VectorizedArray<double> > > to Tensor<2,dim,VectorizedArray<double> >. Furthermore, additional operations such as the diveregence or curl are available.

In case different shape functions are combined, for example mixed finite element formulations in Stokes flow, two FEEvaluation objects are created, one for the velocity and one for the pressure. Those are then combined on quadrature points:

for (unsigned int cell=cell_range.first; cell<cell_range.second; ++cell)
{
velocity.reinit (cell);
velocity.read_dof_values (src.block(0));
velocity.evaluate (false,true);
pressure.reinit (cell);
pressure.read_dof_values (src.block(1));
pressure.evaluate (true,false);
for (unsigned int q=0; q<velocity.n_q_points; ++q)
{
velocity.get_symmetric_gradient (q);
VectorizedArray<double> pres = pressure.get_value(q);
VectorizedArray<double> div = -trace(sym_grad_u);
pressure.submit_value (div, q);
// subtract p * I
for (unsigned int d=0; d<dim; ++d)
sym_grad_u[d][d] -= pres;
velocity.submit_symmetric_gradient(sym_grad_u, q);
}
velocity.integrate (false,true);
velocity.distribute_local_to_global (dst.block(0));
pressure.integrate (true,false);
pressure.distribute_local_to_global (dst.block(1));
}

This code assumes that a BlockVector of two components describes the velocity and pressure components, respectively. For identifying the different DoFHandler objects for velocity and pressure, the second argument to the FEEvaluation objects specify the respective component 0 for velocity and 1 for pressure. For further examples of vector-valued problems, the deal.II test suite includes a few additional examples as well, e.g. the Stokes operator described above is found at https://github.com/dealii/dealii/blob/master/tests/matrix_free/matrix_vector_stokes_noflux.cc

Handling several integration tasks and data storage in quadrature points

The design of FEEvaluation and MatrixFree separates the geometry from the basis functions. Therefore, several DoFHandler objects (or the same DoFHandler equipped with different constraint objects) can share the same geometry information like in the Stokes example above. All geometry is cached once in MatrixFree, so FEEvaluation does not need to do expensive initialization calls and rather sets a few pointers. This realization is based on the idea that the geometry information is needed only once also when several fields are evaluated, in a departure from FEValues which sets up the internal mapping data for each field. If for example a multi-component PDE involves the shape values on one component and the shape gradient on the other, no efficiency is lost if both are based on the same MatrixFree object where the update flags specify that both update_values , update_gradients , and update_jxw_values are given. The selection of desired quantities of shape values is through the flags in the evaluate() or integrate calls and the access at quadrature points:

phi1.evaluate(true, false);
phi2.evaluate(false, true);
for (unsigned int q_index=0; q_index<phi1.n_q_points; ++q_index)
{
VectorizedArray<double> val1 = phi1.get_value(q);
Tensor<1,dim,VectorizedArray<double> > grad2 = phi2.get_gradient(q);
Point<dim,VectorizedArray<double> > point = phi1.get_quadrature_point(q);
// ... some complicated formula combining those three...
}

In the loop over quadrature points, one can ask any of the two FEEvaluation objects — it does not really matter which one because they only keep pointers to the quadrature point data — to provide the quadrature point location.

This observation also translates to the case when different differential operators are implemented in a program, for example the action of a mass matrix for one phase of the algorithm and the action of a stiffness matrix in another one. Only a single MatrixFree object is necessary, maintaining full efficiency by using different local functions with the respective implementation in separate FEEvaluation objects. In other words, a user does not need to bother about being conservative when providing update_flags to the initialization of MatrixFree for efficiency reasons - no overhead incurs inside FEEvaluation, except for at most one or two more if statements inside the FEEvaluation::reinit() call. Rather, the largest set of flags necessary among all calls is perfectly fine from an efficiency point of view.

For the combination of different fields, including different solution vectors that come from different time steps, it is mandatory that all FEEvaluation objects share the same MatrixFree object. This is because the way cells are looped by MatrixFree::cell_loop() can be different for different DoFHandler or AffineConstraints arguments. More precisely, even though the layout is going to be the same in serial, there is no guarantee about the ordering for different DoFHandler/Constraints in the MPI case. The reason is that the algorithm detects cells that need data exchange with MPI and those can change for different elements — FE_Q with hanging node constraints connects to more neighbors than a FE_DGQ element, for instance, and cells which need data exchange are put in different positions inside the cell loop. Of course, if the exact same DoFHandler, AffineConstraints, and options (such as the setting for thread parallelism) are set, then the order is going to be the same because the algorithm is deterministic.

Template Parameters
dimDimension in which this class is to be used
fe_degreeDegree of the tensor product finite element with fe_degree+1 degrees of freedom per coordinate direction. Can be set to -1 if the degree is not known at compile time, but performance will usually be worse by a factor of 2-3.
n_q_points_1dNumber of points in the quadrature formula in 1D, defaults to fe_degree+1
n_componentsNumber of vector components when solving a system of PDEs. If the same operation is applied to several components of a PDE (e.g. a vector Laplace equation), they can be applied simultaneously with one call (and often more efficiently). Defaults to 1.
NumberNumber format, usually double or float. Defaults to double
Author
Katharina Kormann and Martin Kronbichler, 2010, 2011

Definition at line 62 of file fe_evaluation.h.

Member Typedef Documentation

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
using FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::BaseClass = FEEvaluationAccess<dim, n_components_, Number, false>

An alias to the base class.

Definition at line 2152 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
using FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::number_type = Number

An underlying number type specified as template argument.

Definition at line 2157 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
using FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::value_type = typename BaseClass::value_type

The type of function values, e.g. VectorizedArray<Number> for n_components=1 or Tensor<1,dim,VectorizedArray<Number> > for n_components=dim.

Definition at line 2164 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
using FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::gradient_type = typename BaseClass::gradient_type

The type of gradients, e.g. Tensor<1,dim,VectorizedArray<Number>> for n_components=1 or Tensor<2,dim,VectorizedArray<Number> > for n_components=dim.

Definition at line 2171 of file fe_evaluation.h.

Constructor & Destructor Documentation

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::FEEvaluation ( const MatrixFree< dim, Number > &  matrix_free,
const unsigned int  dof_no = 0,
const unsigned int  quad_no = 0,
const unsigned int  first_selected_component = 0 
)

Constructor. Takes all data stored in MatrixFree. If applied to problems with more than one finite element or more than one quadrature formula selected during construction of matrix_free, the appropriate component can be selected by the optional arguments.

Parameters
matrix_freeData object that contains all data
dof_noIf matrix_free was set up with multiple DoFHandler objects, this parameter selects to which DoFHandler/AffineConstraints pair the given evaluator should be attached to.
quad_noIf matrix_free was set up with multiple Quadrature objects, this parameter selects the appropriate number of the quadrature formula.
first_selected_componentIf the dof_handler selected by dof_no uses an FESystem consisting of more than one component, this parameter allows for selecting the component where the current evaluation routine should start. Note that one evaluator does not support combining different shape functions in different components. In other words, the same base element of a FESystem needs to be set for the components between first_selected_component and first_selected_component+n_components_.
template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::FEEvaluation ( const Mapping< dim > &  mapping,
const FiniteElement< dim > &  fe,
const Quadrature< 1 > &  quadrature,
const UpdateFlags  update_flags,
const unsigned int  first_selected_component = 0 
)

Constructor that comes with reduced functionality and works similar as FEValues. The arguments are similar to the ones passed to the constructor of FEValues, with the notable difference that FEEvaluation expects a one- dimensional quadrature formula, Quadrature<1>, instead of a dim dimensional one. The finite element can be both scalar or vector valued, but this method always only selects a scalar base element at a time (with n_components copies as specified by the class template). For vector- valued elements, the optional argument first_selected_component allows to specify the index of the base element to be used for evaluation. Note that the internal data structures always assume that the base element is primitive, non-primitive are not supported currently.

As known from FEValues, a call to the reinit method with a Triangulation<dim>::cell_iterator is necessary to make the geometry and degrees of freedom of the current class known. If the iterator includes DoFHandler information (i.e., it is a DoFHandler<dim>::cell_iterator or similar), the initialization allows to also read from or write to vectors in the standard way for DoFHandler<dim>::active_cell_iterator types for one cell at a time. However, this approach is much slower than the path with MatrixFree with MPI since index translation has to be done. As only one cell at a time is used, this method does not vectorize over several elements (which is most efficient for vector operations), but only possibly within the element if the evaluate/integrate routines are combined inside user code (e.g. for computing cell matrices).

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::FEEvaluation ( const FiniteElement< dim > &  fe,
const Quadrature< 1 > &  quadrature,
const UpdateFlags  update_flags,
const unsigned int  first_selected_component = 0 
)

Constructor for the reduced functionality. This constructor is equivalent to the other one except that it makes the object use a \(Q_1\) mapping (i.e., an object of type MappingQGeneric(1)) implicitly.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
template<int n_components_other>
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::FEEvaluation ( const FiniteElement< dim > &  fe,
const FEEvaluationBase< dim, n_components_other, Number > &  other,
const unsigned int  first_selected_component = 0 
)

Constructor for the reduced functionality. Similar to the other constructor with FiniteElement argument but using another FEEvaluationBase object to provide information about the geometry. This allows several FEEvaluation objects to share the geometry evaluation, i.e., the underlying mapping and quadrature points do only need to be evaluated once. Make sure to not pass an optional object around when you intend to use the FEEvaluation object in parallel to the given one because otherwise the intended sharing may create race conditions.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::FEEvaluation ( const FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number > &  other)

Copy constructor. If FEEvaluationBase was constructed from a mapping, fe, quadrature, and update flags, the underlying geometry evaluation based on FEValues will be deep-copied in order to allow for using in parallel with threads.

Member Function Documentation

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
FEEvaluation& FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::operator= ( const FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number > &  other)

Copy assignment operator. If FEEvaluationBase was constructed from a mapping, fe, quadrature, and update flags, the underlying geometry evaluation based on FEValues will be deep-copied in order to allow for using in parallel with threads.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::reinit ( const unsigned int  cell_batch_index)

Initialize the operation pointer to the current cell batch index. Unlike the reinit functions taking a cell iterator as argument below and the FEValues::reinit() methods, where the information related to a particular cell is generated in the reinit call, this function is very cheap since all data is pre-computed in matrix_free, and only a few indices have to be set appropriately.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
template<typename DoFHandlerType , bool level_dof_access>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::reinit ( const TriaIterator< DoFCellAccessor< DoFHandlerType, level_dof_access >> &  cell)

Initialize the data to the current cell using a TriaIterator object as usual in FEValues. The argument is either of type DoFHandler::active_cell_iterator or DoFHandler::level_cell_iterator. This option is only available if the FEEvaluation object was created with a finite element, quadrature formula and correct update flags and without a MatrixFree object. This initialization method loses the ability to use vectorization, see also the description of the FEEvaluation class. When this reinit method is used, FEEvaluation can also read from vectors (but less efficient than with data coming from MatrixFree).

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::reinit ( const typename Triangulation< dim >::cell_iterator &  cell)

Initialize the data to the current cell using a TriaIterator object as usual in FEValues. This option is only available if the FEEvaluation object was created with a finite element, quadrature formula and correct update flags and without a MatrixFree object. This initialization method loses the ability to use vectorization, see also the description of the FEEvaluation class. When this reinit method is used, FEEvaluation can not read from vectors because no DoFHandler information is available.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::evaluate ( const bool  evaluate_values,
const bool  evaluate_gradients,
const bool  evaluate_hessians = false 
)

Evaluate the function values, the gradients, and the Hessians of the polynomial interpolation from the DoF values in the input vector to the quadrature points on the unit cell. The function arguments specify which parts shall actually be computed. This function has to be called first so that the access functions get_value(), get_gradient() or get_laplacian give useful information (unless these values have been set manually).

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::evaluate ( const VectorizedArray< Number > *  values_array,
const bool  evaluate_values,
const bool  evaluate_gradients,
const bool  evaluate_hessians = false 
)

Evaluate the function values, the gradients, and the Hessians of the polynomial interpolation from the DoF values in the input array values_array to the quadrature points on the unit cell. If multiple components are involved in the current FEEvaluation object, the sorting in values_array is such that all degrees of freedom for the first component come first, then all degrees of freedom for the second, and so on. The function arguments specify which parts shall actually be computed. This function has to be called first so that the access functions get_value(), get_gradient() or get_laplacian give useful information (unless these values have been set manually).

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
template<typename VectorType >
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::gather_evaluate ( const VectorType &  input_vector,
const bool  evaluate_values,
const bool  evaluate_gradients,
const bool  evaluate_hessians = false 
)

Read from the input vector and evaluates the function values, the gradients, and the Hessians of the polynomial interpolation of the vector entries from input_vector associated with the current cell to the quadrature points on the unit cell. The function arguments specify which parts shall actually be computed. This function has to be called first so that the access functions get_value(), get_gradient() or get_laplacian give useful information (unless these values have been set manually).

This call is equivalent to calling read_dof_values() followed by evaluate(), but might internally use some additional optimizations.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::integrate ( const bool  integrate_values,
const bool  integrate_gradients 
)

This function takes the values and/or gradients that are stored on quadrature points, tests them by all the basis functions/gradients on the cell and performs the cell integration. The two function arguments integrate_values and integrate_gradients are used to enable/disable summation of the contributions submitted to the values or gradients slots, respectively. The result is written into the internal data field dof_values (that is usually written into the result vector by the distribute_local_to_global() or set_dof_values() methods).

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::integrate ( const bool  integrate_values,
const bool  integrate_gradients,
VectorizedArray< Number > *  values_array 
)

This function takes the values and/or gradients that are stored on quadrature points, tests them by all the basis functions/gradients on the cell and performs the cell integration. The two function arguments integrate_values and integrate_gradients are used to enable/disable summation of the contributions submitted to the values or gradients slots, respectively. As opposed to the other integrate() method, this call stores the result of the testing in the given array values_array, whose previous results is overwritten, rather than writing it on the internal data structures behind begin_dof_values().

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
template<typename VectorType >
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::integrate_scatter ( const bool  integrate_values,
const bool  integrate_gradients,
VectorType &  output_vector 
)

This function takes the values and/or gradients that are stored on quadrature points, tests them by all the basis functions/gradients on the cell, performs the cell integration, and adds the result into the global vector output_vector on the degrees of freedom associated with the present cell index. The two function arguments integrate_values and integrate_gradients are used to enable/disable summation of the contributions submitted to the values or gradients slots, respectively.

This call is equivalent to calling integrate() followed by distribute_local_to_global(), but might internally use some additional optimizations.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
Point<dim, VectorizedArray<Number> > FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::quadrature_point ( const unsigned int  q_point) const

Return the q-th quadrature point in real coordinates stored in MappingInfo.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
void FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::check_template_arguments ( const unsigned int  fe_no,
const unsigned int  first_selected_component 
)
private

Checks if the template arguments regarding degree of the element corresponds to the actual element used at initialization.

Member Data Documentation

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::dimension = dim
static

The dimension given as template argument.

Definition at line 2176 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::n_components = n_components_
static

The number of solution components of the evaluator given as template argument.

Definition at line 2182 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::static_n_q_points
static
Initial value:
=
Utilities::pow(n_q_points_1d, dim)

The static number of quadrature points determined from the given template argument n_q_points_1d. Note that the actual number of quadrature points, n_q_points, can be different if fe_degree=-1 is given and run-time loop lengths are used rather than compile time ones.

Definition at line 2190 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::static_dofs_per_component
static
Initial value:
=
Utilities::pow(fe_degree + 1, dim)

The static number of degrees of freedom of a scalar component determined from the given template argument fe_degree. Note that the actual number of degrees of freedom dofs_per_component can be different if fe_degree=-1 is given or if the underlying is of more complicated type than the usual FE_Q or FE_DGQ ones, such as FE_DGP.

Definition at line 2200 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::tensor_dofs_per_cell
static
Initial value:

The static number of degrees of freedom of all components determined from the given template argument fe_degree. Note that the actual number of degrees of freedom dofs_per_cell can be different if fe_degree=-1 is given or if the underlying is of more complicated type than the usual FE_Q or FE_DGQ ones, such as FE_DGP.

Definition at line 2210 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
constexpr unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::static_dofs_per_cell
static
Initial value:

The static number of degrees of freedom of all components determined from the given template argument fe_degree. Note that the actual number of degrees of freedom dofs_per_cell can be different if fe_degree=-1 is given or if the underlying is of more complicated type than the usual FE_Q or FE_DGQ ones, such as FE_DGP.

Definition at line 2220 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
const unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::dofs_per_component

The number of degrees of freedom of a single component on the cell for the underlying evaluation object. Usually close to static_dofs_per_component, but the number depends on the actual element selected and is thus not static.

Definition at line 2481 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
const unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::dofs_per_cell

The number of degrees of freedom on the cell accumulated over all components in the current evaluation object. Usually close to static_dofs_per_cell = static_dofs_per_component*n_components, but the number depends on the actual element selected and is thus not static.

Definition at line 2489 of file fe_evaluation.h.

template<int dim, int fe_degree, int n_q_points_1d, int n_components_, typename Number>
const unsigned int FEEvaluation< dim, fe_degree, n_q_points_1d, n_components_, Number >::n_q_points

The number of quadrature points in use. If the number of quadrature points in 1d is given as a template, this number is simply the dim-th power of that value. If the element degree is set to -1 (dynamic selection of element degree), the static value of quadrature points is inaccurate and this value must be used instead.

Definition at line 2498 of file fe_evaluation.h.


The documentation for this class was generated from the following file: