283 for (
unsigned int face = range.first; face < range.second; ++face)
286 phi_m.gather_evaluate(src, face_evaluation_flags);
288 phi_p.gather_evaluate(src, face_evaluation_flags);
292 phi_m.integrate_scatter(face_evaluation_flags, dst);
293 phi_p.integrate_scatter(face_evaluation_flags, dst);
296 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
301 for (
unsigned int face = range.first; face < range.second; ++face)
304 phi_m.gather_evaluate(src, face_evaluation_flags);
308 phi_m.integrate_scatter(face_evaluation_flags, dst);
318matrix_free.template loop_cell_centric<VectorType, VectorType>(
319 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
324 for (
unsigned int cell = range.first; cell < range.second; ++cell)
327 phi.gather_evaluate(src, cell_evaluation_flags);
331 phi.integrate_scatter(cell_evaluation_flags, dst);
334 for (
unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
336 if (data.get_faces_by_cells_boundary_id(cell, face)[0] ==
340 phi_m.reinit(cell, face);
341 phi_m.gather_evaluate(src, face_evaluation_flags);
342 phi_p.reinit(cell, face);
343 phi_p.gather_evaluate(src, face_evaluation_flags);
347 phi_m.integrate_scatter(face_evaluation_flags, dst);
352 phi_m.reinit(cell, face);
353 phi_m.gather_evaluate(src, face_evaluation_flags);
357 phi_m.integrate_scatter(face_evaluation_flags, dst);
367the cell number and the local face number. The given example only
368highlights how to
transform face-centric loops into cell-centric loops and
369is by no means efficient, since data is read and written multiple times
370from and to the global vector as well as computations are performed
371redundantly. Below, we will discuss advanced techniques that target these issues.
382additional_data.mapping_update_flags_faces_by_cells =
383 additional_data.mapping_update_flags_inner_faces |
384 additional_data.mapping_update_flags_boundary_faces;
386data.reinit(
mapping, dof_handler, constraint, quadrature, additional_data);
389In particular, these flags enable that the
internal data structures are set up
390for all faces of the cells.
392Currently, cell-centric loops in deal.II only work
for uniformly refined meshes
393and
if no constraints are applied (which is the standard
case DG is normally
397<a name=
"step_76-ProvidinglambdastoMatrixFreeloops"></a><h3>Providing lambdas to
MatrixFree loops</h3>
400The examples given above have already used lambdas, which have been provided to
402a version where a
class and a pointer to
one of its methods are used and a
403variant where lambdas are utilized.
405In the following code, a class and a pointer to
one of its methods, which should
410local_apply_cell(
const MatrixFree<dim, Number> & data,
412 const VectorType & src,
413 const std::pair<unsigned int, unsigned int> &range)
const
415 FEEvaluation<dim, degree, degree + 1, 1, Number> phi(data);
416 for (
unsigned int cell = range.first; cell < range.second; ++cell)
419 phi.gather_evaluate(src, cell_evaluation_flags);
423 phi.integrate_scatter(cell_evaluation_flags, dst);
429matrix_free.cell_loop(&Operator::local_apply_cell,
this, dst, src);
432However, it is also possible to pass an anonymous function via a
lambda function
436matrix_free.template cell_loop<VectorType, VectorType>(
437 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
438 FEEvaluation<dim, degree, degree + 1, 1, Number> phi(data);
439 for (
unsigned int cell = range.first; cell < range.second; ++cell)
442 phi.gather_evaluate(src, cell_evaluation_flags);
446 phi.integrate_scatter(cell_evaluation_flags, dst);
453<a name=
"step_76-VectorizedArrayType"></a><h3>VectorizedArrayType</h3>
456The
class VectorizedArray<Number> is a key component to achieve the high
457node-level performance of the
matrix-free algorithms in deal.II.
458It is a wrapper class around a short vector of @f$n@f$ entries of type Number and
459maps arithmetic operations to appropriate single-instruction/multiple-data
460(SIMD) concepts by intrinsic
functions. The length of the vector can be
461queried by VectorizedArray::size() and its underlying number type by
464In the default case (<code>VectorizedArray<Number></code>), the vector length is
465set at compile time of the library to
466match the highest
value supported by the given processor architecture.
467However, also a second optional template argument can be
468specified as <code>VectorizedArray<Number, size></code>, where <code>size</code> explicitly
469controls the vector length within the capabilities of a particular instruction
470set.
A full list of supported vector lengths is presented in the following table:
472<table align="center" class="doxtable">
479 <td><code>VectorizedArray<double, 1></code></td>
480 <td><code>VectorizedArray<float, 1></code></td>
481 <td>(auto-vectorization)</td>
484 <td><code>VectorizedArray<double, 2></code></td>
485 <td><code>VectorizedArray<float, 4></code></td>
486 <td>SSE2/AltiVec</td>
489 <td><code>VectorizedArray<double, 4></code></td>
490 <td><code>VectorizedArray<float, 8></code></td>
494 <td><code>VectorizedArray<double, 8></code></td>
495 <td><code>VectorizedArray<float, 16></code></td>
500This allows users to select the vector length/ISA and, as a consequence, the
501number of cells to be processed at once in
matrix-free operator evaluations,
502possibly reducing the pressure on the caches, an severe issue for very high
503degrees (and dimensions).
505A possible further reason to
reduce the number of filled lanes
506is to simplify debugging: instead of having to look at,
e.g., 8
507cells,
one can concentrate on a single cell.
509The interface of VectorizedArray also enables the replacement by any type with
510a
matching interface. Specifically, this prepares deal.II for the <code>std::simd</code>
511class that is planned to become part of the
C++23 standard. The following table
512compares the deal.II-specific SIMD classes and the equivalent
C++23 classes:
515<table align="center" class="doxtable">
517 <th>VectorizedArray (deal.II)</th>
518 <th>std::simd (C++23)</th>
521 <td><code>VectorizedArray<Number></code></td>
522 <td><code>std::experimental::native_simd<Number></code></td>
525 <td><code>VectorizedArray<Number, size></code></td>
526 <td><code>std::experimental::fixed_size_simd<Number, size></code></td>
531 * <a name="step_76-CommProg"></a>
532 * <h1> The commented program</h1>
535 * <a name="step_76-Parametersandutilityfunctions"></a>
536 * <h3>Parameters and utility
functions</h3>
540 * The same includes as in @ref step_67 "step-67":
543 * #include <deal.II/base/conditional_ostream.h>
544 * #include <deal.II/base/function.h>
545 * #include <deal.II/base/time_stepping.h>
546 * #include <deal.II/base/timer.h>
547 * #include <deal.II/base/utilities.h>
548 * #include <deal.II/base/vectorization.h>
550 * #include <deal.II/distributed/tria.h>
552 * #include <deal.II/dofs/dof_handler.h>
554 * #include <deal.II/fe/fe_dgq.h>
555 * #include <deal.II/fe/fe_system.h>
557 * #include <deal.II/grid/grid_generator.h>
558 * #include <deal.II/grid/tria.h>
559 * #include <deal.II/grid/tria_accessor.h>
560 * #include <deal.II/grid/tria_iterator.h>
562 * #include <deal.II/lac/affine_constraints.h>
563 * #include <deal.II/lac/la_parallel_vector.h>
565 * #include <deal.II/matrix_free/fe_evaluation.h>
566 * #include <deal.II/matrix_free/matrix_free.h>
567 * #include <deal.II/matrix_free/operators.h>
569 * #include <deal.II/numerics/data_out.h>
573 * #include <iostream>
577 *
A new include for categorizing of cells according to their boundary IDs:
580 * #include <deal.II/matrix_free/tools.h>
586 *
using namespace dealii;
590 * The same input parameters as in @ref step_67
"step-67":
593 *
constexpr unsigned int testcase = 1;
595 *
constexpr unsigned int n_global_refinements = 2;
596 *
constexpr unsigned int fe_degree = 5;
597 *
constexpr unsigned int n_q_points_1d = fe_degree + 2;
601 * This parameter specifies the size of the shared-memory
group. Currently,
603 * to the options that the memory features can be turned off or all processes
604 * having access to the same shared-memory domain are grouped together.
609 *
using Number = double;
613 * Here, the type of the data structure is chosen
for vectorization. In the
614 *
default case, VectorizedArray<Number> is used, i.e., the highest
615 * instruction-set-architecture extension available on the given hardware with
616 * the maximum number of vector lanes is used. However,
one might
reduce
617 * the number of filled lanes,
e.g., by writing
618 * <code>
using VectorizedArrayType = VectorizedArray<Number, 4></code> to only
622 *
using VectorizedArrayType = VectorizedArray<Number>;
626 * The following parameters have not changed:
629 *
constexpr double gamma = 1.4;
630 *
constexpr double final_time = testcase == 0 ? 10 : 2.0;
631 *
constexpr double output_tick = testcase == 0 ? 1 : 0.05;
633 *
const double courant_number = 0.15 /
std::pow(fe_degree, 1.5);
637 * Specify
max number of time steps useful
for performance studies.
644 * Runge-Kutta-related
functions copied from @ref step_67
"step-67" and slightly modified
645 * with the purpose to minimize global vector access:
648 *
enum LowStorageRungeKuttaScheme
655 *
constexpr LowStorageRungeKuttaScheme lsrk_scheme = stage_5_order_4;
659 *
class LowStorageRungeKuttaIntegrator
662 * LowStorageRungeKuttaIntegrator(
const LowStorageRungeKuttaScheme scheme)
667 *
case stage_3_order_3:
670 *
case stage_5_order_4:
673 *
case stage_7_order_4:
676 *
case stage_9_order_5:
683 * TimeStepping::LowStorageRungeKutta<
684 * LinearAlgebra::distributed::Vector<Number>>
685 * rk_integrator(lsrk);
686 * std::vector<double> ci;
687 * rk_integrator.get_coefficients(ai, bi, ci);
690 *
unsigned int n_stages() const
695 *
template <
typename VectorType,
typename Operator>
696 *
void perform_time_step(
const Operator &pde_operator,
697 *
const double current_time,
698 *
const double time_step,
699 * VectorType &solution,
700 * VectorType &vec_ri,
701 * VectorType &vec_ki)
const
705 * vec_ki.swap(solution);
707 *
double sum_previous_bi = 0;
708 *
for (
unsigned int stage = 0; stage < bi.size(); ++stage)
710 *
const double c_i = stage == 0 ? 0 : sum_previous_bi + ai[stage - 1];
712 * pde_operator.perform_stage(stage,
713 * current_time + c_i * time_step,
714 * bi[stage] * time_step,
715 * (stage == bi.size() - 1 ?
717 * ai[stage] * time_step),
718 * (stage % 2 == 0 ? vec_ki : vec_ri),
719 * (stage % 2 == 0 ? vec_ri : vec_ki),
723 * sum_previous_bi += bi[stage - 1];
728 * std::vector<double> bi;
729 * std::vector<double> ai;
735 * Euler-specific utility
functions from @ref step_67
"step-67":
738 *
enum EulerNumericalFlux
740 * lax_friedrichs_modified,
741 * harten_lax_vanleer,
743 *
constexpr EulerNumericalFlux numerical_flux_type = lax_friedrichs_modified;
748 *
class ExactSolution :
public Function<dim>
751 * ExactSolution(
const double time)
752 * : Function<dim>(dim + 2, time)
755 *
virtual double value(
const Point<dim> &p,
756 *
const unsigned int component = 0)
const override;
762 *
double ExactSolution<dim>::value(
const Point<dim> &x,
763 *
const unsigned int component)
const
765 *
const double t = this->
get_time();
771 *
Assert(dim == 2, ExcNotImplemented());
772 *
const double beta = 5;
776 *
const double radius_sqr =
777 * (x - x0).norm_square() - 2. * (x[0] - x0[0]) * t + t * t;
778 *
const double factor =
780 *
const double density_log = std::log2(
781 *
std::abs(1. - (gamma - 1.) / gamma * 0.25 * factor * factor));
782 *
const double density = std::exp2(density_log * (1. / (gamma - 1.)));
783 *
const double u = 1. - factor * (x[1] - x0[1]);
784 *
const double v = factor * (x[0] - t - x0[0]);
786 *
if (component == 0)
788 *
else if (component == 1)
789 *
return density * u;
790 *
else if (component == 2)
791 *
return density * v;
794 *
const double pressure =
795 * std::exp2(density_log * (gamma / (gamma - 1.)));
796 *
return pressure / (
gamma - 1.) +
797 * 0.5 * (density * u * u + density * v * v);
803 *
if (component == 0)
805 *
else if (component == 1)
807 *
else if (component == dim + 1)
808 *
return 3.097857142857143;
821 *
template <
int dim,
typename Number>
823 * Tensor<1, dim, Number>
824 * euler_velocity(
const Tensor<1, dim + 2, Number> &conserved_variables)
826 *
const Number inverse_density = Number(1.) / conserved_variables[0];
828 * Tensor<1, dim, Number> velocity;
829 *
for (
unsigned int d = 0;
d < dim; ++
d)
830 * velocity[d] = conserved_variables[1 + d] * inverse_density;
835 *
template <
int dim,
typename Number>
838 * euler_pressure(
const Tensor<1, dim + 2, Number> &conserved_variables)
840 *
const Tensor<1, dim, Number> velocity =
841 * euler_velocity<dim>(conserved_variables);
843 * Number rho_u_dot_u = conserved_variables[1] * velocity[0];
844 *
for (
unsigned int d = 1;
d < dim; ++
d)
845 * rho_u_dot_u += conserved_variables[1 + d] * velocity[d];
847 *
return (gamma - 1.) * (conserved_variables[dim + 1] - 0.5 * rho_u_dot_u);
850 *
template <
int dim,
typename Number>
852 * Tensor<1, dim + 2, Tensor<1, dim, Number>>
853 * euler_flux(
const Tensor<1, dim + 2, Number> &conserved_variables)
855 *
const Tensor<1, dim, Number> velocity =
856 * euler_velocity<dim>(conserved_variables);
857 *
const Number pressure = euler_pressure<dim>(conserved_variables);
859 * Tensor<1, dim + 2, Tensor<1, dim, Number>> flux;
860 *
for (
unsigned int d = 0;
d < dim; ++
d)
862 * flux[0][
d] = conserved_variables[1 +
d];
863 *
for (
unsigned int e = 0;
e < dim; ++
e)
864 * flux[e + 1][d] = conserved_variables[e + 1] * velocity[d];
865 * flux[
d + 1][
d] += pressure;
867 * velocity[
d] * (conserved_variables[dim + 1] + pressure);
873 *
template <
int n_components,
int dim,
typename Number>
875 * Tensor<1, n_components, Number>
876 *
operator*(
const Tensor<1, n_components, Tensor<1, dim, Number>> &matrix,
877 *
const Tensor<1, dim, Number> &vector)
879 * Tensor<1, n_components, Number> result;
880 *
for (
unsigned int d = 0;
d < n_components; ++
d)
881 * result[d] = matrix[d] * vector;
885 *
template <
int dim,
typename Number>
887 * Tensor<1, dim + 2, Number>
888 * euler_numerical_flux(
const Tensor<1, dim + 2, Number> &u_m,
889 *
const Tensor<1, dim + 2, Number> &u_p,
890 *
const Tensor<1, dim, Number> &normal)
892 *
const auto velocity_m = euler_velocity<dim>(u_m);
893 *
const auto velocity_p = euler_velocity<dim>(u_p);
895 *
const auto pressure_m = euler_pressure<dim>(u_m);
896 *
const auto pressure_p = euler_pressure<dim>(u_p);
898 *
const auto flux_m = euler_flux<dim>(u_m);
899 *
const auto flux_p = euler_flux<dim>(u_p);
901 *
switch (numerical_flux_type)
903 *
case lax_friedrichs_modified:
907 * gamma * pressure_p * (1. / u_p[0]),
908 * velocity_m.norm_square() +
909 * gamma * pressure_m * (1. / u_m[0])));
911 *
return 0.5 * (flux_m * normal + flux_p * normal) +
912 * 0.5 * lambda * (u_m - u_p);
915 *
case harten_lax_vanleer:
917 *
const auto avg_velocity_normal =
918 * 0.5 * ((velocity_m + velocity_p) * normal);
921 * (pressure_p * (1. / u_p[0]) + pressure_m * (1. / u_m[0]))));
922 *
const Number s_pos =
923 *
std::max(Number(), avg_velocity_normal + avg_c);
924 *
const Number s_neg =
925 *
std::min(Number(), avg_velocity_normal - avg_c);
926 *
const Number inverse_s = Number(1.) / (s_pos - s_neg);
929 * ((s_pos * (flux_m * normal) - s_neg * (flux_p * normal)) -
930 * s_pos * s_neg * (u_m - u_p));
945 * General-purpose utility
functions from @ref step_67
"step-67":
948 *
template <
int dim,
typename VectorizedArrayType>
949 * VectorizedArrayType
950 * evaluate_function(
const Function<dim> &function,
951 *
const Point<dim, VectorizedArrayType> &p_vectorized,
952 *
const unsigned int component)
954 * VectorizedArrayType result;
955 *
for (
unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
958 *
for (
unsigned int d = 0;
d < dim; ++
d)
959 * p[d] = p_vectorized[d][v];
960 * result[v] = function.value(p, component);
966 *
template <
int dim,
typename VectorizedArrayType,
int n_components = dim + 2>
967 * Tensor<1, n_components, VectorizedArrayType>
968 * evaluate_function(
const Function<dim> &function,
969 *
const Point<dim, VectorizedArrayType> &p_vectorized)
972 * Tensor<1, n_components, VectorizedArrayType> result;
973 *
for (
unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
976 *
for (
unsigned int d = 0;
d < dim; ++
d)
977 * p[d] = p_vectorized[d][v];
978 *
for (
unsigned int d = 0;
d < n_components; ++
d)
979 * result[d][v] = function.value(p, d);
988 * <a name=
"step_76-EuleroperatorusingacellcentricloopandMPI30sharedmemory"></a>
989 * <h3>Euler
operator using a cell-centric
loop and
MPI-3.0 shared memory</h3>
993 * Euler
operator from @ref step_67
"step-67" with some changes as detailed below:
996 *
template <
int dim,
int degree,
int n_po
ints_1d>
997 *
class EulerOperator
1000 *
static constexpr unsigned int n_quadrature_points_1d = n_points_1d;
1002 * EulerOperator(TimerOutput &timer_output);
1007 *
const DoFHandler<dim> &dof_handler);
1010 * std::unique_ptr<Function<dim>> inflow_function);
1012 *
void set_subsonic_outflow_boundary(
1014 * std::unique_ptr<Function<dim>> outflow_energy);
1018 *
void set_body_force(std::unique_ptr<Function<dim>> body_force);
1021 * perform_stage(
const unsigned int stage,
1022 *
const Number cur_time,
1025 *
const LinearAlgebra::distributed::Vector<Number> ¤t_ri,
1026 * LinearAlgebra::distributed::Vector<Number> &vec_ki,
1027 * LinearAlgebra::distributed::Vector<Number> &solution)
const;
1029 *
void project(
const Function<dim> &function,
1030 * LinearAlgebra::distributed::Vector<Number> &solution)
const;
1032 * std::array<double, 3> compute_errors(
1033 *
const Function<dim> &function,
1034 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const;
1036 *
double compute_cell_transport_speed(
1037 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const;
1040 * initialize_vector(LinearAlgebra::distributed::Vector<Number> &vector)
const;
1045 * Instance of SubCommunicatorWrapper containing the sub-communicator, which
1047 * shared-memory capabilities:
1050 * MPI_Comm subcommunicator;
1052 * MatrixFree<dim, Number, VectorizedArrayType> data;
1054 * TimerOutput &timer;
1056 * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1057 * inflow_boundaries;
1058 * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1059 * subsonic_outflow_boundaries;
1060 * std::set<types::boundary_id> wall_boundaries;
1061 * std::unique_ptr<Function<dim>> body_force;
1068 * New constructor, which creates a sub-communicator. The user can specify
1069 * the size of the sub-communicator via the global parameter group_size. If
1070 * the size is set to -1, all
MPI processes of a
1071 * shared-memory domain are combined to a group. The specified size is
1072 * decisive for the benefit of the shared-memory capabilities of MatrixFree
1073 * and, therefore, setting the <code>size</code> to <code>-1</code> is a
1074 * reasonable choice. By setting, the size to <code>1</code> users explicitly
1075 * disable the
MPI-3.0 shared-memory features of MatrixFree and rely
1076 * completely on
MPI-2.0 features, like <code>MPI_Isend</code> and
1077 * <code>MPI_Irecv</code>.
1080 * template <
int dim,
int degree,
int n_points_1d>
1081 * EulerOperator<dim, degree, n_points_1d>::EulerOperator(TimerOutput &timer)
1084 * #ifdef DEAL_II_WITH_MPI
1085 *
if (group_size == 1)
1087 * this->subcommunicator = MPI_COMM_SELF;
1093 * MPI_Comm_split_type(MPI_COMM_WORLD,
1094 * MPI_COMM_TYPE_SHARED,
1097 * &subcommunicator);
1104 * (
void)subcommunicator;
1106 * this->subcommunicator = MPI_COMM_SELF;
1113 * New destructor responsible
for freeing of the sub-communicator.
1116 *
template <
int dim,
int degree,
int n_po
ints_1d>
1117 * EulerOperator<dim, degree, n_points_1d>::~EulerOperator()
1119 * #ifdef DEAL_II_WITH_MPI
1120 *
if (this->subcommunicator != MPI_COMM_SELF)
1121 * MPI_Comm_free(&subcommunicator);
1128 * Modified
reinit() function to set up the internal data structures in
1129 * MatrixFree in a way that it is usable by the cell-centric loops and
1130 * the
MPI-3.0 shared-memory capabilities are used:
1133 * template <
int dim,
int degree,
int n_points_1d>
1134 *
void EulerOperator<dim, degree, n_points_1d>::reinit(
1135 * const Mapping<dim> &
mapping,
1136 * const DoFHandler<dim> &dof_handler)
1138 *
const std::vector<const DoFHandler<dim> *> dof_handlers = {&dof_handler};
1139 *
const AffineConstraints<double> dummy;
1140 *
const std::vector<const AffineConstraints<double> *> constraints = {&dummy};
1141 *
const std::vector<Quadrature<1>> quadratures = {QGauss<1>(n_q_points_1d),
1142 * QGauss<1>(fe_degree + 1)};
1144 *
typename MatrixFree<dim, Number, VectorizedArrayType>::AdditionalData
1146 * additional_data.mapping_update_flags =
1149 * additional_data.mapping_update_flags_inner_faces =
1152 * additional_data.mapping_update_flags_boundary_faces =
1155 * additional_data.tasks_parallel_scheme =
1156 * MatrixFree<dim, Number, VectorizedArrayType>::AdditionalData::none;
1160 * Categorize cells so that all lanes have the same boundary IDs
for each
1161 * face. This is strictly not necessary, however, allows to write simpler
1162 * code in EulerOperator::perform_stage() without masking, since it is
1163 * guaranteed that all cells grouped together (in a VectorizedArray)
1164 * have to perform exactly the same operation also on the faces.
1167 * MatrixFreeTools::categorize_by_boundary_ids(dof_handler.get_triangulation(),
1172 * Enable
MPI-3.0 shared-memory capabilities within MatrixFree by providing
1173 * the sub-communicator:
1176 * additional_data.communicator_sm = subcommunicator;
1179 *
mapping, dof_handlers, constraints, quadratures, additional_data);
1185 * The following function does an entire stage of a Runge--Kutta update
1186 * and is, alongside the slightly modified setup, the heart of this tutorial
1187 * compared to @ref step_67
"step-67".
1191 * In contrast to @ref step_67
"step-67", we are not executing the advection step
1192 * (using MatrixFree::loop()) and the inverse mass-matrix step
1193 * (using MatrixFree::cell_loop()) in sequence, but evaluate everything in
1194 * one go inside of MatrixFree::loop_cell_centric(). This function expects
1195 * a single function that is executed on each locally-owned (macro) cell as
1196 * parameter so that we need to loop over all faces of that cell and perform
1197 * needed integration steps on our own.
1201 * The following function contains to a large extent copies of the following
1202 * functions from @ref step_67
"step-67" so that comments related the evaluation of the weak
1203 * form are skipped here:
1204 * - <code>EulerDG::EulerOperator::local_apply_cell</code>
1205 * - <code>EulerDG::EulerOperator::local_apply_face</code>
1206 * - <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1207 * - <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix</code>
1210 * template <
int dim,
int degree,
int n_points_1d>
1211 *
void EulerOperator<dim, degree, n_points_1d>::perform_stage(
1212 * const
unsigned int stage,
1213 * const Number current_time,
1216 * const LinearAlgebra::distributed::Vector<Number> ¤t_ri,
1217 * LinearAlgebra::distributed::Vector<Number> &vec_ki,
1218 * LinearAlgebra::distributed::Vector<Number> &solution) const
1220 *
for (
auto &i : inflow_boundaries)
1221 * i.second->set_time(current_time);
1222 * for (
auto &i : subsonic_outflow_boundaries)
1223 * i.second->set_time(current_time);
1228 * providing a lambda containing the effects of the cell, face and
1229 * boundary-face integrals:
1232 * data.template loop_cell_centric<LinearAlgebra::distributed::Vector<Number>,
1233 * LinearAlgebra::distributed::Vector<Number>>(
1234 * [&](
const auto &data,
auto &dst,
const auto &src,
const auto cell_range) {
1235 * using FECellIntegral = FEEvaluation<dim,
1240 * VectorizedArrayType>;
1241 * using FEFaceIntegral = FEFaceEvaluation<dim,
1246 * VectorizedArrayType>;
1248 * FECellIntegral phi(data);
1249 * FECellIntegral phi_temp(data);
1250 * FEFaceIntegral phi_m(data, true);
1251 * FEFaceIntegral phi_p(data, false);
1253 * Tensor<1, dim, VectorizedArrayType> constant_body_force;
1254 * const Functions::ConstantFunction<dim> *constant_function =
1255 * dynamic_cast<Functions::ConstantFunction<dim> *>(body_force.get());
1257 * if (constant_function)
1258 * constant_body_force =
1259 * evaluate_function<dim, VectorizedArrayType, dim>(
1260 * *constant_function, Point<dim, VectorizedArrayType>());
1262 * const internal::EvaluatorTensorProduct<
1263 * internal::EvaluatorVariant::evaluate_evenodd,
1267 * VectorizedArrayType,
1270 * data.get_shape_info().data[0].shape_gradients_collocation_eo,
1273 * AlignedVector<VectorizedArrayType> buffer(phi.static_n_q_points *
1274 * phi.n_components);
1278 * Loop over all cell batches:
1281 * for (
unsigned int cell = cell_range.first; cell < cell_range.second;
1286 * if (ai != Number())
1287 * phi_temp.reinit(cell);
1291 * Read values from global vector and compute the values at the
1292 * quadrature points:
1295 * if (ai != Number() && stage == 0)
1297 * phi.read_dof_values(src);
1299 * for (unsigned int i = 0;
1300 * i < phi.static_dofs_per_component * (dim + 2);
1302 * phi_temp.begin_dof_values()[i] = phi.begin_dof_values()[i];
1304 * phi.evaluate(EvaluationFlags::values);
1308 * phi.gather_evaluate(src, EvaluationFlags::values);
1313 * Buffer the computed values at the quadrature points, since
1315 * step, however, are needed later on
for the face integrals:
1318 * for (
unsigned int i = 0; i < phi.static_n_q_points * (dim + 2); ++i)
1319 * buffer[i] = phi.begin_values()[i];
1323 * Apply the cell integral at the cell quadrature points. See also
1324 * the function <code>EulerOperator::local_apply_cell()</code> from
1325 * @ref step_67
"step-67":
1328 *
for (
const unsigned int q : phi.quadrature_point_indices())
1330 *
const auto w_q = phi.get_value(q);
1331 * phi.submit_gradient(euler_flux<dim>(w_q), q);
1332 *
if (body_force.get() !=
nullptr)
1334 *
const Tensor<1, dim, VectorizedArrayType> force =
1335 * constant_function ?
1336 * constant_body_force :
1337 * evaluate_function<dim, VectorizedArrayType, dim>(
1338 * *body_force, phi.quadrature_point(q));
1340 * Tensor<1, dim + 2, VectorizedArrayType> forcing;
1341 *
for (
unsigned int d = 0;
d < dim; ++
d)
1342 * forcing[d + 1] = w_q[0] * force[d];
1343 *
for (
unsigned int d = 0;
d < dim; ++
d)
1344 * forcing[dim + 1] += force[d] * w_q[d + 1];
1346 * phi.submit_value(forcing, q);
1353 * points. We skip the interpolation back to the support points
1354 * of the element, since we first collect all contributions in the
1355 * cell quadrature points and only perform the interpolation back
1356 * as the
final step.
1360 *
auto *values_ptr = phi.begin_values();
1361 *
auto *gradient_ptr = phi.begin_gradients();
1363 *
for (
unsigned int c = 0; c < dim + 2; ++c)
1365 *
if (dim >= 1 && body_force.get() ==
nullptr)
1366 * eval.template gradients<0, false, false, dim>(gradient_ptr,
1368 *
else if (dim >= 1)
1369 * eval.template gradients<0, false, true, dim>(gradient_ptr,
1372 * eval.template gradients<1, false, true, dim>(gradient_ptr +
1376 * eval.template gradients<2, false, true, dim>(gradient_ptr +
1380 * values_ptr += phi.static_n_q_points;
1381 * gradient_ptr += phi.static_n_q_points * dim;
1387 * Loop over all faces of the current cell:
1390 *
for (
unsigned int face = 0;
1391 * face < GeometryInfo<dim>::faces_per_cell;
1396 * Determine the boundary ID of the current face. Since we have
1397 * set up MatrixFree in a way that all filled lanes have
1398 * guaranteed the same boundary ID, we can select the
1399 * boundary ID of the first lane.
1402 *
const auto boundary_ids =
1403 * data.get_faces_by_cells_boundary_id(cell, face);
1405 *
Assert(std::equal(boundary_ids.begin(),
1406 * boundary_ids.begin() +
1407 * data.n_active_entries_per_cell_batch(cell),
1408 * boundary_ids.begin()),
1409 * ExcMessage(
"Boundary IDs of lanes differ."));
1413 * phi_m.reinit(cell, face);
1417 * Interpolate the
values from the cell quadrature points to the
1418 * quadrature points of the current face via a simple 1
d
1422 * internal::FEFaceNormalEvaluationImpl<dim,
1424 * VectorizedArrayType>::
1425 *
template interpolate_quadrature<true, false>(
1428 * data.get_shape_info(),
1430 * phi_m.begin_values(),
1435 * Check
if the face is an internal or a boundary face and
1436 * select a different code path based on
this information:
1443 * Process and internal face. The following lines of code
1444 * are a
copy of the function
1445 * <code>EulerDG::EulerOperator::local_apply_face</code>
1446 * from @ref step_67
"step-67":
1449 * phi_p.reinit(cell, face);
1452 *
for (
const unsigned int q :
1453 * phi_m.quadrature_point_indices())
1455 *
const auto numerical_flux =
1456 * euler_numerical_flux<dim>(phi_m.get_value(q),
1457 * phi_p.get_value(q),
1458 * phi_m.normal_vector(q));
1459 * phi_m.submit_value(-numerical_flux, q);
1466 * Process a boundary face. These following lines of code
1467 * are a
copy of the function
1468 * <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1469 * from @ref step_67
"step-67":
1472 *
for (
const unsigned int q :
1473 * phi_m.quadrature_point_indices())
1475 *
const auto w_m = phi_m.get_value(q);
1476 *
const auto normal = phi_m.normal_vector(q);
1478 *
auto rho_u_dot_n = w_m[1] * normal[0];
1479 *
for (
unsigned int d = 1;
d < dim; ++
d)
1480 * rho_u_dot_n += w_m[1 + d] * normal[d];
1482 *
bool at_outflow =
false;
1484 * Tensor<1, dim + 2, VectorizedArrayType> w_p;
1486 *
if (wall_boundaries.find(boundary_id) !=
1487 * wall_boundaries.end())
1490 *
for (
unsigned int d = 0;
d < dim; ++
d)
1492 * w_m[d + 1] - 2. * rho_u_dot_n * normal[d];
1493 * w_p[dim + 1] = w_m[dim + 1];
1495 *
else if (inflow_boundaries.find(boundary_id) !=
1496 * inflow_boundaries.end())
1497 * w_p = evaluate_function(
1498 * *inflow_boundaries.find(boundary_id)->second,
1499 * phi_m.quadrature_point(q));
1500 *
else if (subsonic_outflow_boundaries.find(
1502 * subsonic_outflow_boundaries.end())
1506 * evaluate_function(*subsonic_outflow_boundaries
1507 * .find(boundary_id)
1509 * phi_m.quadrature_point(q),
1511 * at_outflow =
true;
1516 *
"Unknown boundary id, did "
1517 *
"you set a boundary condition for "
1518 *
"this part of the domain boundary?"));
1520 *
auto flux = euler_numerical_flux<dim>(w_m, w_p, normal);
1523 *
for (
unsigned int v = 0;
1524 * v < VectorizedArrayType::size();
1527 *
if (rho_u_dot_n[v] < -1e-12)
1528 *
for (
unsigned int d = 0;
d < dim; ++
d)
1529 * flux[d + 1][v] = 0.;
1532 * phi_m.submit_value(-flux, q);
1538 * Evaluate local integrals related to cell by quadrature and
1539 * add into cell contribution via a simple 1
d interpolation:
1542 * internal::FEFaceNormalEvaluationImpl<dim,
1544 * VectorizedArrayType>::
1545 *
template interpolate_quadrature<false, true>(
1548 * data.get_shape_info(),
1549 * phi_m.begin_values(),
1550 * phi.begin_values(),
1556 * Apply inverse mass
matrix in the cell quadrature points. See
1558 * <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix()</code>
1559 * from @ref step_67
"step-67":
1562 *
for (
unsigned int q = 0; q < phi.static_n_q_points; ++q)
1564 *
const auto factor = VectorizedArrayType(1.0) / phi.JxW(q);
1565 *
for (
unsigned int c = 0; c < dim + 2; ++c)
1566 * phi.begin_values()[c * phi.static_n_q_points + q] =
1567 * phi.begin_values()[c * phi.static_n_q_points + q] * factor;
1572 * Transform
values from collocation space to the original
1573 * Gauss-Lobatto space:
1576 * internal::FEEvaluationImplBasisChange<
1581 * n_points_1d>::do_backward(dim + 2,
1582 * data.get_shape_info()
1584 * .inverse_shape_values_eo,
1586 * phi.begin_values(),
1587 * phi.begin_dof_values());
1591 * Perform Runge-Kutta update and write results back to global
1595 *
if (ai == Number())
1597 *
for (
unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1598 * phi.begin_dof_values()[q] = bi * phi.begin_dof_values()[q];
1599 * phi.distribute_local_to_global(solution);
1604 * phi_temp.read_dof_values(solution);
1606 *
for (
unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1608 *
const auto K_i = phi.begin_dof_values()[q];
1610 * phi.begin_dof_values()[q] =
1611 * phi_temp.begin_dof_values()[q] + (ai * K_i);
1613 * phi_temp.begin_dof_values()[q] += bi * K_i;
1615 * phi.set_dof_values(dst);
1616 * phi_temp.set_dof_values(solution);
1623 * MatrixFree<dim, Number, VectorizedArrayType>::DataAccessOnFaces::values);
1630 * From here, the code of @ref step_67
"step-67" has not changed.
1633 *
template <
int dim,
int degree,
int n_po
ints_1d>
1634 *
void EulerOperator<dim, degree, n_points_1d>::initialize_vector(
1635 * LinearAlgebra::distributed::Vector<Number> &vector)
const
1637 * data.initialize_dof_vector(vector);
1642 *
template <
int dim,
int degree,
int n_po
ints_1d>
1643 *
void EulerOperator<dim, degree, n_points_1d>::set_inflow_boundary(
1645 * std::unique_ptr<Function<dim>> inflow_function)
1647 *
AssertThrow(subsonic_outflow_boundaries.find(boundary_id) ==
1648 * subsonic_outflow_boundaries.end() &&
1649 * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1650 * ExcMessage(
"You already set the boundary with id " +
1651 * std::to_string(
static_cast<int>(boundary_id)) +
1652 *
" to another type of boundary before now setting " +
1654 *
AssertThrow(inflow_function->n_components == dim + 2,
1655 * ExcMessage(
"Expected function with dim+2 components"));
1657 * inflow_boundaries[
boundary_id] = std::move(inflow_function);
1662 *
template <
int dim,
int degree,
int n_po
ints_1d>
1663 *
void EulerOperator<dim, degree, n_points_1d>::set_subsonic_outflow_boundary(
1665 * std::unique_ptr<Function<dim>> outflow_function)
1667 *
AssertThrow(inflow_boundaries.find(boundary_id) ==
1668 * inflow_boundaries.end() &&
1669 * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1670 * ExcMessage(
"You already set the boundary with id " +
1671 * std::to_string(
static_cast<int>(boundary_id)) +
1672 *
" to another type of boundary before now setting " +
1673 *
"it as subsonic outflow"));
1674 *
AssertThrow(outflow_function->n_components == dim + 2,
1675 * ExcMessage(
"Expected function with dim+2 components"));
1677 * subsonic_outflow_boundaries[
boundary_id] = std::move(outflow_function);
1682 *
template <
int dim,
int degree,
int n_po
ints_1d>
1683 *
void EulerOperator<dim, degree, n_points_1d>::set_wall_boundary(
1686 *
AssertThrow(inflow_boundaries.find(boundary_id) ==
1687 * inflow_boundaries.end() &&
1688 * subsonic_outflow_boundaries.find(boundary_id) ==
1689 * subsonic_outflow_boundaries.end(),
1690 * ExcMessage(
"You already set the boundary with id " +
1691 * std::to_string(
static_cast<int>(boundary_id)) +
1692 *
" to another type of boundary before now setting " +
1693 *
"it as wall boundary"));
1695 * wall_boundaries.insert(boundary_id);
1700 *
template <
int dim,
int degree,
int n_po
ints_1d>
1701 *
void EulerOperator<dim, degree, n_points_1d>::set_body_force(
1702 * std::unique_ptr<Function<dim>> body_force)
1706 * this->body_force = std::move(body_force);
1711 *
template <
int dim,
int degree,
int n_po
ints_1d>
1712 *
void EulerOperator<dim, degree, n_points_1d>::project(
1713 *
const Function<dim> &function,
1714 * LinearAlgebra::distributed::Vector<Number> &solution)
const
1716 * FEEvaluation<dim, degree, degree + 1, dim + 2, Number, VectorizedArrayType>
1718 * MatrixFreeOperators::CellwiseInverseMassMatrix<dim,
1722 * VectorizedArrayType>
1724 * solution.zero_out_ghost_values();
1725 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1728 *
for (
const unsigned int q : phi.quadrature_point_indices())
1729 * phi.submit_dof_value(evaluate_function(function,
1730 * phi.quadrature_point(q)),
1732 * inverse.transform_from_q_points_to_basis(dim + 2,
1733 * phi.begin_dof_values(),
1734 * phi.begin_dof_values());
1735 * phi.set_dof_values(solution);
1741 *
template <
int dim,
int degree,
int n_po
ints_1d>
1742 * std::array<double, 3> EulerOperator<dim, degree, n_points_1d>::compute_errors(
1743 *
const Function<dim> &function,
1744 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const
1746 * TimerOutput::Scope t(timer,
"compute errors");
1747 *
double errors_squared[3] = {};
1748 * FEEvaluation<dim, degree, n_points_1d, dim + 2, Number, VectorizedArrayType>
1751 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1755 * VectorizedArrayType local_errors_squared[3] = {};
1756 *
for (
const unsigned int q : phi.quadrature_point_indices())
1758 *
const auto error =
1759 * evaluate_function(function, phi.quadrature_point(q)) -
1761 *
const auto JxW = phi.JxW(q);
1763 * local_errors_squared[0] += error[0] * error[0] * JxW;
1764 *
for (
unsigned int d = 0;
d < dim; ++
d)
1765 * local_errors_squared[1] += (error[d + 1] * error[d + 1]) * JxW;
1766 * local_errors_squared[2] += (error[dim + 1] * error[dim + 1]) * JxW;
1768 *
for (
unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1770 *
for (
unsigned int d = 0;
d < 3; ++
d)
1771 * errors_squared[d] += local_errors_squared[d][v];
1776 * std::array<double, 3> errors;
1777 *
for (
unsigned int d = 0;
d < 3; ++
d)
1778 * errors[d] =
std::sqrt(errors_squared[d]);
1785 *
template <
int dim,
int degree,
int n_po
ints_1d>
1786 *
double EulerOperator<dim, degree, n_points_1d>::compute_cell_transport_speed(
1787 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const
1789 * TimerOutput::Scope t(timer,
"compute transport speed");
1790 * Number max_transport = 0;
1791 * FEEvaluation<dim, degree, degree + 1, dim + 2, Number, VectorizedArrayType>
1794 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1798 * VectorizedArrayType local_max = 0.;
1799 *
for (
const unsigned int q : phi.quadrature_point_indices())
1801 *
const auto solution = phi.get_value(q);
1802 *
const auto velocity = euler_velocity<dim>(solution);
1803 *
const auto pressure = euler_pressure<dim>(solution);
1805 *
const auto inverse_jacobian = phi.inverse_jacobian(q);
1806 *
const auto convective_speed = inverse_jacobian * velocity;
1807 * VectorizedArrayType convective_limit = 0.;
1808 *
for (
unsigned int d = 0;
d < dim; ++
d)
1809 * convective_limit =
1812 *
const auto speed_of_sound =
1813 *
std::sqrt(gamma * pressure * (1. / solution[0]));
1815 * Tensor<1, dim, VectorizedArrayType> eigenvector;
1816 *
for (
unsigned int d = 0;
d < dim; ++
d)
1817 * eigenvector[d] = 1.;
1818 *
for (
unsigned int i = 0; i < 5; ++i)
1820 * eigenvector =
transpose(inverse_jacobian) *
1821 * (inverse_jacobian * eigenvector);
1822 * VectorizedArrayType eigenvector_norm = 0.;
1823 *
for (
unsigned int d = 0;
d < dim; ++
d)
1824 * eigenvector_norm =
1826 * eigenvector /= eigenvector_norm;
1828 *
const auto jac_times_ev = inverse_jacobian * eigenvector;
1829 *
const auto max_eigenvalue =
std::sqrt(
1830 * (jac_times_ev * jac_times_ev) / (eigenvector * eigenvector));
1833 * max_eigenvalue * speed_of_sound + convective_limit);
1836 *
for (
unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1838 * max_transport =
std::max(max_transport, local_max[v]);
1843 *
return max_transport;
1848 *
template <
int dim>
1849 *
class EulerProblem
1857 *
void make_grid_and_dofs();
1859 *
void output_results(
const unsigned int result_number);
1861 * LinearAlgebra::distributed::Vector<Number> solution;
1863 * ConditionalOStream pcout;
1865 * #ifdef DEAL_II_WITH_P4EST
1866 * parallel::distributed::Triangulation<dim> triangulation;
1868 * Triangulation<dim> triangulation;
1871 *
const FESystem<dim> fe;
1872 *
const MappingQ<dim>
mapping;
1873 * DoFHandler<dim> dof_handler;
1875 * TimerOutput timer;
1877 * EulerOperator<dim, fe_degree, n_q_points_1d> euler_operator;
1879 *
double time, time_step;
1881 *
class Postprocessor :
public DataPostprocessor<dim>
1886 *
virtual void evaluate_vector_field(
1887 *
const DataPostprocessorInputs::Vector<dim> &inputs,
1888 * std::vector<Vector<double>> &computed_quantities)
const override;
1890 *
virtual std::vector<std::string> get_names()
const override;
1892 *
virtual std::vector<
1894 * get_data_component_interpretation()
const override;
1896 *
virtual UpdateFlags get_needed_update_flags()
const override;
1899 *
const bool do_schlieren_plot;
1905 *
template <
int dim>
1906 * EulerProblem<dim>::Postprocessor::Postprocessor()
1907 * : do_schlieren_plot(dim == 2)
1912 *
template <
int dim>
1913 *
void EulerProblem<dim>::Postprocessor::evaluate_vector_field(
1914 *
const DataPostprocessorInputs::Vector<dim> &inputs,
1915 * std::vector<Vector<double>> &computed_quantities)
const
1917 *
const unsigned int n_evaluation_points = inputs.solution_values.size();
1919 *
if (do_schlieren_plot ==
true)
1920 *
Assert(inputs.solution_gradients.size() == n_evaluation_points,
1921 * ExcInternalError());
1923 *
Assert(computed_quantities.size() == n_evaluation_points,
1924 * ExcInternalError());
1925 *
Assert(inputs.solution_values[0].size() == dim + 2, ExcInternalError());
1926 *
Assert(computed_quantities[0].size() ==
1927 * dim + 2 + (do_schlieren_plot ==
true ? 1 : 0),
1928 * ExcInternalError());
1930 *
for (
unsigned int p = 0; p < n_evaluation_points; ++p)
1932 * Tensor<1, dim + 2> solution;
1933 *
for (
unsigned int d = 0;
d < dim + 2; ++
d)
1934 * solution[d] = inputs.solution_values[p](d);
1936 *
const double density = solution[0];
1937 *
const Tensor<1, dim> velocity = euler_velocity<dim>(solution);
1938 *
const double pressure = euler_pressure<dim>(solution);
1940 *
for (
unsigned int d = 0;
d < dim; ++
d)
1941 * computed_quantities[p](d) = velocity[
d];
1942 * computed_quantities[p](dim) = pressure;
1943 * computed_quantities[p](dim + 1) =
std::sqrt(gamma * pressure / density);
1945 *
if (do_schlieren_plot ==
true)
1946 * computed_quantities[p](dim + 2) =
1947 * inputs.solution_gradients[p][0] * inputs.solution_gradients[p][0];
1953 *
template <
int dim>
1954 * std::vector<std::string> EulerProblem<dim>::Postprocessor::get_names() const
1956 * std::vector<std::string> names;
1957 *
for (
unsigned int d = 0;
d < dim; ++
d)
1958 * names.emplace_back(
"velocity");
1959 * names.emplace_back(
"pressure");
1960 * names.emplace_back(
"speed_of_sound");
1962 *
if (do_schlieren_plot ==
true)
1963 * names.emplace_back(
"schlieren_plot");
1970 *
template <
int dim>
1971 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1972 * EulerProblem<dim>::Postprocessor::get_data_component_interpretation() const
1974 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1976 *
for (
unsigned int d = 0;
d < dim; ++
d)
1977 * interpretation.push_back(
1982 *
if (do_schlieren_plot ==
true)
1983 * interpretation.push_back(
1986 *
return interpretation;
1991 *
template <
int dim>
1992 *
UpdateFlags EulerProblem<dim>::Postprocessor::get_needed_update_flags() const
1994 *
if (do_schlieren_plot ==
true)
2002 *
template <
int dim>
2003 * EulerProblem<dim>::EulerProblem()
2005 * #ifdef DEAL_II_WITH_P4EST
2006 * , triangulation(MPI_COMM_WORLD)
2008 * , fe(FE_DGQ<dim>(fe_degree), dim + 2)
2010 * , dof_handler(triangulation)
2012 * , euler_operator(timer)
2019 *
template <
int dim>
2020 *
void EulerProblem<dim>::make_grid_and_dofs()
2026 * Point<dim> lower_left;
2027 *
for (
unsigned int d = 1;
d < dim; ++
d)
2028 * lower_left[d] = -5;
2030 * Point<dim> upper_right;
2031 * upper_right[0] = 10;
2032 *
for (
unsigned int d = 1;
d < dim; ++
d)
2033 * upper_right[d] = 5;
2038 * triangulation.refine_global(2);
2040 * euler_operator.set_inflow_boundary(
2041 * 0, std::make_unique<ExactSolution<dim>>(0));
2049 * triangulation, 0.03, 1, 0,
true);
2051 * euler_operator.set_inflow_boundary(
2052 * 0, std::make_unique<ExactSolution<dim>>(0));
2053 * euler_operator.set_subsonic_outflow_boundary(
2054 * 1, std::make_unique<ExactSolution<dim>>(0));
2056 * euler_operator.set_wall_boundary(2);
2057 * euler_operator.set_wall_boundary(3);
2060 * euler_operator.set_body_force(
2061 * std::make_unique<Functions::ConstantFunction<dim>>(
2062 * std::vector<double>({0., 0., -0.2})));
2071 * triangulation.refine_global(n_global_refinements);
2073 * dof_handler.distribute_dofs(fe);
2075 * euler_operator.reinit(
mapping, dof_handler);
2076 * euler_operator.initialize_vector(solution);
2078 * std::locale s = pcout.get_stream().getloc();
2079 * pcout.get_stream().imbue(std::locale(
""));
2080 * pcout <<
"Number of degrees of freedom: " << dof_handler.n_dofs()
2081 * <<
" ( = " << (dim + 2) <<
" [vars] x "
2082 * << triangulation.n_global_active_cells() <<
" [cells] x "
2085 * pcout.get_stream().imbue(s);
2090 *
template <
int dim>
2091 *
void EulerProblem<dim>::output_results(
const unsigned int result_number)
2093 *
const std::array<double, 3> errors =
2094 * euler_operator.compute_errors(ExactSolution<dim>(time), solution);
2095 *
const std::string quantity_name = testcase == 0 ?
"error" :
"norm";
2097 * pcout <<
"Time:" << std::setw(8) << std::setprecision(3) << time
2098 * <<
", dt: " << std::setw(8) << std::setprecision(2) << time_step
2099 * <<
", " << quantity_name <<
" rho: " << std::setprecision(4)
2100 * << std::setw(10) << errors[0] <<
", rho * u: " << std::setprecision(4)
2101 * << std::setw(10) << errors[1] <<
", energy:" << std::setprecision(4)
2102 * << std::setw(10) << errors[2] << std::endl;
2105 * TimerOutput::Scope t(timer,
"output");
2107 * Postprocessor postprocessor;
2108 * DataOut<dim> data_out;
2110 * DataOutBase::VtkFlags flags;
2111 * flags.write_higher_order_cells =
true;
2112 * data_out.set_flags(flags);
2114 * data_out.attach_dof_handler(dof_handler);
2116 * std::vector<std::string> names;
2117 * names.emplace_back(
"density");
2118 *
for (
unsigned int d = 0;
d < dim; ++
d)
2119 * names.emplace_back(
"momentum");
2120 * names.emplace_back(
"energy");
2122 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2124 * interpretation.push_back(
2126 *
for (
unsigned int d = 0;
d < dim; ++
d)
2127 * interpretation.push_back(
2129 * interpretation.push_back(
2132 * data_out.add_data_vector(dof_handler, solution, names, interpretation);
2134 * data_out.add_data_vector(solution, postprocessor);
2136 * LinearAlgebra::distributed::Vector<Number> reference;
2137 *
if (testcase == 0 && dim == 2)
2139 * reference.reinit(solution);
2140 * euler_operator.project(ExactSolution<dim>(time), reference);
2141 * reference.sadd(-1., 1, solution);
2142 * std::vector<std::string> names;
2143 * names.emplace_back(
"error_density");
2144 *
for (
unsigned int d = 0;
d < dim; ++
d)
2145 * names.emplace_back(
"error_momentum");
2146 * names.emplace_back(
"error_energy");
2148 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2150 * interpretation.push_back(
2152 *
for (
unsigned int d = 0;
d < dim; ++
d)
2153 * interpretation.push_back(
2155 * interpretation.push_back(
2158 * data_out.add_data_vector(dof_handler,
2164 * Vector<double> mpi_owner(triangulation.n_active_cells());
2166 * data_out.add_data_vector(mpi_owner,
"owner");
2168 * data_out.build_patches(
mapping,
2172 *
const std::string filename =
2174 * data_out.write_vtu_in_parallel(filename, MPI_COMM_WORLD);
2180 *
template <
int dim>
2181 *
void EulerProblem<dim>::run()
2184 *
const unsigned int n_vect_number = VectorizedArrayType::size();
2185 *
const unsigned int n_vect_bits = 8 *
sizeof(Number) * n_vect_number;
2187 * pcout <<
"Running with "
2189 * <<
" MPI processes" << std::endl;
2190 * pcout <<
"Vectorization over " << n_vect_number <<
' '
2191 * << (std::is_same_v<Number, double> ?
"doubles" :
"floats") <<
" = "
2192 * << n_vect_bits <<
" bits ("
2197 * make_grid_and_dofs();
2199 *
const LowStorageRungeKuttaIntegrator integrator(lsrk_scheme);
2201 * LinearAlgebra::distributed::Vector<Number> rk_register_1;
2202 * LinearAlgebra::distributed::Vector<Number> rk_register_2;
2203 * rk_register_1.reinit(solution);
2204 * rk_register_2.reinit(solution);
2206 * euler_operator.project(ExactSolution<dim>(time), solution);
2209 *
double min_vertex_distance = std::numeric_limits<double>::max();
2210 *
for (
const auto &cell : triangulation.active_cell_iterators())
2211 *
if (cell->is_locally_owned())
2212 * min_vertex_distance =
2213 *
std::min(min_vertex_distance, cell->minimum_vertex_distance());
2214 * min_vertex_distance =
2217 * time_step = courant_number * integrator.n_stages() /
2218 * euler_operator.compute_cell_transport_speed(solution);
2219 * pcout <<
"Time step size: " << time_step
2220 * <<
", minimal h: " << min_vertex_distance
2221 * <<
", initial transport scaling: "
2222 * << 1. / euler_operator.compute_cell_transport_speed(solution)
2226 * output_results(0);
2228 *
unsigned int timestep_number = 0;
2230 *
while (time < final_time - 1e-12 && timestep_number < max_time_steps)
2232 * ++timestep_number;
2233 *
if (timestep_number % 5 == 0)
2235 * courant_number * integrator.n_stages() /
2237 * euler_operator.compute_cell_transport_speed(solution), 3);
2240 * TimerOutput::Scope t(timer,
"rk time stepping total");
2241 * integrator.perform_time_step(euler_operator,
2249 * time += time_step;
2251 *
if (
static_cast<int>(time / output_tick) !=
2252 *
static_cast<int>((time - time_step) / output_tick) ||
2253 * time >= final_time - 1e-12)
2255 *
static_cast<unsigned int>(std::round(time / output_tick)));
2258 * timer.print_wall_time_statistics(MPI_COMM_WORLD);
2259 * pcout << std::endl;
2265 *
int main(
int argc,
char **argv)
2267 *
using namespace Euler_DG;
2268 *
using namespace dealii;
2270 * Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
2274 * EulerProblem<dimension> euler_problem;
2275 * euler_problem.run();
2277 *
catch (std::exception &exc)
2279 * std::cerr << std::endl
2281 * <<
"----------------------------------------------------"
2283 * std::cerr <<
"Exception on processing: " << std::endl
2284 * << exc.what() << std::endl
2285 * <<
"Aborting!" << std::endl
2286 * <<
"----------------------------------------------------"
2293 * std::cerr << std::endl
2295 * <<
"----------------------------------------------------"
2297 * std::cerr <<
"Unknown exception!" << std::endl
2298 * <<
"Aborting!" << std::endl
2299 * <<
"----------------------------------------------------"
2307<a name=
"step_76-Results"></a><h1>Results</h1>
2310Running the program with the
default settings on a machine with 24 processes
2311in
release mode and AVX2 vectorization produces the following output:
2314Running with 24
MPI processes
2315Vectorization over 4 doubles = 256 bits (AVX)
2316Number of degrees of freedom: 230,400 ( = 4 [vars] x 1,600 [cells] x 36 [dofs/cell/var] )
2317Time step size: 0.000295952, minimal h: 0.0075,
initial transport scaling: 0.00441179
2319Time: 0, dt: 0.0003,
norm rho: 4.17e-16, rho * u: 1.629e-16, energy: 1.381e-15
2320Time: 0.0501, dt: 0.00025,
norm rho: 0.02076, rho * u: 0.038, energy: 0.08774
2324+-------------------------------------+------------------+------------+------------------+
2325| Total wallclock time elapsed | 8.231s 8 | 8.235s | 8.236s 5 |
2327| Section | no. calls |
min time rank |
avg time |
max time rank |
2328+-------------------------------------+------------------+------------+------------------+
2329| compute errors | 41 | 0.002513s 17 | 0.002564s | 0.002631s 13 |
2330| compute transport speed | 1731 | 0.1581s 17 | 0.1608s | 0.1636s 11 |
2331| output | 41 | 0.7818s 0 | 0.7832s | 0.7847s 17 |
2332| rk time stepping total | 8648 | 7.23s 23 | 7.233s | 7.237s 13 |
2333+-------------------------------------+------------------+------------+------------------+
2336and the following visual output, which was taken from @ref step_67
"step-67" (for a slightly different test case):
2338<table align=
"center" class=
"doxtable" style=
"width:85%">
2341 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_010.png" alt=
"" width=
"100%">
2344 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_025.png" alt=
"" width=
"100%">
2349 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_050.png" alt=
"" width=
"100%">
2352 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_100.png" alt=
"" width=
"100%">
2357As a reference, the results of @ref step_67
"step-67" for test case 1 and the same resolution
2358(note this is not the default test case nor the default resolution in @ref step_67
"step-67")
2362Running with 24
MPI processes
2363Vectorization over 4 doubles = 256 bits (AVX)
2364Number of degrees of freedom: 230,400 ( = 4 [vars] x 1,600 [cells] x 36 [dofs/cell/var] )
2365Time step size: 0.000295952, minimal h: 0.0075,
initial transport scaling: 0.00441179
2367Time: 0, dt: 0.0003,
norm rho: 4.17e-16, rho * u: 1.629e-16, energy: 1.381e-15
2368Time: 0.0501, dt: 0.00025,
norm rho: 0.02076, rho * u: 0.03801, energy: 0.08774
2372+-------------------------------------------+------------------+------------+------------------+
2373| Total wallclock time elapsed | 8.166s 4 | 8.168s | 8.169s 14 |
2375| Section | no. calls |
min time rank |
avg time |
max time rank |
2376+-------------------------------------------+------------------+------------+------------------+
2377| compute errors | 41 | 0.002265s 18 | 0.004323s | 0.006903s 0 |
2378| compute transport speed | 1731 | 0.1471s 18 | 0.2393s | 0.3772s 0 |
2379| output | 41 | 0.791s 0 | 0.7925s | 0.7934s 1 |
2380| rk time stepping total | 8648 | 6.955s 0 | 7.096s | 7.189s 18 |
2381| rk_stage - integrals L_h | 43240 | 5.537s 5 | 5.833s | 6.171s 13 |
2382| rk_stage - inv mass + vec upd | 43240 | 0.7758s 4 | 1.15s | 1.373s 5 |
2383+-------------------------------------------+------------------+------------+------------------+
2386While the performance in this small test case is almost identical, the
2387difference depends on the hardware and the
dimension and size of the setup.
2388In larger computations we have seen that the modifications shown in this
2389tutorial were able to achieve a speedup of 27% for the Runge-Kutta stages.
2391<a name=
"step_76-Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
2394The algorithms are easily extendable to higher dimensions: a high-dimensional
2395<a href=
"https://github.com/hyperdeal/hyperdeal/blob/a9e67b4e625ff1dde2fed93ad91cdfacfaa3acdf/include/hyper.deal/operators/advection/advection_operation.h#L219-L569">advection operator based on cell-centric loops</a>
2396is part of the hyper.deal library. An extension of cell-centric loops
2397to locally-refined meshes is more involved.
2399<a name=
"step_76-ExtensiontothecompressibleNavierStokesequations"></a><h4>Extension to the compressible Navier-Stokes equations</h4>
2402The solver presented in this tutorial program can also be extended to the
2403compressible Navier–Stokes equations by adding viscous terms, as also
2404suggested in @ref step_67
"step-67". To keep as much of the performance obtained here despite
2405the additional cost of elliptic terms,
e.g. via an interior penalty method, that
2406tutorial has proposed to switch the basis from FE_DGQ to FE_DGQHermite like in
2407the @ref step_59
"step-59" tutorial program. The reasoning behind this switch is that in the
2408case of FE_DGQ all
values of neighboring cells (i.
e., @f$k+1@f$ layers) are needed,
2409whilst in the case of FE_DGQHermite only 2 layers, making the latter
2410significantly more suitable for higher degrees. The additional layers have to be,
2411on the
one hand, loaded from main memory during flux computation and,
one the
2412other hand, have to be communicated. Using the shared-memory capabilities
2413introduced in this tutorial, the second
point can be eliminated on a single
2414compute node or its influence can be reduced in a
hybrid context.
2416<a name=
"step_76-BlockGaussSeidellikepreconditioners"></a><h4>Block Gauss-Seidel-like preconditioners</h4>
2419Cell-centric loops could be used to create block Gauss-Seidel preconditioners
2420that are multiplicative within
one process and additive over processes. These
2421type of preconditioners use during flux computation, in contrast to Jacobi-type
2422preconditioners, already updated
values from neighboring cells. The following
2423pseudo-code visualizes how this could in principal be achieved:
2427Vector<Number> visit_flags(data.n_cell_batches () + data.n_ghost_cell_batches ());
2430data.template loop_cell_centric<VectorType, VectorType>(
2431 [&](
const auto &data,
auto &dst,
const auto &src,
const auto cell_range) {
2433 for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
2437 for (
unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
2439 const auto boundary_id = data.get_faces_by_cells_boundary_id(cell, face)[0];
2443 phi_p.reinit(cell, face);
2445 const auto flags = phi_p.read_cell_data(visit_flags);
2446 const auto all_neighbors_have_been_updated =
2448 flags().
begin() + data.n_active_entries_per_cell_batch(cell) == 1;
2450 if(all_neighbors_have_been_updated)
2467 phi.set_cell_data(visit_flags, VectorizedArrayType(1.0));
2473 MatrixFree<dim, Number, VectorizedArrayType>::DataAccessOnFaces::values);
2476For
this purpose,
one can exploit the cell-data vector capabilities of
2477MatrixFree and the range-based iteration capabilities of VectorizedArray.
2479Please note that in the given example we process <code>VectorizedArrayType::size()</code>
2480number of blocks, since each lane corresponds to
one block. We consider blocks
2481as updated
if all blocks processed by a vector
register have been updated. In
2482the
case of Cartesian meshes
this is a reasonable approach, however,
for
2483general unstructured meshes
this conservative approach might lead to a decrease in the
2484efficiency of the preconditioner.
A reduction of cells processed in parallel
2485by explicitly reducing the number of lanes used by <code>VectorizedArrayType</code>
2486might increase the quality of the preconditioner, but with the cost that each
2487iteration might be more expensive. This dilemma leads us to a further
2488"possibility for extension": vectorization within an element.
2491<a name=
"step_76-PlainProg"></a>
2492<h1> The plain program</h1>
2493@include
"step-76.cc"
void submit_value(const value_type val_in, const unsigned int q_point)
void loop_cell_centric(void(CLASS::*cell_operation)(const MatrixFree &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &) const, const CLASS *owning_class, OutVector &dst, const InVector &src, const bool zero_dst_vector=false, const DataAccessOnFaces src_vector_face_access=DataAccessOnFaces::unspecified) const
void reinit(const MappingType &mapping, const DoFHandler< dim > &dof_handler, const AffineConstraints< number2 > &constraint, const QuadratureType &quad, const AdditionalData &additional_data=AdditionalData())
std::enable_if_t< std::is_floating_point_v< T > &&std::is_floating_point_v< U > &&!std::is_same_v< T, U >, typename ProductType< std::complex< T >, std::complex< U > >::type > operator*(const std::complex< T > &left, const std::complex< U > &right)
#define DEAL_II_ALWAYS_INLINE
const unsigned int DoFAccessor< structdim, dim, spacedim, level_dof_access >::dimension
#define DEAL_II_NOT_IMPLEMENTED()
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
#define AssertThrow(cond, exc)
void loop(IteratorType begin, std_cxx20::type_identity_t< IteratorType > end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, AssemblerType &assembler, const LoopControl &lctrl=LoopControl())
const bool IsBlockVector< VectorType >::value
@ update_values
Shape function values.
@ update_normal_vectors
Normal vectors.
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
MappingQ< dim, spacedim > StaticMappingQ1< dim, spacedim >::mapping
DataComponentInterpretation
@ component_is_part_of_vector
void hyper_rectangle(Triangulation< dim, spacedim > &tria, const Point< dim > &p1, const Point< dim > &p2, const bool colorize=false)
void channel_with_cylinder(Triangulation< dim > &tria, const double shell_region_width=0.03, const unsigned int n_shells=2, const double skewness=2.0, const bool colorize=false)
@ matrix
Contents is actually a matrix.
@ general
No special properties.
constexpr types::blas_int one
double norm(const FEValuesBase< dim > &fe, const ArrayView< const std::vector< Tensor< 1, dim > > > &Du)
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
SymmetricTensor< 2, dim, Number > C(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
@ LOW_STORAGE_RK_STAGE9_ORDER5
@ LOW_STORAGE_RK_STAGE3_ORDER3
@ LOW_STORAGE_RK_STAGE7_ORDER4
@ LOW_STORAGE_RK_STAGE5_ORDER4
VectorType::value_type * begin(VectorType &V)
T sum(const T &t, const MPI_Comm mpi_communicator)
unsigned int n_mpi_processes(const MPI_Comm mpi_communicator)
T max(const T &t, const MPI_Comm mpi_communicator)
T min(const T &t, const MPI_Comm mpi_communicator)
unsigned int this_mpi_process(const MPI_Comm mpi_communicator)
T reduce(const T &local_value, const MPI_Comm comm, const std::function< T(const T &, const T &)> &combiner, const unsigned int root_process=0)
std::string get_current_vectorization_level()
Number truncate_to_n_digits(const Number number, const unsigned int n_digits)
std::string int_to_string(const unsigned int value, const unsigned int digits=numbers::invalid_unsigned_int)
constexpr T pow(const T base, const int iexp)
void run(const Iterator &begin, const std_cxx20::type_identity_t< Iterator > &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
long double gamma(const unsigned int n)
void copy(const T *begin, const T *end, U *dest)
int(& functions)(const void *v1, const void *v2)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
constexpr unsigned int invalid_unsigned_int
constexpr types::boundary_id internal_face_boundary_id
void transform(const InputIterator &begin_in, const InputIterator &end_in, OutputIterator out, const Function &function, const unsigned int grainsize)
::VectorizedArray< Number, width > exp(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > min(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > max(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > sqrt(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > pow(const ::VectorizedArray< Number, width > &, const Number p)
::VectorizedArray< Number, width > abs(const ::VectorizedArray< Number, width > &)
bool hold_all_faces_to_owned_cells