Reference documentation for deal.II version 9.1.0-pre
process_grid.cc
// ---------------------------------------------------------------------
//
// Copyright (C) 2017 - 2018 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------

#include <deal.II/base/process_grid.h>

#ifdef DEAL_II_WITH_SCALAPACK

#  include <deal.II/lac/scalapack.templates.h>

DEAL_II_NAMESPACE_OPEN

namespace
{
  inline std::pair<int, int>
  compute_processor_grid_sizes(MPI_Comm           mpi_comm,
                               const unsigned int m,
                               const unsigned int n,
                               const unsigned int block_size_m,
                               const unsigned int block_size_n)
  {
    // A few notes from the ScaLAPACK user guide:
    // It is possible to predict the best grid shape given the number of
    // processes available:
    //   Pr x Pc <= P
    // This, however, depends on the task to be done. LU, QR and QL
    // factorizations perform better for "flat" process grids (Pr < Pc).
    // For large N, Pc = 2*Pr is a good choice, whereas for small N one
    // should choose a small Pr. Square or near-square grids work best for
    // Cholesky factorization. LQ and RQ factorizations take advantage of
    // "tall" grids (Pr > Pc).

    // Below we always try to create 2D processor grids:

    const int n_processes = Utilities::MPI::n_mpi_processes(mpi_comm);

    // Get the total number of cores we can occupy in a rectangular dense
    // matrix with rectangular blocks when every core owns only a single
    // block:
    const int n_processes_heuristic = int(std::ceil((1. * m) / block_size_m)) *
                                      int(std::ceil((1. * n) / block_size_n));
    const int Np = std::min(n_processes_heuristic, n_processes);

    // Now we need to split Np into Pr x Pc. Assume we know the shape/ratio
    //   Pc =: ratio * Pr
    // and therefore
    //   Np = Pc * Pc / ratio
    // For square matrices the ratio equals 1.
    const double ratio = double(n) / m;
    int          Pc    = std::floor(std::sqrt(ratio * Np));

    // One could round Pc up to the nearest number that divides Np without
    // remainder,
    //   while (Np % Pc != 0)
    //     ++Pc;
    // but this affects the grid shape dramatically, i.e. with 10 cores a
    // 3x3 grid would become 2x5.

    // Limit our estimate to be in [2, Np]:
    int n_process_columns = std::min(Np, std::max(2, Pc));
    // Finally, get the rows:
    int n_process_rows = Np / n_process_columns;

    Assert(n_process_columns >= 1 && n_process_rows >= 1 &&
             n_processes >= n_process_rows * n_process_columns,
           ExcMessage(
             "error in process grid: " + std::to_string(n_process_rows) + "x" +
             std::to_string(n_process_columns) + "=" +
             std::to_string(n_process_rows * n_process_columns) + " out of " +
             std::to_string(n_processes)));

    return std::make_pair(n_process_rows, n_process_columns);

    // For example, for a 320x320 matrix with 32x32 blocks and 16 cores:
    //   Pc = 1.0 * Pr => 4x4 grid
    //   Pc = 0.5 * Pr => 8x2 grid
    //   Pc = 2.0 * Pr => 3x5 grid
    // (a step-by-step numeric trace of the heuristic follows after this
    // function)
  }
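
  // Illustrative trace of the heuristic above; the numbers are example
  // values only. For a 320x320 matrix with 32x32 blocks on 16 MPI processes:
  //   n_processes_heuristic = ceil(320/32) * ceil(320/32) = 10 * 10 = 100
  //   Np                    = min(100, 16)                = 16
  //   ratio                 = 320/320                     = 1
  //   Pc                    = floor(sqrt(1 * 16))         = 4
  //   n_process_columns     = min(16, max(2, 4))          = 4
  //   n_process_rows        = 16 / 4                      = 4   -> 4x4 grid
  // For a rectangular 400x2000 matrix with 32x32 blocks on 20 processes:
  //   Np = min(13 * 63, 20) = 20,  ratio = 2000/400 = 5
  //   Pc = floor(sqrt(5 * 20)) = 10
  //   n_process_columns = 10,  n_process_rows = 2         -> 2x10 grid,
  // a "flat" grid (Pr < Pc), in line with the ScaLAPACK advice above for
  // LU and QR factorizations.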
} // namespace

namespace Utilities
{
  namespace MPI
  {
    ProcessGrid::ProcessGrid(
      MPI_Comm                                     mpi_comm,
      const std::pair<unsigned int, unsigned int> &grid_dimensions)
      : mpi_communicator(mpi_comm)
      , this_mpi_process(Utilities::MPI::this_mpi_process(mpi_communicator))
      , n_mpi_processes(Utilities::MPI::n_mpi_processes(mpi_communicator))
      , n_process_rows(grid_dimensions.first)
      , n_process_columns(grid_dimensions.second)
    {
      Assert(grid_dimensions.first > 0,
             ExcMessage("Number of process grid rows has to be positive."));
      Assert(grid_dimensions.second > 0,
             ExcMessage("Number of process grid columns has to be positive."));

      Assert(
        grid_dimensions.first * grid_dimensions.second <= n_mpi_processes,
        ExcMessage(
          "Size of process grid is larger than number of available MPI processes."));

      // processor grid order
      const bool column_major = false;

      // Initialize the BLACS context from the provided communicator
      blacs_context = Csys2blacs_handle(mpi_communicator);
      const char *order = (column_major ? "Col" : "Row");
      // Note that blacs_context can be modified below. Thus Cblacs2sys_handle
      // may not return the same MPI communicator.
      Cblacs_gridinit(&blacs_context, order, n_process_rows, n_process_columns);

      // BLACS may modify the grid size on processes which are not used
      // in the grid. So provide copies below:
      int procrows_ = n_process_rows;
      int proccols_ = n_process_columns;
      Cblacs_gridinfo(blacs_context,
                      &procrows_,
                      &proccols_,
                      &this_process_row,
                      &this_process_column);

      // If this MPI core is not on the grid, flag it as inactive and
      // skip all jobs.
      // Note that a different condition is used in the Fortran code here:
      // https://stackoverflow.com/questions/18516915/calling-blacs-with-more-processes-than-used
      if (this_process_row < 0 || this_process_column < 0)
        mpi_process_is_active = false;
      else
        mpi_process_is_active = true;

      // Create an auxiliary communicator which has root and all inactive
      // cores. Assume that inactive cores start with
      // id=n_process_rows*n_process_columns.
      const unsigned int n_active_mpi_processes =
        n_process_rows * n_process_columns;
      Assert(mpi_process_is_active ||
               this_mpi_process >= n_active_mpi_processes,
             ExcInternalError());

      std::vector<int> inactive_with_root_ranks;
      inactive_with_root_ranks.push_back(0);
      for (unsigned int i = n_active_mpi_processes; i < n_mpi_processes; ++i)
        inactive_with_root_ranks.push_back(i);

      // Get the group of processes in mpi_communicator
      int       ierr = 0;
      MPI_Group all_group;
      ierr = MPI_Comm_group(mpi_communicator, &all_group);
      AssertThrowMPI(ierr);

      // Construct the group containing all ranks we need:
      MPI_Group inactive_with_root_group;
      const int n = inactive_with_root_ranks.size();
      ierr = MPI_Group_incl(all_group,
                            n,
                            inactive_with_root_ranks.data(),
                            &inactive_with_root_group);
      AssertThrowMPI(ierr);

      // Create the communicator based on inactive_with_root_group.
      // Note that on all the active MPI processes (except for the one with
      // rank 0) the resulting MPI_Comm mpi_communicator_inactive_with_root
      // will be MPI_COMM_NULL.
      ierr = Utilities::MPI::create_group(mpi_communicator,
                                          inactive_with_root_group,
                                          55,
                                          &mpi_communicator_inactive_with_root);
      AssertThrowMPI(ierr);

      ierr = MPI_Group_free(&all_group);
      AssertThrowMPI(ierr);
      ierr = MPI_Group_free(&inactive_with_root_group);
      AssertThrowMPI(ierr);

      // Double check that the process with rank 0 in the subgroup is active:
#  ifdef DEBUG
      if (mpi_communicator_inactive_with_root != MPI_COMM_NULL &&
          Utilities::MPI::this_mpi_process(
            mpi_communicator_inactive_with_root) == 0)
        Assert(mpi_process_is_active, ExcInternalError());
#  endif
    }
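
    // For reference, a minimal sketch (not part of this class) of the same
    // "root plus inactive ranks" communicator built directly with MPI-3
    // calls; `comm` and `ranks` are placeholder names for the full
    // communicator and for the list holding rank 0 plus all inactive ranks:
    //
    //   MPI_Group world_group, sub_group;
    //   MPI_Comm  sub_comm;
    //   MPI_Comm_group(comm, &world_group);
    //   MPI_Group_incl(world_group,
    //                  static_cast<int>(ranks.size()),
    //                  ranks.data(),
    //                  &sub_group);
    //   MPI_Comm_create_group(comm, sub_group, /*tag=*/55, &sub_comm);
    //   MPI_Group_free(&world_group);
    //   MPI_Group_free(&sub_group);
    //
    // On a rank that is not a member of sub_group the resulting sub_comm is
    // MPI_COMM_NULL, which is exactly the property the constructor above
    // relies on; Utilities::MPI::create_group() provides this behavior in a
    // portable way.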



    ProcessGrid::ProcessGrid(MPI_Comm           mpi_comm,
                             const unsigned int n_rows_matrix,
                             const unsigned int n_columns_matrix,
                             const unsigned int row_block_size,
                             const unsigned int column_block_size)
      : ProcessGrid(mpi_comm,
                    compute_processor_grid_sizes(mpi_comm,
                                                 n_rows_matrix,
                                                 n_columns_matrix,
                                                 row_block_size,
                                                 column_block_size))
    {}
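
    // A minimal usage sketch for the constructor above (assuming the
    // ScaLAPACKMatrix class from <deal.II/lac/scalapack.h>; matrix and block
    // sizes are example values only): the grid is created once, shared via a
    // shared_ptr, and handed to every distributed matrix that should live on
    // it.
    //
    //   const unsigned int n = 1000, block = 32;
    //   const auto grid = std::make_shared<Utilities::MPI::ProcessGrid>(
    //     MPI_COMM_WORLD, n, n, block, block);
    //   ScaLAPACKMatrix<double> A(n, n, grid, block, block);
    //
    // Constructing the grid from the matrix and block sizes lets
    // compute_processor_grid_sizes() pick a grid shape suited to the matrix,
    // rather than prescribing the number of rows and columns by hand.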



    ProcessGrid::ProcessGrid(MPI_Comm           mpi_comm,
                             const unsigned int n_rows,
                             const unsigned int n_columns)
      : ProcessGrid(mpi_comm, std::make_pair(n_rows, n_columns))
    {}



    ProcessGrid::~ProcessGrid()
    {
      if (mpi_process_is_active)
        Cblacs_gridexit(blacs_context);

      if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
        MPI_Comm_free(&mpi_communicator_inactive_with_root);
    }



    template <typename NumberType>
    void
    ProcessGrid::send_to_inactive(NumberType *value, const int count) const
    {
      Assert(count > 0, ExcInternalError());
      if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
        {
          const int ierr =
            MPI_Bcast(value,
                      count,
                      Utilities::MPI::internal::mpi_type_id(value),
                      0 /*from root*/,
                      mpi_communicator_inactive_with_root);
          AssertThrowMPI(ierr);
        }
    }
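
    // A minimal sketch of how send_to_inactive() is typically used (variable
    // names are hypothetical): a value computed only on the active grid
    // processes is made available to the ranks that are not part of the
    // BLACS grid.
    //
    //   double result = 0.;
    //   if (grid->is_process_active())
    //     result = compute_on_grid(); // hypothetical helper run on the grid
    //   grid->send_to_inactive(&result);
    //   // now every rank of the original communicator holds `result`
    //
    // Only rank 0 of the grid and the inactive ranks participate in the
    // underlying MPI_Bcast; on the remaining active ranks the communicator
    // is MPI_COMM_NULL and the call does nothing.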

  } // namespace MPI
} // namespace Utilities

// instantiations

template void
Utilities::MPI::ProcessGrid::send_to_inactive<double>(double *,
                                                      const int) const;
template void
Utilities::MPI::ProcessGrid::send_to_inactive<float>(float *, const int) const;
template void
Utilities::MPI::ProcessGrid::send_to_inactive<int>(int *, const int) const;

DEAL_II_NAMESPACE_CLOSE

#endif // DEAL_II_WITH_SCALAPACK