MueLu  Version of the Day
MueLu_RepartitionHeuristicFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 // Tobias Wiesner (tawiesn@sandia.gov)
43 //
44 // ***********************************************************************
45 //
46 // @HEADER
47 #ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
48 #define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
49 
50 #include <algorithm>
51 #include <iostream>
52 #include <sstream>
53 
54 #ifdef HAVE_MPI
55 #include <Teuchos_DefaultMpiComm.hpp>
56 #include <Teuchos_CommHelpers.hpp>
57 
58 //#include <Xpetra_Map.hpp>
59 #include <Xpetra_Matrix.hpp>
60 
61 #include "MueLu_Utilities.hpp"
62 
63 #include "MueLu_RAPFactory.hpp"
64 #include "MueLu_BlockedRAPFactory.hpp"
65 #include "MueLu_SubBlockAFactory.hpp"
66 #include "MueLu_Level.hpp"
67 #include "MueLu_MasterList.hpp"
68 #include "MueLu_Monitor.hpp"
69 
70 #include "MueLu_RepartitionHeuristicFactory.hpp"
71 
72 namespace MueLu {
73 
74  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
76  RCP<ParameterList> validParamList = rcp(new ParameterList());
77 
78 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
79  SET_VALID_ENTRY("repartition: start level");
80  SET_VALID_ENTRY("repartition: use map");
81  SET_VALID_ENTRY("repartition: node repartition level");
82  SET_VALID_ENTRY("repartition: min rows per proc");
83  SET_VALID_ENTRY("repartition: target rows per proc");
84  SET_VALID_ENTRY("repartition: min rows per thread");
85  SET_VALID_ENTRY("repartition: target rows per thread");
86  SET_VALID_ENTRY("repartition: max imbalance");
87 #undef SET_VALID_ENTRY
88 
89  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
90  validParamList->set< RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
91  validParamList->set< RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
92 
93  return validParamList;
94  }
95 
96  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
98  const Teuchos::ParameterList & pL = GetParameterList();
99  if(pL.isParameter("repartition: use map")) {
100  const bool useMap = pL.get<bool>("repartition: use map");
101  if (useMap)
102  Input(currentLevel, "Map");
103  else
104  Input(currentLevel, "A");
105  } else
106  Input(currentLevel, "A");
107  if(pL.isParameter("repartition: node repartition level")) {
108  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
109  if(currentLevel.GetLevelID() == nodeRepartLevel) {
110  Input(currentLevel,"Node Comm");
111  }
112  }
113  }
114 
115  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
117  FactoryMonitor m(*this, "Build", currentLevel);
118 
119  const Teuchos::ParameterList & pL = GetParameterList();
120  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
121  // TODO (JG): I don't really know if we want to do this.
122  const int startLevel = pL.get<int> ("repartition: start level");
123  const int nodeRepartLevel = pL.get<int> ("repartition: node repartition level");
124  LO minRowsPerProcess = pL.get<LO> ("repartition: min rows per proc");
125  LO targetRowsPerProcess = pL.get<LO> ("repartition: target rows per proc");
126  LO minRowsPerThread = pL.get<LO> ("repartition: min rows per thread");
127  LO targetRowsPerThread = pL.get<LO> ("repartition: target rows per thread");
128  const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
129  const bool useMap = pL.get<bool> ("repartition: use map");
130 
131  int thread_per_mpi_rank = 1;
132 #if defined(KOKKOS_ENABLE_OPENMP)
133  using execution_space = typename Node::device_type::execution_space;
134  if (std::is_same<execution_space, Kokkos::OpenMP>::value)
135  thread_per_mpi_rank = execution_space::concurrency();
136 #endif
137 
138  if (minRowsPerThread > 0)
139  // We ignore the value given by minRowsPerProcess and repartition based on threads instead
140  minRowsPerProcess = minRowsPerThread*thread_per_mpi_rank;
141 
142  if (targetRowsPerThread == 0)
143  targetRowsPerThread = minRowsPerThread;
144 
145  if (targetRowsPerThread > 0)
146  // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
147  targetRowsPerProcess = targetRowsPerThread*thread_per_mpi_rank;
148 
149  if (targetRowsPerProcess == 0)
150  targetRowsPerProcess = minRowsPerProcess;
151 
152  // Stick this on the level so Zoltan2Interface can use this later
153  Set<LO>(currentLevel,"repartition: heuristic target rows per process",targetRowsPerProcess);
154 
155  // Check for validity of the node repartition option
156  TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be less than or equal to 'repartition: start level'");
157 
158 
159  RCP<Matrix> A;
160  RCP<const FactoryBase> Afact;
161  RCP<const Map> map;
162  if (!useMap) {
163  Afact = GetFactory("A");
164  if(!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
165  Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
166  Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
167  GetOStream(Warnings) <<
168  "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must " \
169  "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! " \
170  "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. " \
171  "Please check the input. Make also sure that \"number of partitions\" is provided to " \
172  "the Interface class and the RepartitionFactory instance. Instead, we have a "<<Afact->description() << std::endl;
173  }
174  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
175  A = Get< RCP<Matrix> >(currentLevel, "A");
176  map = A->getRowMap();
177  } else
178  map = Get< RCP<const Map> >(currentLevel, "Map");
179 
180  // ======================================================================================================
181  // Determine whether partitioning is needed
182  // ======================================================================================================
183  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
184  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
185  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
186 
187  // Test0: Should we do node repartitioning?
188  if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
189  RCP<const Teuchos::Comm<int> > NodeComm = Get< RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
190  TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
191 
192  // If we only have one node, then we don't want to pop down to one rank
193  if(NodeComm()->getSize() != map->getComm()->getSize()) {
194  GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only"<<std::endl;
195  int nodeRank = NodeComm->getRank();
196 
197  // Do a reduction to get the total number of nodes
198  int isZero = (nodeRank == 0);
199  int numNodes=0;
200  Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
201  Set(currentLevel, "number of partitions", numNodes);
202  return;
203  }
204  }
205 
206  // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
207  if (currentLevel.GetLevelID() < startLevel) {
208  GetOStream(Statistics1) << "Repartitioning? NO:" <<
209  "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
210  ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;
211 
212  // a negative number of processors means: no repartitioning
213  Set(currentLevel, "number of partitions", -1);
214 
215  return;
216  }
217 
218  RCP<const Teuchos::Comm<int> > origComm = map->getComm();
219  RCP<const Teuchos::Comm<int> > comm = origComm;
220 
221  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
222  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
223  // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
224 
225  // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
226  {
227  if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
228  GetOStream(Statistics1) << "Repartitioning? NO:" <<
229  "\n comm size = 1" << std::endl;
230 
231  Set(currentLevel, "number of partitions", -1);
232  return;
233  }
234 
235  int numActiveProcesses = 0;
236  MueLu_sumAll(comm, Teuchos::as<int>((map->getNodeNumElements() > 0) ? 1 : 0), numActiveProcesses);
237 
238  if (numActiveProcesses == 1) {
239  GetOStream(Statistics1) << "Repartitioning? NO:" <<
240  "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;
241 
242  Set(currentLevel, "number of partitions", 1);
243  return;
244  }
245  }
246 
247  bool test3 = false, test4 = false;
248  std::string msg3, msg4;
249 
250  // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
251  // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
252  if (minRowsPerProcess > 0) {
253  LO numMyRows = Teuchos::as<LO>(map->getNodeNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
254  LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
255  MueLu_sumAll(comm, haveFewRows, numWithFewRows);
256  MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);
257 
258  // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
259  // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
260  // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
261  if (numWithFewRows > 0)
262  test3 = true;
263 
264  msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
265  }
266 
267  // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
268  if (!test3) {
269  if (useMap)
270  msg4 = "";
271  else {
272  GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
273  MueLu_maxAll(comm, numMyNnz, maxNnz);
274  MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
275  double imbalance = Teuchos::as<double>(maxNnz)/minNnz;
276 
277  if (imbalance > nonzeroImbalance)
278  test4 = true;
279 
280  msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
281  }
282  }
283 
284  if (!test3 && !test4) {
285  GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;
286 
287  // A negative number of partitions means: no repartitioning
288  Set(currentLevel, "number of partitions", -1);
289  return;
290  }
291 
292  GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;
293 
294  // ======================================================================================================
295  // Calculate number of partitions
296  // ======================================================================================================
297  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
298  // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
299 
300  // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
301  // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
302  // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
303  // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
304  // and produce the "Partition" array with the requested number of partitions.
305  const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
306  int numPartitions = 1;
307  if (globalNumRows >= targetRowsPerProcess) {
308  // Make sure that each CPU thread has approximately targetRowsPerProcess
309  numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
310  }
311  numPartitions = std::min(numPartitions, comm->getSize());
312 
313  Set(currentLevel, "number of partitions", numPartitions);
314 
315  GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
316  } // Build
317 } // namespace MueLu
318 
319 #endif //ifdef HAVE_MPI
320 #endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */
#define MueLu_sumAll(rcpComm, in, out)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
#define MueLu_maxAll(rcpComm, in, out)
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionHeuristicFactory needs, and the factories that generate that data...
Timer to be used in factories. Similar to Monitor but with additional timers.
Print more statistics.
void Build(Level &currentLevel) const
Build an object with this factory.
#define SET_VALID_ENTRY(name)
Namespace for MueLu classes and methods.
#define MueLu_minAll(rcpComm, in, out)
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:76
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Factory for building a thresholded operator.
Exception throws to report errors in the internal logical of the program.
Print all warning messages.
Factory for building coarse matrices.