MueLu  Version of the Day
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include <Teuchos_ScalarTraits.hpp>
52 #include <Teuchos_SerialDenseMatrix.hpp>
53 #include <Teuchos_SerialQRDenseSolver.hpp>
54 
56 
57 #include "MueLu_Aggregates_kokkos.hpp"
58 #include "MueLu_AmalgamationFactory.hpp"
59 #include "MueLu_AmalgamationInfo.hpp"
60 #include "MueLu_CoarseMapFactory_kokkos.hpp"
61 #include "MueLu_NullspaceFactory_kokkos.hpp"
62 #include "MueLu_PerfUtils.hpp"
63 #include "MueLu_Monitor.hpp"
64 #include "MueLu_Utilities_kokkos.hpp"
65 
66 namespace MueLu {
67 
68  namespace { // anonymous
69 
70  template<class LocalOrdinal, class RowType>
71  class ScanFunctor {
72  public:
73  ScanFunctor(RowType rows) : rows_(rows) { }
74 
75  KOKKOS_INLINE_FUNCTION
76  void operator()(const LocalOrdinal i, LocalOrdinal& upd, const bool& final) const {
77  upd += rows_(i);
78  if (final)
79  rows_(i) = upd;
80  }
81 
82  private:
83  RowType rows_;
84  };
85 
86  }
87 
88  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
89  RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
90  RCP<ParameterList> validParamList = rcp(new ParameterList());
91 
92  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
93  validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
94  validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
95  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
96  validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
97 
98  return validParamList;
99  }
100 
101  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
102  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
103  Input(fineLevel, "A");
104  Input(fineLevel, "Aggregates");
105  Input(fineLevel, "Nullspace");
106  Input(fineLevel, "UnAmalgamationInfo");
107  Input(fineLevel, "CoarseMap");
108  }
109 
110  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
111  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel) const {
112  return BuildP(fineLevel, coarseLevel);
113  }
114 
115  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
116  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
117  FactoryMonitor m(*this, "Build", coarseLevel);
118 
119  auto A = Get< RCP<Matrix> > (fineLevel, "A");
120  auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel, "Aggregates");
121  auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel, "UnAmalgamationInfo");
122  auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, "Nullspace");
123  auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
124 
125  RCP<Matrix> Ptentative;
126  RCP<MultiVector> coarseNullspace;
127  if (!aggregates->AggregatesCrossProcessors())
128  BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
129  else
130  BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
131 
132  // If available, use striding information of fine level matrix A for range
133  // map and coarseMap as domain map; otherwise use plain range map of
134  // Ptent = plain range map of A for range map and coarseMap as domain map.
135  // NOTE:
136  // The latter is not really safe, since there is no striding information
137  // for the range map. This is not really a problem, since striding
138  // information is always available on the intermedium levels and the
139  // coarsest levels.
140  if (A->IsView("stridedMaps") == true)
141  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
142  else
143  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
144 
145  Set(coarseLevel, "Nullspace", coarseNullspace);
146  Set(coarseLevel, "P", Ptentative);
147 
148  if (IsPrint(Statistics1)) {
149  RCP<ParameterList> params = rcp(new ParameterList());
150  params->set("printLoadBalancingInfo", true);
151  GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
152  }
153  }
154 
155  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
156  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
157  BuildPuncoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
158  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
159  RCP<const Map> rowMap = A->getRowMap();
160  RCP<const Map> colMap = A->getColMap();
161 
162  const size_t numRows = rowMap->getNodeNumElements();
163  const size_t NSDim = fineNullspace->getNumVectors();
164 
165  typedef Teuchos::ScalarTraits<SC> STS;
166  const SC zero = STS::zero();
167  const SC one = STS::one();
168  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
169 
170  auto aggGraph = aggregates->GetGraph();
171  auto aggRows = aggGraph.row_map;
172  auto aggCols = aggGraph.entries;
173  const GO numAggs = aggregates->GetNumAggregates();
174 
175  // Aggregates map is based on the amalgamated column map
176  // We can skip global-to-local conversion if LIDs in row map are
177  // same as LIDs in column map
178  bool goodMap = isGoodMap(*rowMap, *colMap);
179 #if 1
180  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError, "For now, need matching row and col maps");
181  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() != 1, Exceptions::RuntimeError, "For now, only block size 1");
182 
183  // For now, do a simple translation
184  // FIXME: only type is correct here, everything else is not
185  Kokkos::View<LO*, DeviceType> agg2RowMapLO("agg2row_map_LO", NSDim*aggCols.size()); // initialized to 0
186  if (NSDim == 1) {
187  for (LO i = 0; i < aggCols.size(); i++)
188  agg2RowMapLO(i) = aggCols(i);
189  } else {
190  for (LO i = 0; i < aggCols.size(); i++)
191  for (LO k = 0; k < NSDim; k++) {
192  // FIXME: this should be the proper transformation with indexBase and offsets
193  agg2RowMapLO(i*NSDim+k) = aggCols(i)*NSDim+k;
194  }
195  }
196 #else
197  ArrayRCP<LO> aggStart;
198  ArrayRCP<LO> aggToRowMapLO;
199  ArrayRCP<GO> aggToRowMapGO;
200  if (goodMap) {
201  amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO);
202  GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl;
203 
204  } else {
205  amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO);
206  GetOStream(Warnings0) << "Column map is not consistent with the row map\n"
207  << "using GO->LO conversion with performance penalty" << std::endl;
208  }
209 #endif
210 
211  // TODO
212  // Use TeamPolicy with scratch_memory_space for local QR
213  // Something along the lines:
214  //
215  // typedef TeamPolicy<ExecutionSpace>::member_type team_t;
216  // struct functor() {
217  // inline unsigned team_shmem_size( int team_size ) const {
218  // return view_type::shmem_size( team_size, 10, 3 );
219  // }
220  // KOKKOS_INLINE_FUNCTION
221  // void operator() (const team_t& team) const {
222  // view_type matrices(team.team_shmem(), team.team_size(), 10, 3);
223  // auto matrix = subview(matrices, team.team_rank(), Kokkos::ALL(), Kokkos::ALL());
224  // }
225  // }
226 
227  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
228 
229  // Pull out the nullspace vectors so that we can have random access
230  auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
231  auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
232 
233  size_t nnzEstimate = numRows * NSDim, nnz = 0;
234 
235  typedef typename Matrix::local_matrix_type local_matrix_type;
236  typedef typename local_matrix_type::row_map_type rows_type;
237  typedef typename local_matrix_type::index_type cols_type;
238  typedef typename local_matrix_type::values_type vals_type;
239 
240  // Stage 0: initialize auxilary arrays
241  // The main thing to notice is initialization of vals with INVALID. These
242  // values will later be used to compress the arrays
243  typename rows_type::non_const_type rowsAux("Ptent_aux_rows", numRows+1), rows("Ptent_rows", numRows+1);
244  typename cols_type::non_const_type colsAux("Ptent_aux_cols", nnzEstimate);
245  typename vals_type::non_const_type valsAux("Ptent_aux_vals", nnzEstimate);
246 
247  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", numRows+1, KOKKOS_LAMBDA(const LO row) {
248  rowsAux(row) = row*NSDim;
249  });
250  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", nnzEstimate, KOKKOS_LAMBDA(const LO j) {
251  colsAux(j) = INVALID;
252  valsAux(j) = zero;
253  });
254 
255  typedef Kokkos::View<int[10], DeviceType> status_type;
256  status_type status("status");
257 
258  // Stage 1: construct auxilary arrays.
259  // The constructed arrays may have gaps in them (vals(j) == INVALID)
260  // Run one thread per aggregate.
261  typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
262  typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
263  if (NSDim == 1) {
264  // 1D is special, as it is the easiest. We don't even need to the QR,
265  // just normalize an array. Plus, no worries abot small aggregates.
266  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_loop", numAggs, KOKKOS_LAMBDA(const GO agg, size_t& rowNnz) {
267  LO aggSize = aggRows(agg+1) - aggRows(agg);
268 
269  // Extract the piece of the nullspace corresponding to the aggregate, and
270  // put it in the flat array, "localQR" (in column major format) for the
271  // QR routine. Trivial in 1D.
272  if (goodMap) {
273  // Calculate QR by hand
274  typedef Kokkos::ArithTraits<SC> ATS;
275  typedef typename ATS::magnitudeType Magnitude;
276 
277  Magnitude norm = ATS::magnitude(zero);
278  for (size_t k = 0; k < aggSize; k++) {
279  Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
280  norm += dnorm*dnorm;
281  }
282  norm = sqrt(norm);
283 
284  if (norm == zero) {
285  // zero column; terminate the execution
286  statusAtomic(1) = true;
287  return;
288  }
289 
290  // R = norm
291  coarseNS(agg, 0) = norm;
292 
293  // Q = localQR(:,0)/norm
294  for (LO k = 0; k < aggSize; k++) {
295  LO localRow = agg2RowMapLO(aggRows(agg)+k);
296  SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
297 
298  size_t rowStart = rowsAux(localRow);
299  colsAux(rowStart) = agg;
300  valsAux(rowStart) = localVal;
301 
302  // Store true number of nonzeros per row
303  rows(localRow+1) = 1;
304  rowNnz += 1;
305  }
306 
307  } else {
308  // FIXME: implement non-standard map QR
309  // Look at the original TentativeP for how to do that
310  statusAtomic(0) = true;
311  return;
312  }
313  }, nnz);
314 
315  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
316  for (int i = 0; i < statusHost.size(); i++)
317  if (statusHost(i)) {
318  std::ostringstream oss;
319  oss << "MueLu::TentativePFactory::MakeTentative: ";
320  switch(i) {
321  case 0: oss << "!goodMap is not implemented"; break;
322  case 1: oss << "fine level NS part has a zero column"; break;
323  }
324  throw Exceptions::RuntimeError(oss.str());
325  }
326 
327  } else {
328  throw Exceptions::RuntimeError("Ignore NSDim > 1 for now");
329 #if 0
330  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_loop", numAggs, KOKKOS_LAMBDA(const GO agg, size_t& nnz) {
331  LO aggSize = aggRows(agg+1) - aggRows(agg);
332 
333  Xpetra::global_size_t offset = agg*NSDim;
334 
335  // Extract the piece of the nullspace corresponding to the aggregate, and
336  // put it in the flat array, "localQR" (in column major format) for the
337  // QR routine.
338  // FIXME: can I create views in parallel_regions? If not, I will need to create a view with max aggregate outside?
339  // Can I create local variables? Or do I need View of Views
340  Kokkos::View<SC**, DeviceType> localQR("localQR", aggSize, NSDim);
341  if (goodMap) {
342  for (size_t j = 0; j < NSDim; j++)
343  for (LO k = 0; k < aggSize; k++)
344  localQR(k,j) = fineNSRandom(agg2RowMapLO(aggRows(agg)+k), j);
345  } else {
346  statusAtomic(0) = true;
347  return;
348 #if 0
349  for (size_t j = 0; j < NSDim; j++)
350  for (LO k = 0; k < aggSize; k++)
351  // FIXME
352  localQR(k,j) = fineNS(rowMap->getLocalElement(aggToRowMapGO(aggStart(agg)+k)), j);
353 #endif
354  }
355 
356  // Test for zero columns
357  for (size_t j = 0; j < NSDim; j++) {
358  bool bIsZeroNSColumn = true;
359 
360  for (LO k = 0; k < aggSize; k++)
361  if (localQR(k,j) != zero)
362  bIsZeroNSColumn = false;
363 
364  if (bIsZeroNSColumn) {
365  statusAtomic(1) = true;
366  return;
367  }
368  }
369 
370  // Calculate QR decomposition (standard)
371  // NOTE: Q is stored in localQR and R is stored in coarseNS
372  if (aggSize >= NSDim) {
373 
374  if (NSDim == 1) {
375  // Only one nullspace vector, calculate Q and R by hand
376  typedef Kokkos::ArithTraits<SC> ATS;
377  typedef typename ATS::magnitudeType Magnitude;
378 
379  Magnitude norm = ATS::magnitude(zero);
380  for (size_t k = 0; k < Teuchos::as<size_t>(aggSize); k++)
381  norm += ATS::magnitude(localQR(k,0)*localQR(k,0));
382  norm = Kokkos::ArithTraits<Magnitude>::squareroot(norm);
383 
384  // R = norm
385  coarseNS(offset, 0) = norm;
386 
387  // Q = localQR(:,0)/norm
388  for (LO i = 0; i < aggSize; i++)
389  localQR(i,0) /= norm;
390 
391  } else {
392 #if 1
393  statusAtomic(2) = true;
394  return;
395 #else
396  // FIXME: Need Kokkos QR solver
397  Teuchos::SerialQRDenseSolver<LO,SC> qrSolver;
398  qrSolver.setMatrix(Teuchos::rcp(&localQR, false));
399  qrSolver.factor();
400 
401  // R = upper triangular part of localQR
402  for (size_t j = 0; j < NSDim; j++)
403  for (size_t k = 0; k <= j; k++)
404  coarseNS(offset+k,j) = localQR(k,j); //TODO is offset+k the correct local ID?!
405 
406  // Calculate Q, the tentative prolongator.
407  // The Lapack GEQRF call only works for myAggsize >= NSDim
408  qrSolver.formQ();
409  Teuchos::RCP<Teuchos::SerialDenseMatrix<LO,SC> > qFactor = qrSolver.getQ();
410  for (size_t j = 0; j < NSDim; j++)
411  for (size_t i = 0; i < Teuchos::as<size_t>(aggSize); i++)
412  localQR(i,j) = (*qFactor)(i,j);
413 #endif
414  }
415 
416  } else {
417  statusAtomic(3) = true;
418  return;
419 #if 0
420  // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics)
421 
422  // The local QR decomposition is not possible in the "overconstrained"
423  // case (i.e. number of columns in localQR > number of rowsAux), which
424  // corresponds to #DOFs in Aggregate < NSDim. For usual problems this
425  // is only possible for single node aggregates in structural mechanics.
426  // (Similar problems may arise in discontinuous Galerkin problems...)
427  // We bypass the QR decomposition and use an identity block in the
428  // tentative prolongator for the single node aggregate and transfer the
429  // corresponding fine level null space information 1-to-1 to the coarse
430  // level null space part.
431 
432  // NOTE: The resulting tentative prolongation operator has
433  // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular
434  // coarse level operator A. To deal with that one has the following
435  // options:
436  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
437  // false) to add some identity block to the diagonal of the zero rowsAux
438  // in the coarse level operator A, such that standard level smoothers
439  // can be used again.
440  // - Use special (projection-based) level smoothers, which can deal
441  // with singular matrices (very application specific)
442  // - Adapt the code below to avoid zero columns. However, we do not
443  // support a variable number of DOFs per node in MueLu/Xpetra which
444  // makes the implementation really hard.
445 
446  // R = extended (by adding identity rowsAux) localQR
447  for (size_t j = 0; j < NSDim; j++)
448  for (size_t k = 0; k < NSDim; k++)
449  if (k < as<size_t>(aggSize))
450  coarseNS[j][offset+k] = localQR(k,j);
451  else
452  coarseNS[j][offset+k] = (k == j ? one : zero);
453 
454  // Q = I (rectangular)
455  for (size_t i = 0; i < as<size_t>(aggSize); i++)
456  for (size_t j = 0; j < NSDim; j++)
457  localQR(i,j) = (j == i ? one : zero);
458 #endif
459  }
460 
461  // Process each row in the local Q factor
462  // FIXME: What happens if maps are block maps?
463  for (LO j = 0; j < aggSize; j++) {
464 #if 1
465  LO localRow = (goodMap ? agg2RowMapLO(aggRows(agg)+j) : -1);
466 #else
467  LO localRow = (goodMap ? agg2RowMapLO[aggRows(agg)+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]));
468 #endif
469 
470  size_t rowStart = rowsAux(localRow), lnnz = 0;
471  for (size_t k = 0; k < NSDim; k++) {
472  // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
473  if (localQR(j,k) != zero) {
474  colsAux(rowStart+lnnz) = offset + k;
475  valsAux(rowStart+lnnz) = localQR(j,k);
476  lnnz++;
477  }
478  }
479  // Store true number of nonzeros per row
480  rows(localRow+1) = lnnz;
481  }
482  }, nnz);
483 
484  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
485  for (int i = 0; i < statusHost.size(); i++)
486  if (statusHost(i)) {
487  std::ostringstream oss;
488  oss << "MueLu::TentativePFactory::MakeTentative: ";
489  switch(i) {
490  case 0: oss << "!goodMap is not implemented";
491  case 1: oss << "fine level NS part has a zero column";
492  case 2: oss << "NSDim > 1 is not implemented";
493  case 3: oss << "aggSize < NSDim is not imlemented";
494  }
495  throw Exceptions::RuntimeError(oss.str());
496  }
497 #endif
498  }
499 
500  // Stage 2: compress the arrays
501  ScanFunctor<LO,decltype(rows)> scanFunctor(rows);
502  Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", numRows+1, scanFunctor);
503 
504  // The real cols and vals are constructed using calculated (not estimated) nnz
505  typename cols_type::non_const_type cols("Ptent_cols", nnz);
506  typename vals_type::non_const_type vals("Ptent_vals", nnz);
507  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", numRows, KOKKOS_LAMBDA(const LO i) {
508  LO rowStart = rows(i);
509 
510  size_t lnnz = 0;
511  for (LO j = rowsAux(i); j < rowsAux(i+1); j++)
512  if (valsAux(j) != INVALID) {
513  cols(rowStart+lnnz) = colsAux(j);
514  vals(rowStart+lnnz) = valsAux(j);
515  lnnz++;
516  }
517  });
518 
519  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
520 
521  // Stage 3: construct Xpetra::Matrix
522  // FIXME: For now, we simply copy-paste arrays. The proper way to do that
523  // would be to construct a Kokkos CrsMatrix, and then construct
524  // Xpetra::Matrix out of that.
525  Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0, Xpetra::StaticProfile));
526  RCP<CrsMatrix> PtentCrs = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
527 
528  ArrayRCP<size_t> iaPtent;
529  ArrayRCP<LO> jaPtent;
530  ArrayRCP<SC> valPtent;
531 
532  PtentCrs->allocateAllValues(nnz, iaPtent, jaPtent, valPtent);
533 
534  ArrayView<size_t> ia = iaPtent();
535  ArrayView<LO> ja = jaPtent();
536  ArrayView<SC> val = valPtent();
537 
538  // Copy values
539  typename rows_type::HostMirror rowsHost = Kokkos::create_mirror_view(rows);
540  typename cols_type::HostMirror colsHost = Kokkos::create_mirror_view(cols);
541  typename vals_type::HostMirror valsHost = Kokkos::create_mirror_view(vals);
542  for (LO i = 0; i < rowsHost.size(); i++)
543  ia[i] = rowsHost(i);
544  for (LO j = 0; j < colsHost.size(); j++) {
545  ja [j] = colsHost(j);
546  val[j] = valsHost(j);
547  }
548 
549  PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
550  PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
551  }
552 
553  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
554  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
555  BuildPcoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
556  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
557  throw Exceptions::RuntimeError("Construction of coupled tentative P is not implemented");
558  }
559 
560  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
561  bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::isGoodMap(const Map& rowMap, const Map& colMap) const {
562  ArrayView<const GO> rowElements = rowMap.getNodeElementList();
563  ArrayView<const GO> colElements = colMap.getNodeElementList();
564 
565  const size_t numElements = rowElements.size();
566 
567  bool goodMap = true;
568  for (size_t i = 0; i < numElements; i++)
569  if (rowElements[i] != colElements[i]) {
570  goodMap = false;
571  break;
572  }
573 
574  return goodMap;
575  }
576 
577 } //namespace MueLu
578 
579 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
580 #endif // HAVE_MUELU_KOKKOS_REFACTOR
581 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
GlobalOrdinal GO
Print more statistics.
LocalOrdinal LO
Namespace for MueLu classes and methods.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
Scalar SC
Description of what is happening (more verbose)