MueLu  Version of the Day
MueLu_SemiCoarsenPFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include <stdlib.h>
52 
53 #include <Kokkos_Core.hpp>
54 
55 #include <KokkosBatched_LU_Decl.hpp>
56 #include <KokkosBatched_LU_Serial_Impl.hpp>
57 #include <KokkosBatched_Trsm_Decl.hpp>
58 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
59 #include <KokkosBatched_Util.hpp>
60 #include <KokkosSparse_CrsMatrix.hpp>
61 
62 #include <Xpetra_CrsMatrixWrap.hpp>
63 #include <Xpetra_ImportFactory.hpp>
64 #include <Xpetra_Matrix.hpp>
65 #include <Xpetra_MultiVectorFactory.hpp>
66 #include <Xpetra_VectorFactory.hpp>
67 
69 #include "MueLu_MasterList.hpp"
70 #include "MueLu_Monitor.hpp"
72 
73 namespace MueLu {
74 
75 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
76  class DeviceType>
77 RCP<const ParameterList>
78 SemiCoarsenPFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal,
79  Kokkos::Compat::KokkosDeviceWrapperNode<
80  DeviceType>>::GetValidParameterList() const {
81  RCP<ParameterList> validParamList = rcp(new ParameterList());
82 
83  std::string name = "semicoarsen: coarsen rate";
84  validParamList->setEntry(name, MasterList::getEntry(name));
85  validParamList->set<RCP<const FactoryBase>>(
86  "A", Teuchos::null, "Generating factory of the matrix A");
87  validParamList->set<RCP<const FactoryBase>>(
88  "Nullspace", Teuchos::null, "Generating factory of the nullspace");
89  validParamList->set<RCP<const FactoryBase>>(
90  "Coordinates", Teuchos::null, "Generating factory for coordinates");
91 
92  validParamList->set<RCP<const FactoryBase>>(
93  "LineDetection_VertLineIds", Teuchos::null,
94  "Generating factory for LineDetection vertical line ids");
95  validParamList->set<RCP<const FactoryBase>>(
96  "LineDetection_Layers", Teuchos::null,
97  "Generating factory for LineDetection layer ids");
98  validParamList->set<RCP<const FactoryBase>>(
99  "CoarseNumZLayers", Teuchos::null,
100  "Generating factory for number of coarse z-layers");
101 
102  return validParamList;
103 }
104 
105 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
106  class DeviceType>
107 void SemiCoarsenPFactory_kokkos<
109  Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
110  DeclareInput(Level &fineLevel, Level & /* coarseLevel */) const {
111  Input(fineLevel, "A");
112  Input(fineLevel, "Nullspace");
113 
114  Input(fineLevel, "LineDetection_VertLineIds");
115  Input(fineLevel, "LineDetection_Layers");
116  Input(fineLevel, "CoarseNumZLayers");
117 
118  // check whether fine level coordinate information is available.
119  // If yes, request the fine level coordinates and generate coarse coordinates
120  // during the Build call
121  if (fineLevel.GetLevelID() == 0) {
122  if (fineLevel.IsAvailable("Coordinates", NoFactory::get())) {
123  fineLevel.DeclareInput("Coordinates", NoFactory::get(), this);
124  bTransferCoordinates_ = true;
125  }
126  } else if (bTransferCoordinates_ == true) {
127  // on coarser levels we check the default factory providing "Coordinates"
128  // or the factory declared to provide "Coordinates"
129  // first, check which factory is providing coordinate information
130  RCP<const FactoryBase> myCoordsFact = GetFactory("Coordinates");
131  if (myCoordsFact == Teuchos::null) {
132  myCoordsFact = fineLevel.GetFactoryManager()->GetFactory("Coordinates");
133  }
134  if (fineLevel.IsAvailable("Coordinates", myCoordsFact.get())) {
135  fineLevel.DeclareInput("Coordinates", myCoordsFact.get(), this);
136  }
137  }
138 }
139 
140 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
141  class DeviceType>
142 void SemiCoarsenPFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal,
143  Kokkos::Compat::KokkosDeviceWrapperNode<
144  DeviceType>>::Build(Level &fineLevel,
145  Level &coarseLevel)
146  const {
147  return BuildP(fineLevel, coarseLevel);
148 }
149 
150 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
151  class DeviceType>
152 void SemiCoarsenPFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal,
153  Kokkos::Compat::KokkosDeviceWrapperNode<
154  DeviceType>>::BuildP(Level &fineLevel,
155  Level &coarseLevel)
156  const {
157  FactoryMonitor m(*this, "Build", coarseLevel);
158 
159  // obtain general variables
160  RCP<Matrix> A = Get<RCP<Matrix>>(fineLevel, "A");
161  RCP<MultiVector> fineNullspace =
162  Get<RCP<MultiVector>>(fineLevel, "Nullspace");
163 
164  // get user-provided coarsening rate parameter (constant over all levels)
165  const ParameterList &pL = GetParameterList();
166  LO CoarsenRate = as<LO>(pL.get<int>("semicoarsen: coarsen rate"));
167  TEUCHOS_TEST_FOR_EXCEPTION(
168  CoarsenRate < 2, Exceptions::RuntimeError,
169  "semicoarsen: coarsen rate must be greater than 1");
170 
171  // collect general input data
172  LO BlkSize = A->GetFixedBlockSize();
173  RCP<const Map> rowMap = A->getRowMap();
174  LO Ndofs = rowMap->getNodeNumElements();
175  LO Nnodes = Ndofs / BlkSize;
176 
177  // collect line detection information generated by the LineDetectionFactory
178  // instance
179  LO FineNumZLayers = Get<LO>(fineLevel, "CoarseNumZLayers");
180  Teuchos::ArrayRCP<LO> TVertLineId =
181  Get<Teuchos::ArrayRCP<LO>>(fineLevel, "LineDetection_VertLineIds");
182  Teuchos::ArrayRCP<LO> TLayerId =
183  Get<Teuchos::ArrayRCP<LO>>(fineLevel, "LineDetection_Layers");
184 
185  // compute number of coarse layers
186  TEUCHOS_TEST_FOR_EXCEPTION(FineNumZLayers < 2, Exceptions::RuntimeError,
187  "Cannot coarsen further");
188  using coordT = typename Teuchos::ScalarTraits<Scalar>::coordinateType;
189  LO CoarseNumZLayers =
190  (LO)floor(((coordT)(FineNumZLayers + 1)) / ((coordT)CoarsenRate) - 1.0);
191  if (CoarseNumZLayers < 1)
192  CoarseNumZLayers = 1;
193 
194  // generate transfer operator with semicoarsening
195  RCP<Matrix> P;
196  RCP<MultiVector> coarseNullspace;
197  BuildSemiCoarsenP(coarseLevel, Ndofs, Nnodes, BlkSize, FineNumZLayers,
198  CoarseNumZLayers, TLayerId, TVertLineId, A, fineNullspace, P,
199  coarseNullspace);
200 
201  // Store number of coarse z-layers on the coarse level container
202  // This information is used by the LineDetectionAlgorithm
203  // TODO get rid of the NoFactory
204  coarseLevel.Set("NumZLayers", Teuchos::as<LO>(CoarseNumZLayers),
206 
207  // store semicoarsening transfer on coarse level
208  Set(coarseLevel, "P", P);
209  Set(coarseLevel, "Nullspace", coarseNullspace);
210 
211  // transfer coordinates
212  if (bTransferCoordinates_) {
213  SubFactoryMonitor m2(*this, "TransferCoordinates", coarseLevel);
214  typedef Xpetra::MultiVector<
215  typename Teuchos::ScalarTraits<Scalar>::coordinateType, LO, GO, NO>
216  xdMV;
217  RCP<xdMV> fineCoords = Teuchos::null;
218  if (fineLevel.GetLevelID() == 0 &&
219  fineLevel.IsAvailable("Coordinates", NoFactory::get())) {
220  fineCoords = fineLevel.Get<RCP<xdMV>>("Coordinates", NoFactory::get());
221  } else {
222  RCP<const FactoryBase> myCoordsFact = GetFactory("Coordinates");
223  if (myCoordsFact == Teuchos::null) {
224  myCoordsFact = fineLevel.GetFactoryManager()->GetFactory("Coordinates");
225  }
226  if (fineLevel.IsAvailable("Coordinates", myCoordsFact.get())) {
227  fineCoords =
228  fineLevel.Get<RCP<xdMV>>("Coordinates", myCoordsFact.get());
229  }
230  }
231 
232  TEUCHOS_TEST_FOR_EXCEPTION(fineCoords == Teuchos::null,
233  Exceptions::RuntimeError,
234  "No Coordinates found provided by the user.");
235 
236  TEUCHOS_TEST_FOR_EXCEPTION(fineCoords->getNumVectors() != 3,
237  Exceptions::RuntimeError,
238  "Three coordinates arrays must be supplied if "
239  "line detection orientation not given.");
240  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> x =
241  fineCoords->getDataNonConst(0);
242  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> y =
243  fineCoords->getDataNonConst(1);
244  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> z =
245  fineCoords->getDataNonConst(2);
246 
247  // determine the maximum and minimum z coordinate value on the current
248  // processor.
249  typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_max =
250  -Teuchos::ScalarTraits<
251  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
252  Teuchos::ScalarTraits<
253  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
254  typename Teuchos::ScalarTraits<Scalar>::coordinateType zval_min =
255  Teuchos::ScalarTraits<
256  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::one() /
257  Teuchos::ScalarTraits<
258  typename Teuchos::ScalarTraits<Scalar>::coordinateType>::sfmin();
259  for (auto it = z.begin(); it != z.end(); ++it) {
260  if (*it > zval_max)
261  zval_max = *it;
262  if (*it < zval_min)
263  zval_min = *it;
264  }
265 
266  LO myCoarseZLayers = Teuchos::as<LO>(CoarseNumZLayers);
267 
268  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType>
269  myZLayerCoords = Teuchos::arcp<
270  typename Teuchos::ScalarTraits<Scalar>::coordinateType>(
271  myCoarseZLayers);
272  if (myCoarseZLayers == 1) {
273  myZLayerCoords[0] = zval_min;
274  } else {
275  typename Teuchos::ScalarTraits<Scalar>::coordinateType dz =
276  (zval_max - zval_min) / (myCoarseZLayers - 1);
277  for (LO k = 0; k < myCoarseZLayers; ++k) {
278  myZLayerCoords[k] = k * dz;
279  }
280  }
281 
282  // Note, that the coarse level node coordinates have to be in vertical
283  // ordering according to the numbering of the vertical lines
284 
285  // number of vertical lines on current node:
286  LO numVertLines = Nnodes / FineNumZLayers;
287  LO numLocalCoarseNodes = numVertLines * myCoarseZLayers;
288 
289  RCP<const Map> coarseCoordMap = MapFactory::Build(
290  fineCoords->getMap()->lib(),
291  Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
292  Teuchos::as<size_t>(numLocalCoarseNodes),
293  fineCoords->getMap()->getIndexBase(), fineCoords->getMap()->getComm());
294  RCP<xdMV> coarseCoords = Xpetra::MultiVectorFactory<
295  typename Teuchos::ScalarTraits<Scalar>::coordinateType, LO, GO,
296  NO>::Build(coarseCoordMap, fineCoords->getNumVectors());
297  coarseCoords->putScalar(-1.0);
298  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cx =
299  coarseCoords->getDataNonConst(0);
300  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cy =
301  coarseCoords->getDataNonConst(1);
302  ArrayRCP<typename Teuchos::ScalarTraits<Scalar>::coordinateType> cz =
303  coarseCoords->getDataNonConst(2);
304 
305  // loop over all vert line indices (stop as soon as possible)
306  LO cntCoarseNodes = 0;
307  for (LO vt = 0; vt < TVertLineId.size(); ++vt) {
308  // vertical line id in *vt
309  LO curVertLineId = TVertLineId[vt];
310 
311  if (cx[curVertLineId * myCoarseZLayers] == -1.0 &&
312  cy[curVertLineId * myCoarseZLayers] == -1.0) {
313  // loop over all local myCoarseZLayers
314  for (LO n = 0; n < myCoarseZLayers; ++n) {
315  cx[curVertLineId * myCoarseZLayers + n] = x[vt];
316  cy[curVertLineId * myCoarseZLayers + n] = y[vt];
317  cz[curVertLineId * myCoarseZLayers + n] = myZLayerCoords[n];
318  }
319  cntCoarseNodes += myCoarseZLayers;
320  }
321 
322  TEUCHOS_TEST_FOR_EXCEPTION(cntCoarseNodes > numLocalCoarseNodes,
323  Exceptions::RuntimeError,
324  "number of coarse nodes is inconsistent.");
325  if (cntCoarseNodes == numLocalCoarseNodes)
326  break;
327  }
328 
329  // set coarse level coordinates
330  Set(coarseLevel, "Coordinates", coarseCoords);
331  } /* end bool bTransferCoordinates */
332 }
333 
334 template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
335  class DeviceType>
336 void SemiCoarsenPFactory_kokkos<
338  Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
339  BuildSemiCoarsenP(Level &coarseLevel, const LO NFRows, const LO NFNodes,
340  const LO DofsPerNode, const LO NFLayers,
341  const LO NCLayers, const ArrayRCP<LO> LayerId,
342  const ArrayRCP<LO> VertLineId, const RCP<Matrix> &Amat,
343  const RCP<MultiVector> fineNullspace, RCP<Matrix> &P,
344  RCP<MultiVector> &coarseNullspace) const {
345  SubFactoryMonitor m2(*this, "BuildSemiCoarsenP", coarseLevel);
346  using impl_SC = typename Kokkos::ArithTraits<SC>::val_type;
347  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
348  using LOView1D = Kokkos::View<LO *, DeviceType>;
349  using LOView2D = Kokkos::View<LO **, DeviceType>;
350 
351  // Construct a map from fine level column to layer ids (including ghost nodes)
352  // Note: this is needed to sum all couplings within a layer
353  const auto FCol2LayerVector =
354  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getColMap());
355  const auto localTemp =
356  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getDomainMap());
357  RCP<const Import> importer = Amat->getCrsGraph()->getImporter();
358  if (importer == Teuchos::null)
359  importer = ImportFactory::Build(Amat->getDomainMap(), Amat->getColMap());
360  {
361  // Fill local temp with layer ids and fill ghost nodes
362  const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
363  for (int row = 0; row < NFRows; row++)
364  localTempHost(row, 0) = LayerId[row / DofsPerNode];
365  const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
366  Kokkos::deep_copy(localTempView, localTempHost);
367  FCol2LayerVector->doImport(*localTemp, *(importer), Xpetra::INSERT);
368  }
369  const auto FCol2LayerView = FCol2LayerVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
370  const auto FCol2Layer = Kokkos::subview(FCol2LayerView, Kokkos::ALL(), 0);
371 
372  // Construct a map from fine level column to local dof per node id (including
373  // ghost nodes) Note: this is needed to sum all couplings within a layer
374  const auto FCol2DofVector =
375  Xpetra::VectorFactory<LO, LO, GO, NO>::Build(Amat->getColMap());
376  {
377  // Fill local temp with local dof per node ids and fill ghost nodes
378  const auto localTempHost = localTemp->getHostLocalView(Xpetra::Access::ReadWrite);
379  for (int row = 0; row < NFRows; row++)
380  localTempHost(row, 0) = row % DofsPerNode;
381  const auto localTempView = localTemp->getDeviceLocalView(Xpetra::Access::ReadWrite);
382  Kokkos::deep_copy(localTempView, localTempHost);
383  FCol2DofVector->doImport(*localTemp, *(importer), Xpetra::INSERT);
384  }
385  const auto FCol2DofView = FCol2DofVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
386  const auto FCol2Dof = Kokkos::subview(FCol2DofView, Kokkos::ALL(), 0);
387 
388  // Compute NVertLines
389  // TODO: Read this from line detection factory
390  int NVertLines = -1;
391  if (NFNodes != 0)
392  NVertLines = VertLineId[0];
393  for (int node = 1; node < NFNodes; ++node)
394  if (VertLineId[node] > NVertLines)
395  NVertLines = VertLineId[node];
396  NVertLines++;
397 
398  // Construct a map from Line, Layer ids to fine level node
399  LOView2D LineLayer2Node("LineLayer2Node", NVertLines, NFLayers);
400  typename LOView2D::HostMirror LineLayer2NodeHost =
401  Kokkos::create_mirror_view(LineLayer2Node);
402  for (int node = 0; node < NFNodes; ++node)
403  LineLayer2NodeHost(VertLineId[node], LayerId[node]) = node;
404  Kokkos::deep_copy(LineLayer2Node, LineLayer2NodeHost);
405 
406  // Construct a map from coarse layer id to fine layer id
407  LOView1D CLayer2FLayer("CLayer2FLayer", NCLayers);
408  typename LOView1D::HostMirror CLayer2FLayerHost =
409  Kokkos::create_mirror_view(CLayer2FLayer);
410  using coordT = typename Teuchos::ScalarTraits<Scalar>::coordinateType;
411  const LO FirstStride =
412  (LO)ceil(((coordT)(NFLayers + 1)) / ((coordT)(NCLayers + 1)));
413  const coordT RestStride =
414  ((coordT)(NFLayers - FirstStride + 1)) / ((coordT)NCLayers);
415  const LO NCpts =
416  (LO)floor((((coordT)(NFLayers - FirstStride + 1)) / RestStride) + .00001);
417  TEUCHOS_TEST_FOR_EXCEPTION(NCLayers != NCpts, Exceptions::RuntimeError,
418  "sizes do not match.");
419  coordT stride = (coordT)FirstStride;
420  for (int clayer = 0; clayer < NCLayers; ++clayer) {
421  CLayer2FLayerHost(clayer) = (LO)floor(stride) - 1;
422  stride += RestStride;
423  }
424  Kokkos::deep_copy(CLayer2FLayer, CLayer2FLayerHost);
425 
426  // Compute start layer and stencil sizes for layer interpolation at each
427  // coarse layer
428  int MaxStencilSize = 1;
429  LOView1D CLayer2StartLayer("CLayer2StartLayer", NCLayers);
430  LOView1D CLayer2StencilSize("CLayer2StencilSize", NCLayers);
431  typename LOView1D::HostMirror CLayer2StartLayerHost =
432  Kokkos::create_mirror_view(CLayer2StartLayer);
433  typename LOView1D::HostMirror CLayer2StencilSizeHost =
434  Kokkos::create_mirror_view(CLayer2StencilSize);
435  for (int clayer = 0; clayer < NCLayers; ++clayer) {
436  const int startLayer = (clayer > 0) ? CLayer2FLayerHost(clayer - 1) + 1 : 0;
437  const int stencilSize = (clayer < NCLayers - 1)
438  ? CLayer2FLayerHost(clayer + 1) - startLayer
439  : NFLayers - startLayer;
440 
441  if (MaxStencilSize < stencilSize)
442  MaxStencilSize = stencilSize;
443  CLayer2StartLayerHost(clayer) = startLayer;
444  CLayer2StencilSizeHost(clayer) = stencilSize;
445  }
446  Kokkos::deep_copy(CLayer2StartLayer, CLayer2StartLayerHost);
447  Kokkos::deep_copy(CLayer2StencilSize, CLayer2StencilSizeHost);
448 
449  // Allocate storage for the coarse layer interpolation matrices on all
450  // vertical lines Note: Contributions to each matrix are collapsed to vertical
451  // lines. Thus, each vertical line gives rise to a block tridiagonal matrix.
452  // Here we store the full matrix to be compatible with kokkos kernels batch LU
453  // and tringular solve.
454  int Nmax = MaxStencilSize * DofsPerNode;
456  "BandMat", NVertLines, Nmax, Nmax);
458  "BandSol", NVertLines, Nmax, DofsPerNode);
459 
460  // Precompute number of nonzeros in prolongation matrix and allocate P views
461  // Note: Each coarse dof (NVertLines*NCLayers*DofsPerNode) contributes an
462  // interpolation stencil (StencilSize*DofsPerNode)
463  int NnzP = 0;
464  for (int clayer = 0; clayer < NCLayers; ++clayer)
465  NnzP += CLayer2StencilSizeHost(clayer);
466  NnzP *= NVertLines * DofsPerNode * DofsPerNode;
467  Kokkos::View<impl_SC *, DeviceType> Pvals("Pvals", NnzP);
468  Kokkos::View<LO *, DeviceType> Pcols("Pcols", NnzP);
469 
470  // Precompute Pptr
471  // Note: Each coarse layer stencil dof contributes DofsPerNode to the
472  // corresponding row in P
473  Kokkos::View<size_t *, DeviceType> Pptr("Pptr", NFRows + 1);
475  Kokkos::create_mirror_view(Pptr);
476  Kokkos::deep_copy(PptrHost, 0);
477  for (int line = 0; line < NVertLines; ++line) {
478  for (int clayer = 0; clayer < NCLayers; ++clayer) {
479  const int stencilSize = CLayer2StencilSizeHost(clayer);
480  const int startLayer = CLayer2StartLayerHost(clayer);
481  for (int snode = 0; snode < stencilSize; ++snode) {
482  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
483  const int layer = startLayer + snode;
484  const int AmatBlkRow = LineLayer2NodeHost(line, layer);
485  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
486  PptrHost(AmatRow + 1) += DofsPerNode;
487  }
488  }
489  }
490  }
491  for (int i = 2; i < NFRows + 1; ++i)
492  PptrHost(i) += PptrHost(i - 1);
493  TEUCHOS_TEST_FOR_EXCEPTION(NnzP != (int)PptrHost(NFRows),
494  Exceptions::RuntimeError,
495  "Number of nonzeros in P does not match");
496  Kokkos::deep_copy(Pptr, PptrHost);
497 
498  // Precompute Pptr offsets
499  // Note: These are used to determine the nonzero index in Pvals and Pcols
501  "layerBuckets", NFLayers);
502  Kokkos::deep_copy(layerBuckets, 0);
503  LOView2D CLayerSNode2PptrOffset("CLayerSNode2PptrOffset", NCLayers,
504  MaxStencilSize);
505  typename LOView2D::HostMirror CLayerSNode2PptrOffsetHost =
506  Kokkos::create_mirror_view(CLayerSNode2PptrOffset);
507  for (int clayer = 0; clayer < NCLayers; ++clayer) {
508  const int stencilSize = CLayer2StencilSizeHost(clayer);
509  const int startLayer = CLayer2StartLayerHost(clayer);
510  for (int snode = 0; snode < stencilSize; ++snode) {
511  const int layer = startLayer + snode;
512  CLayerSNode2PptrOffsetHost(clayer, snode) = layerBuckets(layer);
513  layerBuckets(layer)++;
514  }
515  }
516  Kokkos::deep_copy(CLayerSNode2PptrOffset, CLayerSNode2PptrOffsetHost);
517 
518  { // Fill P - fill and solve each block tridiagonal system and fill P views
519  SubFactoryMonitor m3(*this, "Fill P", coarseLevel);
520 
521  const auto localAmat = Amat->getLocalMatrixDevice();
522  const auto zero = impl_ATS::zero();
523  const auto one = impl_ATS::one();
524 
525  using range_policy = Kokkos::RangePolicy<execution_space>;
526  Kokkos::parallel_for(
527  "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Fill P",
528  range_policy(0, NVertLines), KOKKOS_LAMBDA(const int line) {
529  for (int clayer = 0; clayer < NCLayers; ++clayer) {
530 
531  // Initialize BandSol
532  auto bandSol =
533  Kokkos::subview(BandSol, line, Kokkos::ALL(), Kokkos::ALL());
534  for (int row = 0; row < Nmax; ++row)
535  for (int dof = 0; dof < DofsPerNode; ++dof)
536  bandSol(row, dof) = zero;
537 
538  // Initialize BandMat (set unused row diagonal to 1.0)
539  const int stencilSize = CLayer2StencilSize(clayer);
540  const int N = stencilSize * DofsPerNode;
541  auto bandMat =
542  Kokkos::subview(BandMat, line, Kokkos::ALL(), Kokkos::ALL());
543  for (int row = 0; row < Nmax; ++row)
544  for (int col = 0; col < Nmax; ++col)
545  bandMat(row, col) =
546  (row == col && row >= N) ? one : zero;
547 
548  // Loop over layers in stencil and fill banded matrix and rhs
549  const int flayer = CLayer2FLayer(clayer);
550  const int startLayer = CLayer2StartLayer(clayer);
551  for (int snode = 0; snode < stencilSize; ++snode) {
552 
553  const int layer = startLayer + snode;
554  if (layer == flayer) { // If layer in stencil is a coarse layer
555  for (int dof = 0; dof < DofsPerNode; ++dof) {
556  const int row = snode * DofsPerNode + dof;
557  bandMat(row, row) = one;
558  bandSol(row, dof) = one;
559  }
560  } else { // Not a coarse layer
561  const int AmatBlkRow = LineLayer2Node(line, layer);
562  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
563 
564  // Get Amat row info
565  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
566  const auto localAmatRow = localAmat.rowConst(AmatRow);
567  const int AmatRowLeng = localAmatRow.length;
568 
569  const int row = snode * DofsPerNode + dofi;
570  for (int dofj = 0; dofj < DofsPerNode; ++dofj) {
571  const int col = snode * DofsPerNode + dofj;
572 
573  // Sum values along row which correspond to stencil
574  // layer/dof and fill bandMat
575  auto val = zero;
576  for (int i = 0; i < AmatRowLeng; ++i) {
577  const int colidx = localAmatRow.colidx(i);
578  if (FCol2Layer(colidx) == layer &&
579  FCol2Dof(colidx) == dofj)
580  val += localAmatRow.value(i);
581  }
582  bandMat(row, col) = val;
583 
584  if (snode > 0) {
585  // Sum values along row which correspond to stencil
586  // layer/dof below and fill bandMat
587  val = zero;
588  for (int i = 0; i < AmatRowLeng; ++i) {
589  const int colidx = localAmatRow.colidx(i);
590  if (FCol2Layer(colidx) == layer - 1 &&
591  FCol2Dof(colidx) == dofj)
592  val += localAmatRow.value(i);
593  }
594  bandMat(row, col - DofsPerNode) = val;
595  }
596 
597  if (snode < stencilSize - 1) {
598  // Sum values along row which correspond to stencil
599  // layer/dof above and fill bandMat
600  val = zero;
601  for (int i = 0; i < AmatRowLeng; ++i) {
602  const int colidx = localAmatRow.colidx(i);
603  if (FCol2Layer(colidx) == layer + 1 &&
604  FCol2Dof(colidx) == dofj)
605  val += localAmatRow.value(i);
606  }
607  bandMat(row, col + DofsPerNode) = val;
608  }
609  }
610  }
611  }
612  }
613 
614  // Batch LU and triangular solves
615  namespace KB = KokkosBatched;
616  using lu_type = typename KB::SerialLU<KB::Algo::LU::Unblocked>;
617  lu_type::invoke(bandMat);
618  using trsv_l_type =
619  typename KB::SerialTrsm<KB::Side::Left, KB::Uplo::Lower,
620  KB::Trans::NoTranspose, KB::Diag::Unit,
621  KB::Algo::Trsm::Unblocked>;
622  trsv_l_type::invoke(one, bandMat, bandSol);
623  using trsv_u_type = typename KB::SerialTrsm<
624  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose,
625  KB::Diag::NonUnit, KB::Algo::Trsm::Unblocked>;
626  trsv_u_type::invoke(one, bandMat, bandSol);
627 
628  // Fill prolongation views with solution
629  for (int snode = 0; snode < stencilSize; ++snode) {
630  for (int dofj = 0; dofj < DofsPerNode; ++dofj) {
631  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
632  const int layer = startLayer + snode;
633  const int AmatBlkRow = LineLayer2Node(line, layer);
634  const int AmatRow = AmatBlkRow * DofsPerNode + dofi;
635 
636  const int pptrOffset = CLayerSNode2PptrOffset(clayer, snode);
637  const int pptr =
638  Pptr(AmatRow) + pptrOffset * DofsPerNode + dofj;
639 
640  const int col =
641  line * NCLayers + clayer; // coarse node (block row) index
642  Pcols(pptr) = col * DofsPerNode + dofj;
643  Pvals(pptr) = bandSol(snode * DofsPerNode + dofi, dofj);
644  }
645  }
646  }
647  }
648  });
649  } // Fill P
650 
651  // Build P
652  RCP<const Map> rowMap = Amat->getRowMap();
653  Xpetra::global_size_t GNdofs = rowMap->getGlobalNumElements();
654  Xpetra::global_size_t itemp = GNdofs / NFLayers;
655  std::vector<size_t> stridingInfo_{(size_t)DofsPerNode};
656  RCP<const Map> coarseMap = StridedMapFactory::Build(
657  rowMap->lib(), NCLayers * itemp, NCLayers * NVertLines * DofsPerNode, 0,
658  stridingInfo_, rowMap->getComm(), -1, 0);
659  P = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0));
660  RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
661  PCrs->setAllValues(Pptr, Pcols, Pvals);
662  PCrs->expertStaticFillComplete(coarseMap, Amat->getDomainMap());
663 
664  // set StridingInformation of P
665  if (Amat->IsView("stridedMaps") == true)
666  P->CreateView("stridedMaps", Amat->getRowMap("stridedMaps"), coarseMap);
667  else
668  P->CreateView("stridedMaps", P->getRangeMap(), coarseMap);
669 
670  // Construct coarse nullspace and inject fine nullspace
671  coarseNullspace =
672  MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors());
673  const int numVectors = fineNullspace->getNumVectors();
674  const auto fineNullspaceView = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadOnly);
675  const auto coarseNullspaceView = coarseNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
676  using range_policy = Kokkos::RangePolicy<execution_space>;
677  Kokkos::parallel_for(
678  "MueLu::SemiCoarsenPFactory_kokkos::BuildSemiCoarsenP Inject Nullspace",
679  range_policy(0, NVertLines), KOKKOS_LAMBDA(const int line) {
680  for (int clayer = 0; clayer < NCLayers; ++clayer) {
681  const int layer = CLayer2FLayer(clayer);
682  const int AmatBlkRow =
683  LineLayer2Node(line, layer); // fine node (block row) index
684  const int col =
685  line * NCLayers + clayer; // coarse node (block row) index
686  for (int k = 0; k < numVectors; ++k) {
687  for (int dofi = 0; dofi < DofsPerNode; ++dofi) {
688  const int fRow = AmatBlkRow * DofsPerNode + dofi;
689  const int cRow = col * DofsPerNode + dofi;
690  coarseNullspaceView(cRow, k) = fineNullspaceView(fRow, k);
691  }
692  }
693  }
694  });
695 }
696 
697 } // namespace MueLu
698 
699 #define MUELU_SEMICOARSENPFACTORY_KOKKOS_SHORT
700 #endif // HAVE_MUELU_KOKKOS_REFACTOR
701 #endif // MUELU_SEMICOARSENPFACTORY_KOKKOS_DEF_HPP
MueLu::DefaultLocalOrdinal LocalOrdinal
Namespace for MueLu classes and methods.
static const NoFactory * get()
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
static const Teuchos::ParameterEntry & getEntry(const std::string &name)
Returns default entry from the "master" list corresponding to the specified name. ...