Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45 
46 #include <Teuchos_Details_MpiTypeTraits.hpp>
47 
48 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49 #include <Tpetra_Distributor.hpp>
50 #include <Tpetra_BlockMultiVector.hpp>
51 
52 #include <Kokkos_ArithTraits.hpp>
53 #include <KokkosBatched_Util.hpp>
54 #include <KokkosBatched_Vector.hpp>
55 #include <KokkosBatched_Copy_Decl.hpp>
56 #include <KokkosBatched_Copy_Impl.hpp>
57 #include <KokkosBatched_AddRadial_Decl.hpp>
58 #include <KokkosBatched_AddRadial_Impl.hpp>
59 #include <KokkosBatched_SetIdentity_Decl.hpp>
60 #include <KokkosBatched_SetIdentity_Impl.hpp>
61 #include <KokkosBatched_Gemm_Decl.hpp>
62 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
63 #include <KokkosBatched_Gemm_Team_Impl.hpp>
64 #include <KokkosBatched_Gemv_Decl.hpp>
65 #include <KokkosBatched_Gemv_Serial_Impl.hpp>
66 #include <KokkosBatched_Gemv_Team_Impl.hpp>
67 #include <KokkosBatched_Trsm_Decl.hpp>
68 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
69 #include <KokkosBatched_Trsm_Team_Impl.hpp>
70 #include <KokkosBatched_Trsv_Decl.hpp>
71 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
72 #include <KokkosBatched_Trsv_Team_Impl.hpp>
73 #include <KokkosBatched_LU_Decl.hpp>
74 #include <KokkosBatched_LU_Serial_Impl.hpp>
75 #include <KokkosBatched_LU_Team_Impl.hpp>
76 
77 #include <KokkosBlas1_nrm1.hpp>
78 #include <KokkosBlas1_nrm2.hpp>
79 
80 #include <memory>
81 
82 // TODO: expose this through a CMake variable (or enable this flag only when it is actually needed)
83 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
85 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
86 #include "cuda_profiler_api.h"
87 #endif
88 
89 // Note: MPI 3 behavior in combination with CUDA has not been fully verified
90 #if MPI_VERSION >= 3
91 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
92 #endif
93 
94 // ::: Experiments :::
95 // Define either pinned memory or CUDA device memory for MPI buffers.
96 // If both macros are disabled, the Tpetra memory space is used, which is UVM space for CUDA.
97 // If the first macro is defined, pinned host memory is used for MPI instead of a device pointer.
98 // Pinned memory is enabled by default.
99 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
100 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
101 
102 // if defined, all views are allocated in CUDA space instead of CUDA UVM space
103 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
104 
105 // if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
106 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
107 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
108 #endif
109 
110 // if defined, it uses multiple execution spaces
111 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
112 
113 namespace Ifpack2 {
114 
115  namespace BlockTriDiContainerDetails {
116 
117  namespace KB = KokkosBatched;
118 
122  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
123 
124  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
125  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
126  MemoryTraitsType::is_random_access |
127  flag>;
128 
129  template <typename ViewType>
130  using Unmanaged = Kokkos::View<typename ViewType::data_type,
131  typename ViewType::array_layout,
132  typename ViewType::device_type,
133  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
134  template <typename ViewType>
135  using Atomic = Kokkos::View<typename ViewType::data_type,
136  typename ViewType::array_layout,
137  typename ViewType::device_type,
138  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
139  template <typename ViewType>
140  using Const = Kokkos::View<typename ViewType::const_data_type,
141  typename ViewType::array_layout,
142  typename ViewType::device_type,
143  typename ViewType::memory_traits>;
144  template <typename ViewType>
145  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
146 
147  template <typename ViewType>
148  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
149 
156 
157  template <typename ViewType>
158  using Scratch = Kokkos::View<typename ViewType::data_type,
159  typename ViewType::array_layout,
160  typename ViewType::execution_space::scratch_memory_space,
161  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
162 
166  template<typename T> struct BlockTridiagScalarType { typedef T type; };
167 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
168  template<> struct BlockTridiagScalarType<double> { typedef float type; };
169  //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
170 #endif
171 
175  template<typename T> struct is_cuda { enum : bool { value = false }; };
176 #if defined(KOKKOS_ENABLE_CUDA)
177  template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
178 #endif
179 
183  template<typename T> struct is_hip { enum : bool { value = false }; };
184 #if defined(KOKKOS_ENABLE_HIP)
185  template<> struct is_hip<Kokkos::Experimental::HIP> { enum : bool { value = true }; };
186 #endif
187 
191  template<typename T>
192  struct ExecutionSpaceFactory {
193  static void createInstance(T &exec_instance) {
194  exec_instance = T();
195  }
196 #if defined(KOKKOS_ENABLE_CUDA)
197  static void createInstance(const cudaStream_t &s, T &exec_instance) {
198  exec_instance = T();
199  }
200 #endif
201  };
202 
203 #if defined(KOKKOS_ENABLE_CUDA)
204  template<>
205  struct ExecutionSpaceFactory<Kokkos::Cuda> {
206  static void createInstance(Kokkos::Cuda &exec_instance) {
207  exec_instance = Kokkos::Cuda();
208  }
209  static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
210  exec_instance = Kokkos::Cuda(s);
211  }
212  };
213 #endif
214 
215 #if defined(KOKKOS_ENABLE_HIP)
216  template<>
217  struct ExecutionSpaceFactory<Kokkos::Experimental::HIP> {
218  static void createInstance(Kokkos::Experimental::HIP &exec_instance) {
219  exec_instance = Kokkos::Experimental::HIP();
220  }
221  };
222 #endif
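 // A minimal usage sketch of the factory above (illustrative only; see
 // createExecutionSpaceInstances() later in this file for the actual use).
 // On CUDA, an execution space instance can be bound to a user-created stream
 // so kernels launched on it can overlap; other backends fall back to the
 // default instance.
 //   cudaStream_t s;
 //   CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));
 //   Kokkos::Cuda exec;
 //   ExecutionSpaceFactory<Kokkos::Cuda>::createInstance(s, exec);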
223 
227  template<typename CommPtrType>
228  std::string get_msg_prefix (const CommPtrType &comm) {
229  const auto rank = comm->getRank();
230  const auto nranks = comm->getSize();
231  std::stringstream ss;
232  ss << "Rank " << rank << " of " << nranks << ": ";
233  return ss.str();
234  }
235 
239  template<typename T, int N>
240  struct ArrayValueType {
241  T v[N];
242  KOKKOS_INLINE_FUNCTION
243  ArrayValueType() {
244  for (int i=0;i<N;++i)
245  this->v[i] = 0;
246  }
247  KOKKOS_INLINE_FUNCTION
248  ArrayValueType(const ArrayValueType &b) {
249  for (int i=0;i<N;++i)
250  this->v[i] = b.v[i];
251  }
252  };
253  template<typename T, int N>
254  static
255  KOKKOS_INLINE_FUNCTION
256  void
257  operator+=(volatile ArrayValueType<T,N> &a,
258  volatile const ArrayValueType<T,N> &b) {
259  for (int i=0;i<N;++i)
260  a.v[i] += b.v[i];
261  }
262  template<typename T, int N>
263  static
264  KOKKOS_INLINE_FUNCTION
265  void
266  operator+=(ArrayValueType<T,N> &a,
267  const ArrayValueType<T,N> &b) {
268  for (int i=0;i<N;++i)
269  a.v[i] += b.v[i];
270  }
271 
275  template<typename T, int N, typename ExecSpace>
276  struct SumReducer {
277  typedef SumReducer reducer;
278  typedef ArrayValueType<T,N> value_type;
279  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
280  value_type *value;
281 
282  KOKKOS_INLINE_FUNCTION
283  SumReducer(value_type &val) : value(&val) {}
284 
285  KOKKOS_INLINE_FUNCTION
286  void join(value_type &dst, value_type &src) const {
287  for (int i=0;i<N;++i)
288  dst.v[i] += src.v[i];
289  }
290  KOKKOS_INLINE_FUNCTION
291  void join(volatile value_type &dst, const volatile value_type &src) const {
292  for (int i=0;i<N;++i)
293  dst.v[i] += src.v[i];
294  }
295  KOKKOS_INLINE_FUNCTION
296  void init(value_type &val) const {
297  for (int i=0;i<N;++i)
298  val.v[i] = Kokkos::reduction_identity<T>::sum();
299  }
300  KOKKOS_INLINE_FUNCTION
301  value_type& reference() {
302  return *value;
303  }
304  KOKKOS_INLINE_FUNCTION
305  result_view_type view() const {
306  return result_view_type(value);
307  }
308  };
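 // A minimal usage sketch of the reducer above (illustrative; "exec_space",
 // "x", and "n" are hypothetical names, not part of this file): accumulate
 // two partial sums in a single pass.
 //   using red_value_type = ArrayValueType<double,2>;
 //   red_value_type sums;
 //   Kokkos::parallel_reduce(
 //     Kokkos::RangePolicy<exec_space>(0, n),
 //     KOKKOS_LAMBDA(const int i, red_value_type &update) {
 //       update.v[0] += x(i);       // e.g., a 1-norm contribution
 //       update.v[1] += x(i)*x(i);  // and a squared 2-norm contribution
 //     },
 //     SumReducer<double,2,exec_space>(sums));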
309 
310 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
311 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
312 #else
313 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
314 #endif
315 
316 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
317 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
318  CUDA_SAFE_CALL(cudaProfilerStart());
319 
320 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
321  { CUDA_SAFE_CALL( cudaProfilerStop() ); }
322 #else
323 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
325 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
326 #endif
327 
331  template <typename MatrixType>
332  struct ImplType {
336  typedef size_t size_type;
337  typedef typename MatrixType::scalar_type scalar_type;
338  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
339  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
340  typedef typename MatrixType::node_type node_type;
341 
345  typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
346  typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
347 
348  typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
349  typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
350 
354  typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
355 
359  typedef typename node_type::device_type node_device_type;
360  typedef typename node_device_type::execution_space node_execution_space;
361  typedef typename node_device_type::memory_space node_memory_space;
362 
363 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
364  typedef node_execution_space execution_space;
366  typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
367  Kokkos::CudaSpace,
368  node_memory_space>::type memory_space;
369  typedef Kokkos::Device<execution_space,memory_space> device_type;
370 #else
371  typedef node_device_type device_type;
372  typedef node_execution_space execution_space;
373  typedef node_memory_space memory_space;
374 #endif
375 
376  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
377  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
378  typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
379  typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
380  typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
381  typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
382  typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
383  typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
384 
388  template<typename T, int l> using Vector = KB::Vector<T,l>;
389  template<typename T> using SIMD = KB::SIMD<T>;
390  template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
391  template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
392 
393  static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
394  static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
395  typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
396  typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
397 
401  typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
402  typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
403  // tpetra block crs values
404  typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
405  typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
406 
407  // tpetra multivector values (layout left): may need to change the typename more explicitly
408  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
409  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
410 
411  // packed data always use layout right
412  typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
413  typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
414  typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
415  typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
416  typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
417  typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
418  };
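 // A minimal sketch (illustrative only) of how the packed view types above are
 // used: each vector_type entry holds vector_length scalars, so a packed array
 // of N tridiag blocks of size B x B is declared as
 //   typename ImplType<MatrixType>::vector_type_3d_view
 //     values(do_not_initialize_tag("values"), N, B, B);
 // and values(k, i, j)[v] addresses SIMD lane v of block k at entry (i, j).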
419 
423  template<typename MatrixType>
424  typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
425  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
426  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
427  using impl_type = ImplType<MatrixType>;
428  using tpetra_map_type = typename impl_type::tpetra_map_type;
429  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
430  using tpetra_import_type = typename impl_type::tpetra_import_type;
431 
432  const auto g = A->getCrsGraph(); // tpetra crs graph object
433  const auto blocksize = A->getBlockSize();
434  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
435  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
436 
437  return Teuchos::rcp(new tpetra_import_type(src, tgt));
438  }
439 
440  // Partial replacement for forward-mode MultiVector::doImport.
441  // Permits overlapped communication and computation, but also supports a synchronous mode.
442  // Overlapped comm/comp can give quite poor performance on some platforms,
443  // so it is not used unconditionally.
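 // A minimal call-sequence sketch (hypothetical names; "async_import" is an
 // AsyncableImport built by createBlockCrsAsyncImporter below, and "X" is a
 // local multivector view):
 //   async_import->createDataBuffer(num_vectors);
 //   async_import->asyncSendRecv(X);   // post Irecv, pack, post Isend
 //   /* ... overlap computation on the owned rows here ... */
 //   async_import->syncRecv();         // wait and unpack into remote_multivector
 //   const auto X_remote = async_import->getRemoteMultiVectorLocalView();
 // The synchronous mode is simply async_import->syncExchange(X).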
444 
445  template<typename MatrixType>
446  struct AsyncableImport {
447  public:
448  using impl_type = ImplType<MatrixType>;
449 
450  private:
454 #if !defined(HAVE_IFPACK2_MPI)
455  typedef int MPI_Request;
456  typedef int MPI_Comm;
457 #endif
458  using scalar_type = typename impl_type::scalar_type;
461 
462  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
463 #ifdef HAVE_IFPACK2_MPI
464  MPI_Request ureq;
465  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
466  if (ireq == NULL) MPI_Request_free(&ureq);
467  return ret;
468 #else
469  return 0;
470 #endif
471  }
472 
473  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
474 #ifdef HAVE_IFPACK2_MPI
475  MPI_Request ureq;
476  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
477  if (ireq == NULL) MPI_Request_free(&ureq);
478  return ret;
479 #else
480  return 0;
481 #endif
482  }
483 
484  static int waitany(int count, MPI_Request* reqs, int* index) {
485 #ifdef HAVE_IFPACK2_MPI
486  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
487 #else
488  return 0;
489 #endif
490  }
491 
492  static int waitall(int count, MPI_Request* reqs) {
493 #ifdef HAVE_IFPACK2_MPI
494  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
495 #else
496  return 0;
497 #endif
498  }
499 
500  public:
501  using tpetra_map_type = typename impl_type::tpetra_map_type;
502  using tpetra_import_type = typename impl_type::tpetra_import_type;
503 
504  using local_ordinal_type = typename impl_type::local_ordinal_type;
505  using global_ordinal_type = typename impl_type::global_ordinal_type;
506  using size_type = typename impl_type::size_type;
507  using impl_scalar_type = typename impl_type::impl_scalar_type;
508 
509  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
510  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
511 
512  using execution_space = typename impl_type::execution_space;
513  using memory_space = typename impl_type::memory_space;
514  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
515  using size_type_1d_view = typename impl_type::size_type_1d_view;
516  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
517 
518 #if defined(KOKKOS_ENABLE_CUDA)
519  using impl_scalar_type_1d_view =
520  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
521 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
522  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
523 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
524  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
525 # else // no experimental macros are defined
526  typename impl_type::impl_scalar_type_1d_view,
527 # endif
528  typename impl_type::impl_scalar_type_1d_view>::type;
529 #else
530  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
531 #endif
532  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
533  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
534 
535 #ifdef HAVE_IFPACK2_MPI
536  MPI_Comm comm;
537 #endif
538 
539  impl_scalar_type_2d_view_tpetra remote_multivector;
540  local_ordinal_type blocksize;
541 
542  template<typename T>
543  struct SendRecvPair {
544  T send, recv;
545  };
546 
547  // (s)end and (r)eceive data:
548  SendRecvPair<int_1d_view_host> pids; // mpi ranks
549  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
550  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
551  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
552  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
553  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
554 
555  local_ordinal_type_1d_view dm2cm; // permutation
556 
557 #if defined(KOKKOS_ENABLE_CUDA)
558  using cuda_stream_1d_std_vector = std::vector<cudaStream_t>;
559  cuda_stream_1d_std_vector stream;
560 
561  using exec_instance_1d_std_vector = std::vector<execution_space>;
562  exec_instance_1d_std_vector exec_instances;
563 #endif
564 
565  // for cuda
566  public:
567  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
568  const size_type_1d_view &offs) {
569  // wrap lens to kokkos view and deep copy to device
570  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
571  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
572 
573  // exclusive scan
574  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
575  const local_ordinal_type lens_size = lens_device.extent(0);
576  Kokkos::parallel_scan
577  ("AsyncableImport::RangePolicy::setOffsetValues",
578  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
579  if (final)
580  offs(i) = update;
581  update += (i < lens_size ? lens_device[i] : 0);
582  });
583  }
584 
585  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
586  const size_type_1d_view_host &offs) {
587  // wrap lens to kokkos view and deep copy to device
588  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
589  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
590 
591  // exclusive scan
592  offs(0) = 0;
593  for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
594  offs(i) = offs(i-1) + lens[i-1];
595  }
596  }
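 // Illustrative example for both offset variants above: lens = [3, 1, 4] with
 // offs.extent(0) = 4 yields the exclusive scan offs = [0, 3, 4, 8].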
597 
598  private:
599  void createMpiRequests(const tpetra_import_type &import) {
600  Tpetra::Distributor &distributor = import.getDistributor();
601 
602  // copy pids from distributor
603  const auto pids_from = distributor.getProcsFrom();
604  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
605  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
606 
607  const auto pids_to = distributor.getProcsTo();
608  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
609  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
610 
611  // mpi requests
612  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
613  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
614 
615  // construct offsets
616 #if 0
617  const auto lengths_to = distributor.getLengthsTo();
618  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
619 
620  const auto lengths_from = distributor.getLengthsFrom();
621  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
622 
623  setOffsetValues(lengths_to, offset.send);
624  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
625 
626  setOffsetValues(lengths_from, offset.recv);
627  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
628 #else
629  const auto lengths_to = distributor.getLengthsTo();
630  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
631 
632  const auto lengths_from = distributor.getLengthsFrom();
633  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
634 
635  setOffsetValuesHost(lengths_to, offset_host.send);
636  //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
637 
638  setOffsetValuesHost(lengths_from, offset_host.recv);
639  //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
640 #endif
641  }
642 
643  void createSendRecvIDs(const tpetra_import_type &import) {
644  // For each remote PID, the list of LIDs to receive.
645  const auto remote_lids = import.getRemoteLIDs();
646  const local_ordinal_type_1d_view_host
647  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
648  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
649  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
650 
651  // For each export PID, the list of LIDs to send.
652  auto epids = import.getExportPIDs();
653  auto elids = import.getExportLIDs();
654  TEUCHOS_ASSERT(epids.size() == elids.size());
655  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
656  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
657 
658  // naive search (not sure if pids or epids are sorted)
659  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
660  const auto pid_send_value = pids.send[i];
661  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
662  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
663 #if !defined(__CUDA_ARCH__)
664  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
665 #endif
666  }
667  Kokkos::deep_copy(lids.send, lids_send_host);
668  }
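 // Illustrative example (hypothetical data): with pids.send = [2, 5],
 // epids = [5, 2, 5, 2] and elids = [10, 11, 12, 13], the loop above produces
 // lids.send = [11, 13, 10, 12]; the entries destined for rank 2 occupy
 // [offset_host.send[0], offset_host.send[1]) = [0, 2) and those for rank 5
 // occupy [2, 4).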
669 
670  void createExecutionSpaceInstances() {
671 #if defined(KOKKOS_ENABLE_CUDA)
672  const local_ordinal_type num_streams = 8;
673  {
674  stream.clear();
675  stream.resize(num_streams);
676  exec_instances.clear();
677  exec_instances.resize(num_streams);
678  for (local_ordinal_type i=0;i<num_streams;++i) {
679  CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking));
680  ExecutionSpaceFactory<execution_space>::createInstance(stream[i], exec_instances[i]);
681  }
682  }
683 #endif
684  }
685 
686  void destroyExecutionSpaceInstances() {
687 #if defined(KOKKOS_ENABLE_CUDA)
688  {
689  const local_ordinal_type num_streams = stream.size();
690  for (local_ordinal_type i=0;i<num_streams;++i)
691  CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
692  }
693  stream.clear();
694  exec_instances.clear();
695 #endif
696  }
697 
698  public:
699  // for cuda, all tag types are public
700  struct ToBuffer {};
701  struct ToMultiVector {};
702 
703  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
704  const Teuchos::RCP<const tpetra_map_type>& tgt_map,
705  const local_ordinal_type blocksize_,
706  const local_ordinal_type_1d_view dm2cm_) {
707  blocksize = blocksize_;
708  dm2cm = dm2cm_;
709 
710 #ifdef HAVE_IFPACK2_MPI
711  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
712 #endif
713  const tpetra_import_type import(src_map, tgt_map);
714 
715  createMpiRequests(import);
716  createSendRecvIDs(import);
717  createExecutionSpaceInstances();
718  }
719 
720  ~AsyncableImport() {
721  destroyExecutionSpaceInstances();
722  }
723 
724  void createDataBuffer(const local_ordinal_type &num_vectors) {
725  const size_type extent_0 = lids.recv.extent(0)*blocksize;
726  const size_type extent_1 = num_vectors;
727  if (remote_multivector.extent(0) == extent_0 &&
728  remote_multivector.extent(1) == extent_1) {
729  // skip
730  } else {
731  remote_multivector =
732  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
733 
734  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
735  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
736 
737  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
738  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
739  }
740  }
741 
742  void cancel () {
743 #ifdef HAVE_IFPACK2_MPI
744  waitall(reqs.recv.size(), reqs.recv.data());
745  waitall(reqs.send.size(), reqs.send.data());
746 #endif
747  }
748 
749  // ======================================================================
750  // Async version using cuda stream
751  // - cuda only with kokkos develop branch
752  // ======================================================================
753 
754 #if defined(KOKKOS_ENABLE_CUDA)
755  template<typename PackTag>
756  static
757  void copy(const local_ordinal_type_1d_view &lids_,
758  const impl_scalar_type_1d_view &buffer_,
759  const local_ordinal_type ibeg_,
760  const local_ordinal_type iend_,
761  const impl_scalar_type_2d_view_tpetra &multivector_,
762  const local_ordinal_type blocksize_,
763  const execution_space &exec_instance_) {
764  const local_ordinal_type num_vectors = multivector_.extent(1);
765  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
766  const local_ordinal_type idiff = iend_ - ibeg_;
767  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
768 
769  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
770  local_ordinal_type vector_size(0);
771  if (blocksize_ <= 4) vector_size = 4;
772  else if (blocksize_ <= 8) vector_size = 8;
773  else if (blocksize_ <= 16) vector_size = 16;
774  else vector_size = 32;
775 
776  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
777  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
778  Kokkos::parallel_for
779  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
780  Kokkos::Experimental::require(policy, work_item_property),
781  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
782  const local_ordinal_type i = member.league_rank();
783  Kokkos::parallel_for
784  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
785  auto aptr = abase + blocksize_*(i + idiff*j);
786  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
787  if (std::is_same<PackTag,ToBuffer>::value)
788  Kokkos::parallel_for
789  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
790  aptr[k] = bptr[k];
791  });
792  else
793  Kokkos::parallel_for
794  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
795  bptr[k] = aptr[k];
796  });
797  });
798  });
799  }
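 // Buffer layout note (illustrative): for the i'th LID in [ibeg_, iend_) and
 // vector j, the packed address is abase + blocksize_*(i + idiff*j), so the
 // blocksize_ scalars of one (row, vector) pair are contiguous and vectors are
 // strided by idiff = iend_ - ibeg_. E.g. with blocksize_ = 2, idiff = 3,
 // num_vectors = 2 the order is (i0,j0)(i1,j0)(i2,j0)(i0,j1)(i1,j1)(i2,j1),
 // 2 scalars each.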
800 
801  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
802  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
803 
804 #ifdef HAVE_IFPACK2_MPI
805  // constants and reallocate data buffers if necessary
806  const local_ordinal_type num_vectors = mv.extent(1);
807  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
808 
809  // 0. post receive async
810  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
811  irecv(comm,
812  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
813  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
814  pids.recv[i],
815  42,
816  &reqs.recv[i]);
817  }
818 
820  execution_space().fence();
821 
822  // 1. async memcpy
823  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
824  // 1.0. enqueue pack buffer
825  if (i<8) exec_instances[i%8].fence();
826  copy<ToBuffer>(lids.send, buffer.send,
827  offset_host.send(i), offset_host.send(i+1),
828  mv, blocksize,
829  //execution_space());
830  exec_instances[i%8]);
831 
832  }
834  //execution_space().fence();
835  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
836  // 1.1. sync the stream and isend
837  if (i<8) exec_instances[i%8].fence();
838  isend(comm,
839  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
840  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
841  pids.send[i],
842  42,
843  &reqs.send[i]);
844  }
845 
846  // 2. poke communication
847  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
848  int flag;
849  MPI_Status stat;
850  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
851  }
852 #endif // HAVE_IFPACK2_MPI
853  }
854 
855  void syncRecvVar1() {
856  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
857 #ifdef HAVE_IFPACK2_MPI
858  // 0. wait for receive async.
859  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
860  local_ordinal_type idx = i;
861 
862  // 0.0. wait any
863  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
864 
865  // 0.1. unpack data after data is moved into a device
866  copy<ToMultiVector>(lids.recv, buffer.recv,
867  offset_host.recv(idx), offset_host.recv(idx+1),
868  remote_multivector, blocksize,
869  exec_instances[idx%8]);
870  }
871 
872  // 1. fire up all cuda events
873  Kokkos::fence();
874 
875  // 2. cleanup all open comm
876  waitall(reqs.send.size(), reqs.send.data());
877 #endif // HAVE_IFPACK2_MPI
878  }
879 #endif //defined(KOKKOS_ENABLE_CUDA)
880 
881  // ======================================================================
882  // Generic version without using cuda stream
883  // - the only difference between the CUDA and host architectures is the use
884  //   of team vs. range policies.
885  // ======================================================================
886  template<typename PackTag>
887  static
888  void copy(const local_ordinal_type_1d_view &lids_,
889  const impl_scalar_type_1d_view &buffer_,
890  const local_ordinal_type &ibeg_,
891  const local_ordinal_type &iend_,
892  const impl_scalar_type_2d_view_tpetra &multivector_,
893  const local_ordinal_type blocksize_) {
894  const local_ordinal_type num_vectors = multivector_.extent(1);
895  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
896  const local_ordinal_type idiff = iend_ - ibeg_;
897  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
898  if (is_cuda<execution_space>::value || is_hip<execution_space>::value) {
899 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
900  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
901  local_ordinal_type vector_size(0);
902  if (blocksize_ <= 4) vector_size = 4;
903  else if (blocksize_ <= 8) vector_size = 8;
904  else if (blocksize_ <= 16) vector_size = 16;
905  else vector_size = 32;
906  const team_policy_type policy(idiff, 1, vector_size);
907  Kokkos::parallel_for
908  ("AsyncableImport::TeamPolicy::copy",
909  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
910  const local_ordinal_type i = member.league_rank();
911  Kokkos::parallel_for
912  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
913  auto aptr = abase + blocksize_*(i + idiff*j);
914  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
915  if (std::is_same<PackTag,ToBuffer>::value)
916  Kokkos::parallel_for
917  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
918  aptr[k] = bptr[k];
919  });
920  else
921  Kokkos::parallel_for
922  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
923  bptr[k] = aptr[k];
924  });
925  });
926  });
927 #endif
928  } else {
929 #if defined(__CUDA_ARCH__)
930  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
931 #else
932  {
933  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
934  Kokkos::parallel_for
935  ("AsyncableImport::RangePolicy::copy",
936  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
937  const local_ordinal_type i = ij%idiff;
938  const local_ordinal_type j = ij/idiff;
939  auto aptr = abase + blocksize_*(i + idiff*j);
940  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
941  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
942  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
943  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
944  });
945  }
946 #endif
947  }
948  }
949 
950 
954  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
955  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
956 
957 #ifdef HAVE_IFPACK2_MPI
958  // constants and reallocate data buffers if necessary
959  const local_ordinal_type num_vectors = mv.extent(1);
960  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
961 
962  // receive async
963  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
964  irecv(comm,
965  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
966  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
967  pids.recv[i],
968  42,
969  &reqs.recv[i]);
970  }
971 
972  // send async
973  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
974  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
975  mv, blocksize);
976  Kokkos::fence();
977  isend(comm,
978  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
979  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
980  pids.send[i],
981  42,
982  &reqs.send[i]);
983  }
984 
985  // I find that issuing an Iprobe seems to nudge some MPIs into action,
986  // which helps with overlapped comm/comp performance.
987  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
988  int flag;
989  MPI_Status stat;
990  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
991  }
992 #endif
993  }
994 
995  void syncRecvVar0() {
996  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
997 #ifdef HAVE_IFPACK2_MPI
998  // receive async.
999  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
1000  local_ordinal_type idx = i;
1001  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
1002  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
1003  remote_multivector, blocksize);
1004  }
1005  // wait on the sends to match all Isends with a cleanup operation.
1006  waitall(reqs.send.size(), reqs.send.data());
1007 #endif
1008  }
1009 
1013  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
1014 #if defined(KOKKOS_ENABLE_CUDA)
1015 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1016  asyncSendRecvVar1(mv);
1017 #else
1018  asyncSendRecvVar0(mv);
1019 #endif
1020 #else
1021  asyncSendRecvVar0(mv);
1022 #endif
1023  }
1024  void syncRecv() {
1025 #if defined(KOKKOS_ENABLE_CUDA)
1026 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
1027  syncRecvVar1();
1028 #else
1029  syncRecvVar0();
1030 #endif
1031 #else
1032  syncRecvVar0();
1033 #endif
1034  }
1035 
1036  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
1037  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
1038  asyncSendRecv(mv);
1039  syncRecv();
1040  }
1041 
1042  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1043  };
1044 
1048  template<typename MatrixType>
1049  Teuchos::RCP<AsyncableImport<MatrixType> >
1050  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1051  using impl_type = ImplType<MatrixType>;
1052  using tpetra_map_type = typename impl_type::tpetra_map_type;
1053  using local_ordinal_type = typename impl_type::local_ordinal_type;
1054  using global_ordinal_type = typename impl_type::global_ordinal_type;
1055  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1056 
1057  const auto g = A->getCrsGraph(); // tpetra crs graph object
1058  const auto blocksize = A->getBlockSize();
1059  const auto domain_map = g.getDomainMap();
1060  const auto column_map = g.getColMap();
1061 
1062  std::vector<global_ordinal_type> gids;
1063  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1064  for (size_t i=0;i<column_map->getNodeNumElements();++i) {
1065  const global_ordinal_type gid = column_map->getGlobalElement(i);
1066  if (!domain_map->isNodeGlobalElement(gid)) {
1067  found_first = true;
1068  gids.push_back(gid);
1069  } else if (found_first) {
1070  separate_remotes = false;
1071  break;
1072  }
1073  if (!need_owned_permutation &&
1074  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1075  // The owned part of the domain and column maps are different
1076  // orderings. We *could* do a super efficient impl of this case in the
1077  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1078  // really, if a caller cares about speed, they wouldn't make different
1079  // local permutations like this. So we punt on the best impl and go for
1080  // a pretty good one: the permutation is done in place in
1081  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1082  // is the presumably worse memory access pattern of the input vector.
1083  need_owned_permutation = true;
1084  }
1085  }
1086 
1087  if (separate_remotes) {
1088  const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1089  const auto parsimonious_col_map
1090  = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1091  if (parsimonious_col_map->getGlobalNumElements() > 0) {
1092  // make the importer only if needed.
1093  local_ordinal_type_1d_view dm2cm;
1094  if (need_owned_permutation) {
1095  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getNodeNumElements());
1096  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1097  for (size_t i=0;i<domain_map->getNodeNumElements();++i)
1098  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1099  Kokkos::deep_copy(dm2cm, dm2cm_host);
1100  }
1101  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1102  }
1103  }
1104  return Teuchos::null;
1105  }
1106 
1107  template<typename MatrixType>
1108  struct PartInterface {
1109  using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1110  using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1111 
1112  PartInterface() = default;
1113  PartInterface(const PartInterface &b) = default;
1114 
1115  // Some terms:
1116  // The matrix A is split as A = D + R, where D is the matrix of tridiag
1117  // blocks and R is the remainder.
1118  // A part is roughly a synonym for a tridiag. The distinction is that a part
1119  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1120  // rows in R associated with that tridiag. In contrast, the term tridiag is
1121  // used to refer specifically to tridiag data, such as the pointer into the
1122  // tridiag data array.
1123  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
1124  // tridiag, and partptr points to the beginning of each tridiag. This is the
1125  // LID space.
1126  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1127  // by this ordinal. This is the 'index' space.
1128  // A flat index is the mathematical index into an array. A pack index
1129  // accounts for SIMD packing.
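 // Illustrative example (hypothetical data, assuming vector_length = 2):
 // partitions of sizes {3, 2, 1}, sorted by nonincreasing size, give
 //   partptr          = [0, 3, 5, 6]   (rows per part: 3, 2, 1)
 //   part2rowidx0     = [0, 3, 5, 6]   (alias of partptr; no overlap)
 //   packptr          = [0, 2, 3]      (pack 0 = parts {0,1}, pack 1 = part {2})
 //   part2packrowidx0 = [0, 0, 3, 4]   (parts 0 and 1 share packed rows [0,3)
 //                                      in lanes 0 and 1; part 2 starts at 3)
 //   part2packrowidx0_back = 4         (total number of packed rows)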
1130 
1131  // Local row LIDs. Permutation from caller's index space to tridiag index
1132  // space.
1133  local_ordinal_type_1d_view lclrow;
1134  // partptr_ is the pointer array into lclrow_.
1135  local_ordinal_type_1d_view partptr; // np+1
1136  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1137  // is the start of the i'th pack.
1138  local_ordinal_type_1d_view packptr; // npack+1
1139  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1140  // an alias of partptr_ in the case of no overlap.
1141  local_ordinal_type_1d_view part2rowidx0; // np+1
1142  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1143  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1144  // with i % vector_length to get the location in the packed data.
1145  local_ordinal_type_1d_view part2packrowidx0; // np+1
1146  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1147  // rowidx2part_ maps the row index to the part index.
1148  local_ordinal_type_1d_view rowidx2part; // nr
1149  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1150  // practice, this knowledge is not particularly useful, as packing for batched
1151  // processing is done at the same time as the permutation from LID to index
1152  // space. But it's easy to detect, so it's recorded in case an optimization
1153  // can be made based on it.
1154  bool row_contiguous;
1155 
1156  local_ordinal_type max_partsz;
1157  };
1158 
1162  template<typename MatrixType>
1163  PartInterface<MatrixType>
1164  createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1165  const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1166  using impl_type = ImplType<MatrixType>;
1167  using local_ordinal_type = typename impl_type::local_ordinal_type;
1168  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1169 
1170  constexpr int vector_length = impl_type::vector_length;
1171 
1172  const auto comm = A->getRowMap()->getComm();
1173 
1174  PartInterface<MatrixType> interf;
1175 
1176  const bool jacobi = partitions.size() == 0;
1177  const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
1178  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1179 
1180 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1181  local_ordinal_type nrows = 0;
1182  if (jacobi)
1183  nrows = nparts;
1184  else
1185  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1186 
1187  TEUCHOS_TEST_FOR_EXCEPT_MSG
1188  (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1189  << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
1190 #endif
1191 
1192  // permutation vector
1193  std::vector<local_ordinal_type> p;
1194  if (jacobi) {
1195  interf.max_partsz = 1;
1196  } else {
1197  // reorder parts to maximize simd packing efficiency
1198  p.resize(nparts);
1199 
1200  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1201  std::vector<size_idx_pair_type> partsz(nparts);
1202  for (local_ordinal_type i=0;i<nparts;++i)
1203  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1204  std::sort(partsz.begin(), partsz.end(),
1205  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1206  return x.first > y.first;
1207  });
1208  for (local_ordinal_type i=0;i<nparts;++i)
1209  p[i] = partsz[i].second;
1210 
1211  interf.max_partsz = partsz[0].first;
1212  }
1213 
1214  // allocate parts
1215  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1216  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1217  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1218  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1219  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1220 
1221  // mirror to host and compute on host execution space
1222  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1223  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1224  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1225  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1226  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1227 
1228  // Determine parts.
1229  interf.row_contiguous = true;
1230  partptr(0) = 0;
1231  part2rowidx0(0) = 0;
1232  part2packrowidx0(0) = 0;
1233  local_ordinal_type pack_nrows = 0;
1234  if (jacobi) {
1235  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1236  const local_ordinal_type ipnrows = 1;
1237  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1238  get_msg_prefix(comm)
1239  << "partition " << p[ip]
1240  << " is empty, which is not allowed.");
1241  //assume No overlap.
1242  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1243  // Since parts are ordered in nonincreasing size, the size of the first
1244  // part in a pack is the size for all parts in the pack.
1245  if (ip % vector_length == 0) pack_nrows = ipnrows;
1246  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1247  const local_ordinal_type os = partptr(ip);
1248  for (local_ordinal_type i=0;i<ipnrows;++i) {
1249  const auto lcl_row = ip;
1250  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1251  get_msg_prefix(comm)
1252  << "partitions[" << p[ip] << "]["
1253  << i << "] = " << lcl_row
1254  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1255  << "].");
1256  lclrow(os+i) = lcl_row;
1257  rowidx2part(os+i) = ip;
1258  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1259  interf.row_contiguous = false;
1260  }
1261  partptr(ip+1) = os + ipnrows;
1262  }
1263  } else {
1264  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1265  const auto* part = &partitions[p[ip]];
1266  const local_ordinal_type ipnrows = part->size();
1267  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1268  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1269  get_msg_prefix(comm)
1270  << "partition " << p[ip]
1271  << " is empty, which is not allowed.");
1272  //assume No overlap.
1273  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1274  // Since parts are ordered in nonincreasing size, the size of the first
1275  // part in a pack is the size for all parts in the pack.
1276  if (ip % vector_length == 0) pack_nrows = ipnrows;
1277  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1278  const local_ordinal_type os = partptr(ip);
1279  for (local_ordinal_type i=0;i<ipnrows;++i) {
1280  const auto lcl_row = (*part)[i];
1281  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1282  get_msg_prefix(comm)
1283  << "partitions[" << p[ip] << "]["
1284  << i << "] = " << lcl_row
1285  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1286  << "].");
1287  lclrow(os+i) = lcl_row;
1288  rowidx2part(os+i) = ip;
1289  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1290  interf.row_contiguous = false;
1291  }
1292  partptr(ip+1) = os + ipnrows;
1293  }
1294  }
1295 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1296  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1297 #endif
1298  if (lclrow(0) != 0) interf.row_contiguous = false;
1299 
1300  Kokkos::deep_copy(interf.partptr, partptr);
1301  Kokkos::deep_copy(interf.lclrow, lclrow);
1302 
1303  //assume No overlap. Thus:
1304  interf.part2rowidx0 = interf.partptr;
1305  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1306 
1307  interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1308  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1309 
1310  { // Fill packptr.
1311  local_ordinal_type npacks = 0;
1312  for (local_ordinal_type ip=1;ip<=nparts;++ip)
1313  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1314  ++npacks;
1315  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1316  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1317  packptr(0) = 0;
1318  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1319  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1320  packptr(k++) = ip;
1321  Kokkos::deep_copy(interf.packptr, packptr);
1322  }
1323 
1324  return interf;
1325  }
1326 
1330  template <typename MatrixType>
1331  struct BlockTridiags {
1332  using impl_type = ImplType<MatrixType>;
1333  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1334  using size_type_1d_view = typename impl_type::size_type_1d_view;
1335  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1336 
1337  // flat_td_ptr(i) is the index into flat-array values of the start of the
1338  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1339  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1340  // vector_length is the position in the pack.
1341  size_type_1d_view flat_td_ptr, pack_td_ptr;
1342  // List of local column indices into A from which to grab
1343  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1344  local_ordinal_type_1d_view A_colindsub;
1345  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1346  // tridiag's pack, and i % vector_length gives the position in the pack.
1347  vector_type_3d_view values;
1348 
1349  bool is_diagonal_only;
1350 
1351  BlockTridiags() = default;
1352  BlockTridiags(const BlockTridiags &b) = default;
1353 
1354  // Index into row-major block of a tridiag.
1355  template <typename idx_type>
1356  static KOKKOS_FORCEINLINE_FUNCTION
1357  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1358  // Given a row of a row-major tridiag, return the index of the first block
1359  // in that row.
1360  template <typename idx_type>
1361  static KOKKOS_FORCEINLINE_FUNCTION
1362  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1363  // Number of blocks in a tridiag having a given number of rows.
1364  template <typename idx_type>
1365  static KOKKOS_FORCEINLINE_FUNCTION
1366  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
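 // Illustrative example of the indexing helpers above: a tridiag with
 // nrows = 4 has NumBlocks(4) = 10 blocks stored row-major as
 //   D0 U0 | L1 D1 U1 | L2 D2 U2 | L3 D3
 // so RowToIndex(2) = 5 (the L2 block) and IndexToRow(4) = 1 (U1 lives in row 1).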
1367  };
1368 
1369 
1373  template<typename MatrixType>
1374  BlockTridiags<MatrixType>
1375  createBlockTridiags(const PartInterface<MatrixType> &interf) {
1376  using impl_type = ImplType<MatrixType>;
1377  using execution_space = typename impl_type::execution_space;
1378  using local_ordinal_type = typename impl_type::local_ordinal_type;
1379  using size_type = typename impl_type::size_type;
1380  using size_type_1d_view = typename impl_type::size_type_1d_view;
1381 
1382  constexpr int vector_length = impl_type::vector_length;
1383 
1385  BlockTridiags<MatrixType> btdm;
1386  const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1387 
1388  { // construct the flat index pointers into the tridiag values array.
1389  btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1390  const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1391  Kokkos::parallel_scan
1392  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1393  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1394  if (final)
1395  btdm.flat_td_ptr(i) = update;
1396  if (i < ntridiags) {
1397  const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1398  update += btdm.NumBlocks(nrows);
1399  }
1400  });
1401 
1402  const auto nblocks = Kokkos::create_mirror_view_and_copy
1403  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1404  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1405  }
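 // Illustrative example: with interf.partptr = [0, 4, 7, 9] the tridiag row
 // counts are [4, 3, 2], NumBlocks gives [10, 7, 4], and the scan above
 // produces btdm.flat_td_ptr = [0, 10, 17, 21]. is_diagonal_only becomes true
 // only when every part has a single row, i.e., when the total block count
 // equals ntridiags.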
1406 
1407  // And the packed index pointers.
1408  if (vector_length == 1) {
1409  btdm.pack_td_ptr = btdm.flat_td_ptr;
1410  } else {
1411  const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1412  btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1413  const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1414  Kokkos::parallel_scan
1415  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1416  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1417  const local_ordinal_type parti = interf.packptr(i);
1418  const local_ordinal_type parti_next = interf.packptr(i+1);
1419  if (final) {
1420  const size_type nblks = update;
1421  for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1422  btdm.pack_td_ptr(pti) = nblks;
1423  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1424  // last pack: also write the final end pointer
1425  if (i == npacks-1)
1426  btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1427  }
1428  {
1429  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1430  update += btdm.NumBlocks(nrows);
1431  }
1432  });
1433  }
1434 
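  // Worked example (hypothetical sizes, for illustration only): with
  // vector_length = 2 and parts of {3, 3, 2, 2} rows packed as
  // packptr = {0, 2, 4}, NumBlocks gives 7 blocks for a 3-row tridiag and 4
  // for a 2-row one, so
  //   flat_td_ptr = {0, 7, 14, 18, 22}
  //   pack_td_ptr = {0, 0, 7, 7, 11}
  // i.e., all parts of a pack share the pack's starting offset, and the last
  // entry is the total number of packed blocks.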
1435  // values and A_colindsub are created in the symbolic phase
1436 
1437  return btdm;
1438  }
1439 
1440  // Set the tridiags to identity out to the full pack block size. That way, if a
1441  // tridiag within a pack is shorter than the longest one, the extra blocks are
1442  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1443  // in the packed multivector are 0, and the tridiag LU reflects the extra I
1444  // blocks, then the solve proceeds as though the extra blocks aren't
1445  // present. Since this extra work is part of the SIMD calls, it's not actually
1446  // extra work. Instead, it means we don't have to put checks or masks in, or
1447  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1448  // since the numeric phase fills in only the used entries, leaving these I
1449  // blocks intact.
1450  template<typename MatrixType>
1451  void
1452  setTridiagsToIdentity
1453  (const BlockTridiags<MatrixType>& btdm,
1454  const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1455  {
1456  using impl_type = ImplType<MatrixType>;
1457  using execution_space = typename impl_type::execution_space;
1458  using local_ordinal_type = typename impl_type::local_ordinal_type;
1459  using size_type_1d_view = typename impl_type::size_type_1d_view;
1460 
1461  const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1462  const local_ordinal_type blocksize = btdm.values.extent(1);
1463 
1464  {
1465  const int vector_length = impl_type::vector_length;
1466  const int internal_vector_length = impl_type::internal_vector_length;
1467 
1468  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1469  using internal_vector_type = typename impl_type::internal_vector_type;
1470  using internal_vector_type_4d_view =
1471  typename impl_type::internal_vector_type_4d_view;
1472 
1473  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1474  const internal_vector_type_4d_view values
1475  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1476  btdm.values.extent(0),
1477  btdm.values.extent(1),
1478  btdm.values.extent(2),
1479  vector_length/internal_vector_length);
1480  const local_ordinal_type vector_loop_size = values.extent(3);
1481 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1482  local_ordinal_type total_team_size(0);
1483  if (blocksize <= 5) total_team_size = 32;
1484  else if (blocksize <= 9) total_team_size = 64;
1485  else if (blocksize <= 12) total_team_size = 96;
1486  else if (blocksize <= 16) total_team_size = 128;
1487  else if (blocksize <= 20) total_team_size = 160;
1488  else total_team_size = 160;
1489  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1490  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1491 #elif defined(KOKKOS_ENABLE_HIP)
1492  // FIXME: HIP
1493  // These settings might be completely wrong
1494  // will have to do some experiments to decide
1495  // what makes sense on AMD GPUs
1496  local_ordinal_type total_team_size(0);
1497  if (blocksize <= 5) total_team_size = 32;
1498  else if (blocksize <= 9) total_team_size = 64;
1499  else if (blocksize <= 12) total_team_size = 96;
1500  else if (blocksize <= 16) total_team_size = 128;
1501  else if (blocksize <= 20) total_team_size = 160;
1502  else total_team_size = 160;
1503  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1504  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1505 #else // Host architecture: team size is always one
1506  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1507 #endif
1508  Kokkos::parallel_for
1509  ("setTridiagsToIdentity::TeamPolicy",
1510  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1511  const local_ordinal_type k = member.league_rank();
1512  const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1513  const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1514  const local_ordinal_type diff = iend - ibeg;
1515  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1516  const btdm_scalar_type one(1);
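  // Within a pack's block range [ibeg, iend), the diagonal blocks sit at
  // offsets 0, 3, 6, ... (row-major tridiag layout), so icount counts them and
  // the loop below writes a 1 onto each diagonal entry in every SIMD lane.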
1517  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1518  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1519  const local_ordinal_type i = ibeg + ii*3;
1520  for (local_ordinal_type j=0;j<blocksize;++j)
1521  values(i,j,j,v) = one;
1522  });
1523  });
1524  });
1525  }
1526  }
1527 
1531  template <typename MatrixType>
1532  struct AmD {
1533  using impl_type = ImplType<MatrixType>;
1534  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1535  using size_type_1d_view = typename impl_type::size_type_1d_view;
1536  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
1537  // rowptr points to the start of each row of A_colindsub.
1538  size_type_1d_view rowptr, rowptr_remote;
1539  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1540  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1541  // where g is A's graph, are the columns AmD uses. If seq_method_, then
1542  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1543  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1544  // contains the remote ones.
1545  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1546 
1547  // Currently always true.
1548  bool is_tpetra_block_crs;
1549 
1550  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1551  impl_scalar_type_1d_view_tpetra tpetra_values;
1552 
1553  AmD() = default;
1554  AmD(const AmD &b) = default;
1555  };
1556 
1560  template<typename MatrixType>
1561  void
1562  performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1563  const PartInterface<MatrixType> &interf,
1564  BlockTridiags<MatrixType> &btdm,
1565  AmD<MatrixType> &amd,
1566  const bool overlap_communication_and_computation) {
1567  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1568 
1569  using impl_type = ImplType<MatrixType>;
1570  using node_memory_space = typename impl_type::node_memory_space;
1571  using host_execution_space = typename impl_type::host_execution_space;
1572 
1573  using local_ordinal_type = typename impl_type::local_ordinal_type;
1574  using global_ordinal_type = typename impl_type::global_ordinal_type;
1575  using size_type = typename impl_type::size_type;
1576  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1577  using size_type_1d_view = typename impl_type::size_type_1d_view;
1578  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1579  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1580 
1581  constexpr int vector_length = impl_type::vector_length;
1582 
1583  const auto comm = A->getRowMap()->getComm();
1584  const auto& g = A->getCrsGraph();
1585  const auto blocksize = A->getBlockSize();
1586 
1587  // mirroring to host
1588  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1589  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1590  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1591  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1592  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1593 
1594  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1595 
1596  // find column to row map on host
1597  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
1598  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1599  {
1600  const auto rowmap = g.getRowMap();
1601  const auto colmap = g.getColMap();
1602  const auto dommap = g.getDomainMap();
1603  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1604 
1605 #if !defined(__CUDA_ARCH__)
1606  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1607  Kokkos::parallel_for
1608  ("performSymbolicPhase::RangePolicy::col2row",
1609  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1610  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1611  TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1612  if (dommap->isNodeGlobalElement(gid)) {
1613  const local_ordinal_type lc = colmap->getLocalElement(gid);
1614 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1615  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1616  get_msg_prefix(comm) << "GID " << gid
1617  << " gives an invalid local column.");
1618 # endif
1619  col2row(lc) = lr;
1620  }
1621  });
1622 #endif
1623  }
1624 
1625  // construct the D and R graphs in A = D + R.
1626  {
1627  const auto local_graph = g.getLocalGraphHost();
1628  const auto local_graph_rowptr = local_graph.row_map;
1629  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1630  const auto local_graph_colidx = local_graph.entries;
1631 
1632  //assume no overlap.
1633 
1634  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1635  {
1636  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1637  Kokkos::parallel_for
1638  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1639  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1640  lclrow2idx[lclrow(i)] = i;
1641  });
1642  }
1643 
1644  // count (block) nnzs in D and R.
1645  typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1646  typename sum_reducer_type::value_type sum_reducer_value;
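  // The three reducer slots accumulate, in order, the number of blocks in D
  // (the tridiags), in the owned part of R = A - D, and in the remote part of R.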
1647  {
1648  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1649  Kokkos::parallel_reduce
1650  // profiling interface does not work
1651  (//"performSymbolicPhase::RangePolicy::count_nnz",
1652  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1653  // LID -> index.
1654  const local_ordinal_type ri0 = lclrow2idx[lr];
1655  const local_ordinal_type pi0 = rowidx2part(ri0);
1656  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1657  const local_ordinal_type lc = local_graph_colidx(j);
1658  const local_ordinal_type lc2r = col2row[lc];
1659  bool incr_R = false;
1660  do { // breakable
1661  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1662  incr_R = true;
1663  break;
1664  }
1665  const local_ordinal_type ri = lclrow2idx[lc2r];
1666  const local_ordinal_type pi = rowidx2part(ri);
1667  if (pi != pi0) {
1668  incr_R = true;
1669  break;
1670  }
1671  // Test for being in the tridiag, i.e., |ri - ri0| <= 1. This is done in
1672  // index space; in LID space, tridiag LIDs in a row are not necessarily
1673  // related by {-1, 0, 1}.
1674  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1675  ++update.v[0]; // D_nnz
1676  else
1677  incr_R = true;
1678  } while (0);
1679  if (incr_R) {
1680  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1681  else ++update.v[2]; // R_nnz_remote
1682  }
1683  }
1684  }, sum_reducer_type(sum_reducer_value));
1685  }
1686  size_type D_nnz = sum_reducer_value.v[0];
1687  size_type R_nnz_owned = sum_reducer_value.v[1];
1688  size_type R_nnz_remote = sum_reducer_value.v[2];
1689 
1690  if (!overlap_communication_and_computation) {
1691  R_nnz_owned += R_nnz_remote;
1692  R_nnz_remote = 0;
1693  }
1694 
1695  // construct the D graph.
1696  {
1697  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1698 
1699  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1700  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1701 
1702 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1703  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1704 #endif
1705 
1706  const local_ordinal_type nparts = partptr.extent(0) - 1;
1707  {
1708  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1709  Kokkos::parallel_for
1710  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1711  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1712  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1713  local_ordinal_type offset = 0;
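  // td_row_os is the index, within this part's tridiag, of row ri0's diagonal
  // block: 0 for the first row, RowToIndex(r) + 1 = 3r afterwards (offset
  // switches from 0 to 1 after the first row, which has no subdiagonal).
  // Adding ri - ri0 in {-1, 0, 1} then selects the sub-, main, or
  // super-diagonal block.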
1714  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1715  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1716  offset = 1;
1717  const local_ordinal_type lr0 = lclrow(ri0);
1718  const size_type j0 = local_graph_rowptr(lr0);
1719  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1720  const local_ordinal_type lc = local_graph_colidx(j);
1721  const local_ordinal_type lc2r = col2row[lc];
1722  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
1723  const local_ordinal_type ri = lclrow2idx[lc2r];
1724  const local_ordinal_type pi = rowidx2part(ri);
1725  if (pi != pi0) continue;
1726  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1727  const local_ordinal_type row_entry = j - j0;
1728  D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1729  }
1730  }
1731  });
1732  }
1733 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1734  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1735  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1736 #endif
1737  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1738 
1739  // Allocate values.
1740  {
1741  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1742  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1743  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1744  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1745  }
1746  }
1747 
1748  // Construct the R graph.
1749  {
1750  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1751  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1752 
1753  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1754  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1755 
1756  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1757  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1758 
1759  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1760  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1761 
1762  {
1763  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1764  Kokkos::parallel_for
1765  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1766  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1767  const local_ordinal_type ri0 = lclrow2idx[lr];
1768  const local_ordinal_type pi0 = rowidx2part(ri0);
1769  const size_type j0 = local_graph_rowptr(lr);
1770  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1771  const local_ordinal_type lc = local_graph_colidx(j);
1772  const local_ordinal_type lc2r = col2row[lc];
1773  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1774  const local_ordinal_type ri = lclrow2idx[lc2r];
1775  const local_ordinal_type pi = rowidx2part(ri);
1776  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1777  continue;
1778  }
1779  }
1780  // exclusive scan will be performed later
1781  if (!overlap_communication_and_computation || lc < nrows) {
1782  ++R_rowptr(lr);
1783  } else {
1784  ++R_rowptr_remote(lr);
1785  }
1786  }
1787  });
1788  }
1789 
1790  // exclusive scan over the per-row counts; on the final pass the colindsub arrays are also filled using the now-known row offsets
1791  typedef ArrayValueType<size_type,2> update_type;
1792  {
1793  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1794  Kokkos::parallel_scan
1795  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1796  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1797  update_type &update,
1798  const bool &final) {
1799  update_type val;
1800  val.v[0] = R_rowptr(lr);
1801  if (overlap_communication_and_computation)
1802  val.v[1] = R_rowptr_remote(lr);
1803 
1804  if (final) {
1805  R_rowptr(lr) = update.v[0];
1806  if (overlap_communication_and_computation)
1807  R_rowptr_remote(lr) = update.v[1];
1808 
1809  if (lr < nrows) {
1810  const local_ordinal_type ri0 = lclrow2idx[lr];
1811  const local_ordinal_type pi0 = rowidx2part(ri0);
1812 
1813  size_type cnt_rowptr = R_rowptr(lr);
1814  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1815 
1816  const size_type j0 = local_graph_rowptr(lr);
1817  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1818  const local_ordinal_type lc = local_graph_colidx(j);
1819  const local_ordinal_type lc2r = col2row[lc];
1820  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1821  const local_ordinal_type ri = lclrow2idx[lc2r];
1822  const local_ordinal_type pi = rowidx2part(ri);
1823  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1824  continue;
1825  }
1826  const local_ordinal_type row_entry = j - j0;
1827  if (!overlap_communication_and_computation || lc < nrows)
1828  R_A_colindsub(cnt_rowptr++) = row_entry;
1829  else
1830  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1831  }
1832  }
1833  }
1834  update += val;
1835  });
1836  }
1837  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1838  Kokkos::deep_copy(amd.rowptr, R_rowptr);
1839  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1840  if (overlap_communication_and_computation) {
1841  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1842  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1843  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1844  }
1845 
1846  // Allocate or view values.
1847  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
1848 
1849  }
1850  }
1851  }
1852 
1853 
1857  template<typename ArgActiveExecutionMemorySpace>
1858  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
1859 
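 // Each specialization below chooses the KokkosBatched execution mode (Serial
 // on host, Team on GPUs) and Level-3 algorithm used by the factorization
 // kernels for a given memory space, plus a recommended team size.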
1860  template<>
1861  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1862  typedef KB::Mode::Serial mode_type;
1863 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1864  typedef KB::Algo::Level3::CompactMKL algo_type;
1865 #else
1866  typedef KB::Algo::Level3::Blocked algo_type;
1867 #endif
1868  static int recommended_team_size(const int /* blksize */,
1869  const int /* vector_length */,
1870  const int /* internal_vector_length */) {
1871  return 1;
1872  }
1873 
1874  };
1875 
1876 #if defined(KOKKOS_ENABLE_CUDA)
1877  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1878  const int vector_length,
1879  const int internal_vector_length) {
1880  const int vector_size = vector_length/internal_vector_length;
1881  int total_team_size(0);
1882  if (blksize <= 5) total_team_size = 32;
1883  else if (blksize <= 9) total_team_size = 32; // 64
1884  else if (blksize <= 12) total_team_size = 96;
1885  else if (blksize <= 16) total_team_size = 128;
1886  else if (blksize <= 20) total_team_size = 160;
1887  else total_team_size = 160;
1888  return 2*total_team_size/vector_size;
1889  }
1890  template<>
1891  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1892  typedef KB::Mode::Team mode_type;
1893  typedef KB::Algo::Level3::Unblocked algo_type;
1894  static int recommended_team_size(const int blksize,
1895  const int vector_length,
1896  const int internal_vector_length) {
1897  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1898  }
1899  };
1900  template<>
1901  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1902  typedef KB::Mode::Team mode_type;
1903  typedef KB::Algo::Level3::Unblocked algo_type;
1904  static int recommended_team_size(const int blksize,
1905  const int vector_length,
1906  const int internal_vector_length) {
1907  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1908  }
1909  };
1910 #endif
1911 
1912 #if defined(KOKKOS_ENABLE_HIP)
1913  static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
1914  const int vector_length,
1915  const int internal_vector_length) {
1916  const int vector_size = vector_length/internal_vector_length;
1917  int total_team_size(0);
1918  if (blksize <= 5) total_team_size = 32;
1919  else if (blksize <= 9) total_team_size = 32; // 64
1920  else if (blksize <= 12) total_team_size = 96;
1921  else if (blksize <= 16) total_team_size = 128;
1922  else if (blksize <= 20) total_team_size = 160;
1923  else total_team_size = 160;
1924  return 2*total_team_size/vector_size;
1925  }
1926  template<>
1927  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
1928  typedef KB::Mode::Team mode_type;
1929  typedef KB::Algo::Level3::Unblocked algo_type;
1930  static int recommended_team_size(const int blksize,
1931  const int vector_length,
1932  const int internal_vector_length) {
1933  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1934  }
1935  };
1936  template<>
1937  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
1938  typedef KB::Mode::Team mode_type;
1939  typedef KB::Algo::Level3::Unblocked algo_type;
1940  static int recommended_team_size(const int blksize,
1941  const int vector_length,
1942  const int internal_vector_length) {
1943  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1944  }
1945  };
1946 #endif
1947 
1948  template<typename MatrixType>
1949  struct ExtractAndFactorizeTridiags {
1950  public:
1951  using impl_type = ImplType<MatrixType>;
1952  // a functor cannot have both device_type and execution_space; it causes a specialization error in Kokkos
1953  using execution_space = typename impl_type::execution_space;
1954  using memory_space = typename impl_type::memory_space;
1956  using local_ordinal_type = typename impl_type::local_ordinal_type;
1957  using size_type = typename impl_type::size_type;
1958  using impl_scalar_type = typename impl_type::impl_scalar_type;
1959  using magnitude_type = typename impl_type::magnitude_type;
1961  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1963  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1964  using size_type_1d_view = typename impl_type::size_type_1d_view;
1965  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1967  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1968  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
1969  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1970  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1971  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
1972  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1973  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
1974 
1975  using internal_vector_type = typename impl_type::internal_vector_type;
1976  static constexpr int vector_length = impl_type::vector_length;
1977  static constexpr int internal_vector_length = impl_type::internal_vector_length;
1978 
1980  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1981  using member_type = typename team_policy_type::member_type;
1982 
1983  private:
1984  // part interface
1985  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1986  const local_ordinal_type max_partsz;
1987  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1988  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
1989  const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
1990  const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
1991  // block tridiags
1992  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1993  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1994  const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1995  const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
1996  // shared information
1997  const local_ordinal_type blocksize, blocksize_square;
1998  // diagonal safety
1999  const magnitude_type tiny;
2000  const local_ordinal_type vector_loop_size;
2001  const local_ordinal_type vector_length_value;
2002 
2003  public:
2004  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2005  const PartInterface<MatrixType> &interf_,
2006  const Teuchos::RCP<const block_crs_matrix_type> &A_,
2007  const magnitude_type& tiny_) :
2008  // interface
2009  partptr(interf_.partptr),
2010  lclrow(interf_.lclrow),
2011  packptr(interf_.packptr),
2012  max_partsz(interf_.max_partsz),
2013  // block crs matrix
2014  A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
2015  A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
2016  // block tridiags
2017  pack_td_ptr(btdm_.pack_td_ptr),
2018  flat_td_ptr(btdm_.flat_td_ptr),
2019  A_colindsub(btdm_.A_colindsub),
2020  internal_vector_values((internal_vector_type*)btdm_.values.data(),
2021  btdm_.values.extent(0),
2022  btdm_.values.extent(1),
2023  btdm_.values.extent(2),
2024  vector_length/internal_vector_length),
2025  scalar_values((btdm_scalar_type*)btdm_.values.data(),
2026  btdm_.values.extent(0),
2027  btdm_.values.extent(1),
2028  btdm_.values.extent(2),
2029  vector_length),
2030  blocksize(btdm_.values.extent(1)),
2031  blocksize_square(blocksize*blocksize),
2032  // diagonal weight to avoid zero pivots
2033  tiny(tiny_),
2034  vector_loop_size(vector_length/internal_vector_length),
2035  vector_length_value(vector_length) {}
2036 
2037  private:
2038 
2039  KOKKOS_INLINE_FUNCTION
2040  void
2041  extract(local_ordinal_type partidx,
2042  local_ordinal_type npacks) const {
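  // Serial (one-thread) extraction: for each part packed into this pack
  // (one SIMD lane per part), walk its rows and copy the up-to-three
  // tridiagonal blocks of A, located via A_rowptr, lclrow, and A_colindsub,
  // into the packed internal_vector_values layout, lane by lane.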
2043  const size_type kps = pack_td_ptr(partidx);
2044  local_ordinal_type kfs[vector_length] = {};
2045  local_ordinal_type ri0[vector_length] = {};
2046  local_ordinal_type nrows[vector_length] = {};
2047 
2048  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2049  kfs[vi] = flat_td_ptr(partidx);
2050  ri0[vi] = partptr(partidx);
2051  nrows[vi] = partptr(partidx+1) - ri0[vi];
2052  }
2053  for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
2054  for (local_ordinal_type e=0;e<3;++e) {
2055  const impl_scalar_type* block[vector_length] = {};
2056  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2057  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2058  block[vi] = &A_values(Aj*blocksize_square);
2059  }
2060  const size_type pi = kps + j;
2061  ++j;
2062  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2063  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2064  const auto idx = ii*blocksize + jj;
2065  auto& v = internal_vector_values(pi, ii, jj, 0);
2066  for (local_ordinal_type vi=0;vi<npacks;++vi)
2067  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2068  }
2069  }
2070 
2071  if (nrows[0] == 1) break;
2072  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2073  for (local_ordinal_type vi=1;vi<npacks;++vi) {
2074  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2075  npacks = vi;
2076  break;
2077  }
2078  }
2079  }
2080  }
2081  }
2082 
2083  KOKKOS_INLINE_FUNCTION
2084  void
2085  extract(const member_type &member,
2086  const local_ordinal_type &partidxbeg,
2087  const local_ordinal_type &npacks,
2088  const local_ordinal_type &vbeg) const {
2089  local_ordinal_type kfs_vals[internal_vector_length] = {};
2090  local_ordinal_type ri0_vals[internal_vector_length] = {};
2091  local_ordinal_type nrows_vals[internal_vector_length] = {};
2092 
2093  const size_type kps = pack_td_ptr(partidxbeg);
2094  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2095  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
2096  ri0_vals[vi] = partptr(partidxbeg+vi);
2097  nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
2098  }
2099 
2100  local_ordinal_type j_vals[internal_vector_length] = {};
2101  for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
2102  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2103  const local_ordinal_type nrows = nrows_vals[vi];
2104  if (tr < nrows) {
2105  auto &j = j_vals[vi];
2106  const local_ordinal_type kfs = kfs_vals[vi];
2107  const local_ordinal_type ri0 = ri0_vals[vi];
2108  const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
2109  const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
2110  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2111  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2112  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2113  const size_type pi = kps + j;
2114  Kokkos::parallel_for
2115  (Kokkos::TeamThreadRange(member,blocksize),
2116  [&](const local_ordinal_type &ii) {
2117  for (local_ordinal_type jj=0;jj<blocksize;++jj)
2118  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[ii*blocksize + jj]);
2119  });
2120  }
2121  }
2122  }
2123  }
2124  }
2125 
2126  template<typename AAViewType,
2127  typename WWViewType>
2128  KOKKOS_INLINE_FUNCTION
2129  void
2130  factorize(const member_type &member,
2131  const local_ordinal_type &i0,
2132  const local_ordinal_type &nrows,
2133  const local_ordinal_type &v,
2134  const AAViewType &AA,
2135  const WWViewType &WW) const {
2136  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2137  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2138  typedef default_mode_and_algo_type::mode_type default_mode_type;
2139  typedef default_mode_and_algo_type::algo_type default_algo_type;
2140 
2141  // constant
2142  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2143 
2144  // subview pattern
2145  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2146  KB::LU<member_type,
2147  default_mode_type,KB::Algo::LU::Unblocked>
2148  ::invoke(member, A, tiny);
2149 
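  // Block-tridiagonal LU sweep (a sketch of the recurrence the loop below
  // implements): with A_r, U_r, L_r the diagonal, super-, and subdiagonal
  // blocks of row r, after LU-factoring A_0 we repeat for r = 1..nrows-1:
  //   U_{r-1} := L(A_{r-1})^{-1} * U_{r-1}   (Trsm, lower/unit)
  //   L_r     := L_r * U(A_{r-1})^{-1}       (Trsm, upper/non-unit)
  //   A_r     := A_r - L_r * U_{r-1}         (Gemm)
  //   A_r     := LU(A_r)                     (with tiny guarding against zero pivots)
  // The nrows == 1 branch instead replaces the block with its explicit
  // inverse (block Jacobi).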
2150  if (nrows > 1) {
2151  auto B = A;
2152  auto C = A;
2153  local_ordinal_type i = i0;
2154  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2155  B.assign_data( &AA(i+1,0,0,v) );
2156  KB::Trsm<member_type,
2157  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2158  default_mode_type,default_algo_type>
2159  ::invoke(member, one, A, B);
2160  C.assign_data( &AA(i+2,0,0,v) );
2161  KB::Trsm<member_type,
2162  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2163  default_mode_type,default_algo_type>
2164  ::invoke(member, one, A, C);
2165  A.assign_data( &AA(i+3,0,0,v) );
2166 
2167  member.team_barrier();
2168  KB::Gemm<member_type,
2169  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2170  default_mode_type,default_algo_type>
2171  ::invoke(member, -one, C, B, one, A);
2172  KB::LU<member_type,
2173  default_mode_type,KB::Algo::LU::Unblocked>
2174  ::invoke(member, A, tiny);
2175  }
2176  } else {
2177  // for block jacobi invert a matrix here
2178  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2179  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2180  ::invoke(member, A, W);
2181  KB::SetIdentity<member_type,default_mode_type>
2182  ::invoke(member, A);
2183  member.team_barrier();
2184  KB::Trsm<member_type,
2185  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2186  default_mode_type,default_algo_type>
2187  ::invoke(member, one, W, A);
2188  KB::Trsm<member_type,
2189  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2190  default_mode_type,default_algo_type>
2191  ::invoke(member, one, W, A);
2192  }
2193  }
2194 
2195  public:
2196 
2197  struct ExtractAndFactorizeTag {};
2198 
2199  KOKKOS_INLINE_FUNCTION
2200  void
2201  operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2202  // btdm is packed and sorted from the largest tridiag
2203  const local_ordinal_type packidx = member.league_rank();
2204 
2205  const local_ordinal_type partidx = packptr(packidx);
2206  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2207  const local_ordinal_type i0 = pack_td_ptr(partidx);
2208  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2209 
2210  internal_vector_scratch_type_3d_view
2211  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2212  if (vector_loop_size == 1) {
2213  extract(partidx, npacks);
2214  factorize(member, i0, nrows, 0, internal_vector_values, WW);
2215  } else {
2216  Kokkos::parallel_for
2217  (Kokkos::ThreadVectorRange(member, vector_loop_size),
2218  [&](const local_ordinal_type &v) {
2219  const local_ordinal_type vbeg = v*internal_vector_length;
2220  if (vbeg < npacks)
2221  extract(member, partidx+vbeg, npacks, vbeg);
2222  // this is not safe if the vector loop size differs from the vector size of
2223  // the team policy; we always ensure they match when constructing the team policy
2224  member.team_barrier();
2225  factorize(member, i0, nrows, v, internal_vector_values, WW);
2226  });
2227  }
2228  }
2229 
2230  void run() {
2231  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2232  const local_ordinal_type team_size =
2233  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2234  recommended_team_size(blocksize, vector_length, internal_vector_length);
2235  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2236  shmem_size(blocksize, blocksize, vector_loop_size);
2237 
2238  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2239  policy(packptr.extent(0)-1, team_size, vector_loop_size);
2240 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2241  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2242  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2243 #else
2244  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2245  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2246  policy, *this);
2247 #endif
2248  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2249  }
2250 
2251  };
2252 
2256  template<typename MatrixType>
2257  void
2258  performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2259  const PartInterface<MatrixType> &interf,
2260  BlockTridiags<MatrixType> &btdm,
2261  const typename ImplType<MatrixType>::magnitude_type tiny) {
2262  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2263  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2264  function.run();
2265  }
2266 
2270  template<typename MatrixType>
2271  struct MultiVectorConverter {
2272  public:
2273  using impl_type = ImplType<MatrixType>;
2274  using execution_space = typename impl_type::execution_space;
2275  using memory_space = typename impl_type::memory_space;
2276 
2277  using local_ordinal_type = typename impl_type::local_ordinal_type;
2278  using impl_scalar_type = typename impl_type::impl_scalar_type;
2279  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2280  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2281  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2282  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2283  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2284  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
2285  static constexpr int vector_length = impl_type::vector_length;
2286 
2287  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2288 
2289  private:
2290  // part interface
2291  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2292  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2293  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2294  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2295  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2296  const local_ordinal_type blocksize;
2297  const local_ordinal_type num_vectors;
2298 
2299  // packed multivector output (or input)
2300  vector_type_3d_view packed_multivector;
2301  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
2302 
2303  template<typename TagType>
2304  KOKKOS_INLINE_FUNCTION
2305  void copy_multivectors(const local_ordinal_type &j,
2306  const local_ordinal_type &vi,
2307  const local_ordinal_type &pri,
2308  const local_ordinal_type &ri0) const {
2309  for (local_ordinal_type col=0;col<num_vectors;++col)
2310  for (local_ordinal_type i=0;i<blocksize;++i)
2311  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2312  }
2313 
2314  public:
2315 
2316  MultiVectorConverter(const PartInterface<MatrixType> &interf,
2317  const vector_type_3d_view &pmv)
2318  : partptr(interf.partptr),
2319  packptr(interf.packptr),
2320  part2packrowidx0(interf.part2packrowidx0),
2321  part2rowidx0(interf.part2rowidx0),
2322  lclrow(interf.lclrow),
2323  blocksize(pmv.extent(1)),
2324  num_vectors(pmv.extent(2)),
2325  packed_multivector(pmv) {}
2326 
2327  // TODO: modify this routine similarly to the team-level functions
2328  // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
2329  KOKKOS_INLINE_FUNCTION
2330  void
2331  operator() (const local_ordinal_type &packidx) const {
2332  local_ordinal_type partidx = packptr(packidx);
2333  local_ordinal_type npacks = packptr(packidx+1) - partidx;
2334  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2335 
2336  local_ordinal_type ri0[vector_length] = {};
2337  local_ordinal_type nrows[vector_length] = {};
2338  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2339  ri0[v] = part2rowidx0(partidx);
2340  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2341  }
2342  for (local_ordinal_type j=0;j<nrows[0];++j) {
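  // Parts in a pack are ordered by non-increasing row count, so once j reaches
  // a shorter part's length that lane (and all later lanes) is exhausted;
  // shrink npacks accordingly before copying row j of each remaining lane.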
2343  local_ordinal_type cnt = 1;
2344  for (;cnt<npacks && j!= nrows[cnt];++cnt);
2345  npacks = cnt;
2346  const local_ordinal_type pri = pri0 + j;
2347  for (local_ordinal_type col=0;col<num_vectors;++col)
2348  for (local_ordinal_type i=0;i<blocksize;++i)
2349  for (local_ordinal_type v=0;v<npacks;++v)
2350  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2351  }
2352  }
2353 
2354  KOKKOS_INLINE_FUNCTION
2355  void
2356  operator() (const member_type &member) const {
2357  const local_ordinal_type packidx = member.league_rank();
2358  const local_ordinal_type partidx_begin = packptr(packidx);
2359  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2360  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2361  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2362  const local_ordinal_type partidx = partidx_begin + v;
2363  const local_ordinal_type ri0 = part2rowidx0(partidx);
2364  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2365 
2366  if (nrows == 1) {
2367  const local_ordinal_type pri = pri0;
2368  for (local_ordinal_type col=0;col<num_vectors;++col) {
2369  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2370  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2371  });
2372  }
2373  } else {
2374  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2375  const local_ordinal_type pri = pri0 + j;
2376  for (local_ordinal_type col=0;col<num_vectors;++col)
2377  for (local_ordinal_type i=0;i<blocksize;++i)
2378  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2379  });
2380  }
2381  });
2382  }
2383 
2384  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2385  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2386  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2387 
2388  scalar_multivector = scalar_multivector_;
2389  if (is_cuda<execution_space>::value) {
2390 #if defined(KOKKOS_ENABLE_CUDA)
2391  const local_ordinal_type vl = vector_length;
2392  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2393  Kokkos::parallel_for
2394  ("MultiVectorConverter::TeamPolicy", policy, *this);
2395 #endif
2396  } else if(is_hip<execution_space>::value) {
2397 #if defined(KOKKOS_ENABLE_HIP)
2398  const local_ordinal_type vl = vector_length;
2399  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2400  Kokkos::parallel_for
2401  ("MultiVectorConverter::TeamPolicy", policy, *this);
2402 #endif
2403  } else {
2404 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2405  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
2406 #else
2407  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2408  Kokkos::parallel_for
2409  ("MultiVectorConverter::RangePolicy", policy, *this);
2410 #endif
2411  }
2412  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2413  }
2414  };
2415 
2419  template<typename ArgActiveExecutionMemorySpace>
2420  struct SolveTridiagsDefaultModeAndAlgo;
2421 
2422  template<>
2423  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2424  typedef KB::Mode::Serial mode_type;
2425  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2426 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2427  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2428 #else
2429  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2430 #endif
2431  static int recommended_team_size(const int /* blksize */,
2432  const int /* vector_length */,
2433  const int /* internal_vector_length */) {
2434  return 1;
2435  }
2436  };
2437 
2438 #if defined(KOKKOS_ENABLE_CUDA)
2439  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2440  const int vector_length,
2441  const int internal_vector_length) {
2442  const int vector_size = vector_length/internal_vector_length;
2443  int total_team_size(0);
2444  if (blksize <= 5) total_team_size = 32;
2445  else if (blksize <= 9) total_team_size = 32; // 64
2446  else if (blksize <= 12) total_team_size = 96;
2447  else if (blksize <= 16) total_team_size = 128;
2448  else if (blksize <= 20) total_team_size = 160;
2449  else total_team_size = 160;
2450  return total_team_size/vector_size;
2451  }
2452 
2453  template<>
2454  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2455  typedef KB::Mode::Team mode_type;
2456  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2457  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2458  static int recommended_team_size(const int blksize,
2459  const int vector_length,
2460  const int internal_vector_length) {
2461  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2462  }
2463  };
2464  template<>
2465  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2466  typedef KB::Mode::Team mode_type;
2467  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2468  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2469  static int recommended_team_size(const int blksize,
2470  const int vector_length,
2471  const int internal_vector_length) {
2472  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2473  }
2474  };
2475 #endif
2476 
2477 #if defined(KOKKOS_ENABLE_HIP)
2478  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
2479  const int vector_length,
2480  const int internal_vector_length) {
2481  const int vector_size = vector_length/internal_vector_length;
2482  int total_team_size(0);
2483  if (blksize <= 5) total_team_size = 32;
2484  else if (blksize <= 9) total_team_size = 32; // 64
2485  else if (blksize <= 12) total_team_size = 96;
2486  else if (blksize <= 16) total_team_size = 128;
2487  else if (blksize <= 20) total_team_size = 160;
2488  else total_team_size = 160;
2489  return total_team_size/vector_size;
2490  }
2491 
2492  template<>
2493  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
2494  typedef KB::Mode::Team mode_type;
2495  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2496  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2497  static int recommended_team_size(const int blksize,
2498  const int vector_length,
2499  const int internal_vector_length) {
2500  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2501  }
2502  };
2503  template<>
2504  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
2505  typedef KB::Mode::Team mode_type;
2506  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2507  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2508  static int recommended_team_size(const int blksize,
2509  const int vector_length,
2510  const int internal_vector_length) {
2511  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2512  }
2513  };
2514 #endif
2515 
2516  template<typename MatrixType>
2517  struct SolveTridiags {
2518  public:
2519  using impl_type = ImplType<MatrixType>;
2520  using execution_space = typename impl_type::execution_space;
2521 
2522  using local_ordinal_type = typename impl_type::local_ordinal_type;
2523  using size_type = typename impl_type::size_type;
2524  using impl_scalar_type = typename impl_type::impl_scalar_type;
2525  using magnitude_type = typename impl_type::magnitude_type;
2526  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2527  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2529  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2530  using size_type_1d_view = typename impl_type::size_type_1d_view;
2532  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2533  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2534  //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2535 
2536  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2537 
2538  using internal_vector_type = typename impl_type::internal_vector_type;
2539  static constexpr int vector_length = impl_type::vector_length;
2540  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2541 
2543  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2544  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2545 
2547  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2548  using member_type = typename team_policy_type::member_type;
2549 
2550  private:
2551  // part interface
2552  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2553  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2554  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2555  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2556 
2557  // block tridiags
2558  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2559 
2560  // block tridiags values
2561  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2562  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2563 
2564  const local_ordinal_type vector_loop_size;
2565 
2566  // copy to multivectors : damping factor and Y_scalar_multivector
2567  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2568 #if defined(__CUDA_ARCH__)
2569  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2570 #else
2571  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2572 #endif
2573  const impl_scalar_type df;
2574  const bool compute_diff;
2575 
2576  public:
2577  SolveTridiags(const PartInterface<MatrixType> &interf,
2578  const BlockTridiags<MatrixType> &btdm,
2579  const vector_type_3d_view &pmv,
2580  const impl_scalar_type damping_factor,
2581  const bool is_norm_manager_active)
2582  :
2583  // interface
2584  partptr(interf.partptr),
2585  packptr(interf.packptr),
2586  part2packrowidx0(interf.part2packrowidx0),
2587  lclrow(interf.lclrow),
2588  // block tridiags and multivector
2589  pack_td_ptr(btdm.pack_td_ptr),
2590  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2591  btdm.values.extent(0),
2592  btdm.values.extent(1),
2593  btdm.values.extent(2),
2594  vector_length/internal_vector_length),
2595  X_internal_vector_values((internal_vector_type*)pmv.data(),
2596  pmv.extent(0),
2597  pmv.extent(1),
2598  pmv.extent(2),
2599  vector_length/internal_vector_length),
2600  vector_loop_size(vector_length/internal_vector_length),
2601  Y_scalar_multivector(),
2602  Z_scalar_vector(),
2603  df(damping_factor),
2604  compute_diff(is_norm_manager_active)
2605  {}
2606 
2607  public:
2608 
2610  KOKKOS_INLINE_FUNCTION
2611  void
2612  copyToFlatMultiVector(const member_type &member,
2613  const local_ordinal_type partidxbeg, // partidx for v = 0
2614  const local_ordinal_type npacks,
2615  const local_ordinal_type pri0,
2616  const local_ordinal_type v, // index with a loop of vector_loop_size
2617  const local_ordinal_type blocksize,
2618  const local_ordinal_type num_vectors) const {
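  // Scatter the packed (SIMD) solution back into the flat Tpetra multivector
  // with damping, y := y + df * (x_packed - y), and accumulate the squared
  // norm of the update into Z_scalar_vector for the norm manager (the
  // compute_diff guard is currently commented out below).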
2619  const local_ordinal_type vbeg = v*internal_vector_length;
2620  if (vbeg < npacks) {
2621  local_ordinal_type ri0_vals[internal_vector_length] = {};
2622  local_ordinal_type nrows_vals[internal_vector_length] = {};
2623  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2624  const local_ordinal_type partidx = partidxbeg+vv;
2625  ri0_vals[vi] = partptr(partidx);
2626  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2627  }
2628 
2629  impl_scalar_type z_partial_sum(0);
2630  if (nrows_vals[0] == 1) {
2631  const local_ordinal_type j=0, pri=pri0;
2632  {
2633  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2634  const local_ordinal_type ri0 = ri0_vals[vi];
2635  const local_ordinal_type nrows = nrows_vals[vi];
2636  if (j < nrows) {
2637  Kokkos::parallel_for
2638  (Kokkos::TeamThreadRange(member, blocksize),
2639  [&](const local_ordinal_type &i) {
2640  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2641  for (local_ordinal_type col=0;col<num_vectors;++col) {
2642  impl_scalar_type &y = Y_scalar_multivector(row,col);
2643  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2644  y += df*yd;
2645 
2646  {//if (compute_diff) {
2647  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2648  z_partial_sum += yd_abs*yd_abs;
2649  }
2650  }
2651  });
2652  }
2653  }
2654  }
2655  } else {
2656  Kokkos::parallel_for
2657  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2658  [&](const local_ordinal_type &j) {
2659  const local_ordinal_type pri = pri0 + j;
2660  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2661  const local_ordinal_type ri0 = ri0_vals[vi];
2662  const local_ordinal_type nrows = nrows_vals[vi];
2663  if (j < nrows) {
2664  for (local_ordinal_type col=0;col<num_vectors;++col) {
2665  for (local_ordinal_type i=0;i<blocksize;++i) {
2666  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2667  impl_scalar_type &y = Y_scalar_multivector(row,col);
2668  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2669  y += df*yd;
2670 
2671  {//if (compute_diff) {
2672  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2673  z_partial_sum += yd_abs*yd_abs;
2674  }
2675  }
2676  }
2677  }
2678  }
2679  });
2680  }
2681  //if (compute_diff)
2682  Z_scalar_vector(member.league_rank()) += z_partial_sum;
2683  }
2684  }
2685 
2689  template<typename WWViewType>
2690  KOKKOS_INLINE_FUNCTION
2691  void
2692  solveSingleVector(const member_type &member,
2693  const local_ordinal_type &blocksize,
2694  const local_ordinal_type &i0,
2695  const local_ordinal_type &r0,
2696  const local_ordinal_type &nrows,
2697  const local_ordinal_type &v,
2698  const WWViewType &WW) const {
2699  typedef SolveTridiagsDefaultModeAndAlgo
2700  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2701  typedef default_mode_and_algo_type::mode_type default_mode_type;
2702  typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2703 
2704  // base pointers
2705  auto A = D_internal_vector_values.data();
2706  auto X = X_internal_vector_values.data();
2707 
2708  // constant
2709  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2710  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2711  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2712 
2713  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2714  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2715  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2716  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2717  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2718  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2719 
2720  // for multiple rhs
2721  //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2722  //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2723 
2724  // move to starting point
2725  A += i0*astep + v;
2726  X += r0*xstep + v;
2727 
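  // Single-vector solve using raw strided pointers into the factored tridiag
  // (a sketch): a forward substitution sweep with the unit-lower factors and
  // the stored subdiagonal blocks, then a backward sweep with the upper
  // factors and the stored superdiagonal blocks; the nrows == 1 case instead
  // applies the explicitly inverted block (block Jacobi).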
2728  //for (local_ordinal_type col=0;col<num_vectors;++col)
2729  if (nrows > 1) {
2730  // solve Lx = x
2731  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2732  (default_mode_type,default_algo_type,
2733  member,
2734  KB::Diag::Unit,
2735  blocksize,blocksize,
2736  one,
2737  A, as0, as1,
2738  X, xs0);
2739 
2740  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2741  member.team_barrier();
2742  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2743  (default_mode_type,default_algo_type,
2744  member,
2745  blocksize, blocksize,
2746  -one,
2747  A+2*astep, as0, as1,
2748  X, xs0,
2749  one,
2750  X+1*xstep, xs0);
2751  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2752  (default_mode_type,default_algo_type,
2753  member,
2754  KB::Diag::Unit,
2755  blocksize,blocksize,
2756  one,
2757  A+3*astep, as0, as1,
2758  X+1*xstep, xs0);
2759 
2760  A += 3*astep;
2761  X += 1*xstep;
2762  }
2763 
2764  // solve Ux = x
2765  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2766  (default_mode_type,default_algo_type,
2767  member,
2768  KB::Diag::NonUnit,
2769  blocksize, blocksize,
2770  one,
2771  A, as0, as1,
2772  X, xs0);
2773 
2774  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2775  A -= 3*astep;
2776  member.team_barrier();
2777  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2778  (default_mode_type,default_algo_type,
2779  member,
2780  blocksize, blocksize,
2781  -one,
2782  A+1*astep, as0, as1,
2783  X, xs0,
2784  one,
2785  X-1*xstep, xs0);
2786  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2787  (default_mode_type,default_algo_type,
2788  member,
2789  KB::Diag::NonUnit,
2790  blocksize, blocksize,
2791  one,
2792  A, as0, as1,
2793  X-1*xstep,xs0);
2794  X -= 1*xstep;
2795  }
2796  // for multiple rhs
2797  //X += xs1;
2798  } else {
2799  const local_ordinal_type ws0 = WW.stride_0();
2800  auto W = WW.data() + v;
2801  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2802  (default_mode_type,
2803  member, blocksize, X, xs0, W, ws0);
2804  member.team_barrier();
2805  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2806  (default_mode_type,default_algo_type,
2807  member,
2808  blocksize, blocksize,
2809  one,
2810  A, as0, as1,
2811  W, xs0,
2812  zero,
2813  X, xs0);
2814  }
2815  }
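 // Sketch (comment only, not part of the build): solveSingleVector applies the
 // block-Thomas recurrence to one packed right-hand side. With the factored
 // blocks laid out as indexed above (A = current diagonal, A+1*astep = its
 // super-diagonal, A+2*astep = the next row's sub-diagonal, A+3*astep = the
 // next diagonal), the two loops amount to
 //
 //   x_0 <- L_0^{-1} x_0
 //   x_r <- L_r^{-1} (x_r - B_r x_{r-1}),   r = 1, ..., nrows-1     (forward sweep)
 //   x_{nrows-1} <- U_{nrows-1}^{-1} x_{nrows-1}
 //   x_r <- U_r^{-1} (x_r - C_r x_{r+1}),   r = nrows-2, ..., 0     (backward sweep)
 //
 // where L_r/U_r are the triangular factors of the r-th diagonal block and
 // B_r/C_r stand for the sub-/super-diagonal blocks; these names are
 // illustrative only and are not identifiers used elsewhere in this file.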
2816 
2817  template<typename WWViewType>
2818  KOKKOS_INLINE_FUNCTION
2819  void
2820  solveMultiVector(const member_type &member,
2821  const local_ordinal_type &/* blocksize */,
2822  const local_ordinal_type &i0,
2823  const local_ordinal_type &r0,
2824  const local_ordinal_type &nrows,
2825  const local_ordinal_type &v,
2826  const WWViewType &WW) const {
2827  typedef SolveTridiagsDefaultModeAndAlgo
2828  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2829  typedef default_mode_and_algo_type::mode_type default_mode_type;
2830  typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2831 
2832  // constant
2833  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2834  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2835 
2836  // subview pattern
2837  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2838  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2839  auto X2 = X1;
2840 
2841  local_ordinal_type i = i0, r = r0;
2842 
2843 
2844  if (nrows > 1) {
2845  // solve Lx = x
2846  KB::Trsm<member_type,
2847  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2848  default_mode_type,default_algo_type>
2849  ::invoke(member, one, A, X1);
2850  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2851  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2852  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2853  member.team_barrier();
2854  KB::Gemm<member_type,
2855  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2856  default_mode_type,default_algo_type>
2857  ::invoke(member, -one, A, X1, one, X2);
2858  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2859  KB::Trsm<member_type,
2860  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2861  default_mode_type,default_algo_type>
2862  ::invoke(member, one, A, X2);
2863  X1.assign_data( X2.data() );
2864  }
2865 
2866  // solve Ux = x
2867  KB::Trsm<member_type,
2868  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2869  default_mode_type,default_algo_type>
2870  ::invoke(member, one, A, X1);
2871  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2872  i -= 3;
2873  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2874  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2875  member.team_barrier();
2876  KB::Gemm<member_type,
2877  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2878  default_mode_type,default_algo_type>
2879  ::invoke(member, -one, A, X1, one, X2);
2880 
2881  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2882  KB::Trsm<member_type,
2883  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2884  default_mode_type,default_algo_type>
2885  ::invoke(member, one, A, X2);
2886  X1.assign_data( X2.data() );
2887  }
2888  } else {
2889  // matrix is already inverted
2890  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2891  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2892  ::invoke(member, X1, W);
2893  member.team_barrier();
2894  KB::Gemm<member_type,
2895  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2896  default_mode_type,default_algo_type>
2897  ::invoke(member, one, A, W, zero, X1);
2898  }
2899  }
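 // Note on the nrows == 1 branch above: the single stored block is already an
 // explicit inverse, so the "solve" is simply X1 <- A * X1. The Copy into the
 // team-scratch view W is there so the Gemm does not read and write the same
 // block (X1) in place; W is a temporary and is not observed outside this
 // routine.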
2900 
2901  template<int B> struct SingleVectorTag {};
2902  template<int B> struct MultiVectorTag {};
2903 
2904  template<int B>
2905  KOKKOS_INLINE_FUNCTION
2906  void
2907  operator() (const SingleVectorTag<B> &, const member_type &member) const {
2908  const local_ordinal_type packidx = member.league_rank();
2909  const local_ordinal_type partidx = packptr(packidx);
2910  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2911  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2912  const local_ordinal_type i0 = pack_td_ptr(partidx);
2913  const local_ordinal_type r0 = part2packrowidx0(partidx);
2914  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2915  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2916  const local_ordinal_type num_vectors = 1;
2917  internal_vector_scratch_type_3d_view
2918  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2919  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2920  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2921  });
2922  Kokkos::parallel_for
2923  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2924  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2925  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2926  });
2927  }
2928 
2929  template<int B>
2930  KOKKOS_INLINE_FUNCTION
2931  void
2932  operator() (const MultiVectorTag<B> &, const member_type &member) const {
2933  const local_ordinal_type packidx = member.league_rank();
2934  const local_ordinal_type partidx = packptr(packidx);
2935  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2936  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2937  const local_ordinal_type i0 = pack_td_ptr(partidx);
2938  const local_ordinal_type r0 = part2packrowidx0(partidx);
2939  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2940  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2941  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2942 
2943  internal_vector_scratch_type_3d_view
2944  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2945  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2946  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2947  });
2948  Kokkos::parallel_for
2949  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2950  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2951  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2952  });
2953  }
2954 
2955  void run(const impl_scalar_type_2d_view_tpetra &Y,
2956  const impl_scalar_type_1d_view &Z) {
2957  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2958  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
2959 
2961  this->Y_scalar_multivector = Y;
2962  this->Z_scalar_vector = Z;
2963 
2964  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2965  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2966 
2967  const local_ordinal_type team_size =
2968  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2969  recommended_team_size(blocksize, vector_length, internal_vector_length);
2970  const int per_team_scratch = internal_vector_scratch_type_3d_view
2971  ::shmem_size(blocksize, num_vectors, vector_loop_size);
2972 
2973 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2974 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2975  if (num_vectors == 1) { \
2976  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2977  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2978  Kokkos::parallel_for \
2979  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2980  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2981  } else { \
2982  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2983  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2984  Kokkos::parallel_for \
2985  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2986  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2987  } break
2988 #else
2989 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2990  if (num_vectors == 1) { \
2991  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2992  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2993  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2994  Kokkos::parallel_for \
2995  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2996  policy, *this); \
2997  } else { \
2998  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2999  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
3000  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
3001  Kokkos::parallel_for \
3002  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
3003  policy, *this); \
3004  } break
3005 #endif
3006  switch (blocksize) {
3007  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
3008  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
3009  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
3010  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
3011  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
3012  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
3013  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
3014  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
3015  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
3016  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
3017  }
3018 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
3019 
3020  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3021  }
3022  };
3023 
3027  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
3028  const int team_size) {
3029  int total_team_size(0);
3030  if (blksize <= 5) total_team_size = 32;
3031  else if (blksize <= 9) total_team_size = 32; // 64
3032  else if (blksize <= 12) total_team_size = 96;
3033  else if (blksize <= 16) total_team_size = 128;
3034  else if (blksize <= 20) total_team_size = 160;
3035  else total_team_size = 160;
3036  return total_team_size/team_size;
3037  }
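 // Worked example of the mapping above: for blksize = 7 the total team size is
 // 32, so with the team_size of 8 used by the run() methods below this returns
 // a vector length of 32/8 = 4; for blksize = 16 it returns 128/8 = 16.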
3038 
3039  static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
3040  const int team_size) {
3041  int total_team_size(0);
3042  if (blksize <= 5) total_team_size = 32;
3043  else if (blksize <= 9) total_team_size = 32; // 64
3044  else if (blksize <= 12) total_team_size = 96;
3045  else if (blksize <= 16) total_team_size = 128;
3046  else if (blksize <= 20) total_team_size = 160;
3047  else total_team_size = 160;
3048  return total_team_size/team_size;
3049  }
3050 
3051  template<typename MatrixType>
3052  struct ComputeResidualVector {
3053  public:
3054  using impl_type = ImplType<MatrixType>;
3055  using node_device_type = typename impl_type::node_device_type;
3056  using execution_space = typename impl_type::execution_space;
3057  using memory_space = typename impl_type::memory_space;
3058 
3059  using local_ordinal_type = typename impl_type::local_ordinal_type;
3060  using size_type = typename impl_type::size_type;
3061  using impl_scalar_type = typename impl_type::impl_scalar_type;
3062  using magnitude_type = typename impl_type::magnitude_type;
3063  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3064  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3066  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3067  using size_type_1d_view = typename impl_type::size_type_1d_view;
3068  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
3069  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3070  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
3071  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3072  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3073  static constexpr int vector_length = impl_type::vector_length;
3074 
3076  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3077 
3078  // enum for max blocksize and vector length
3079  enum : int { max_blocksize = 32 };
3080 
3081  private:
3082  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
3083  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
3084  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
3085  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
3086  Unmanaged<vector_type_3d_view> y_packed;
3087  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
3088 
3089  // AmD information
3090  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
3091  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
3092  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
3093 
3094  // block crs graph information
3095  // for cuda (kokkos crs graph uses a different size_type from size_t)
3096  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
3097  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
3098 
3099  // blocksize
3100  const local_ordinal_type blocksize_requested;
3101 
3102  // part interface
3103  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3104  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3105  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
3106  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3107  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3108  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
3109  const bool is_dm2cm_active;
3110 
3111  public:
3112  template<typename LocalCrsGraphType>
3113  ComputeResidualVector(const AmD<MatrixType> &amd,
3114  const LocalCrsGraphType &graph,
3115  const local_ordinal_type &blocksize_requested_,
3116  const PartInterface<MatrixType> &interf,
3117  const local_ordinal_type_1d_view &dm2cm_)
3118  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
3119  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
3120  tpetra_values(amd.tpetra_values),
3121  A_rowptr(graph.row_map),
3122  A_colind(graph.entries),
3123  blocksize_requested(blocksize_requested_),
3124  part2packrowidx0(interf.part2packrowidx0),
3125  part2rowidx0(interf.part2rowidx0),
3126  rowidx2part(interf.rowidx2part),
3127  partptr(interf.partptr),
3128  lclrow(interf.lclrow),
3129  dm2cm(dm2cm_),
3130  is_dm2cm_active(dm2cm_.span() > 0)
3131  {}
3132 
3133  inline
3134  void
3135  SerialGemv(const local_ordinal_type &blocksize,
3136  const impl_scalar_type * const KOKKOS_RESTRICT AA,
3137  const impl_scalar_type * const KOKKOS_RESTRICT xx,
3138  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
3139  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3140  impl_scalar_type val = 0;
3141  const local_ordinal_type offset = k0*blocksize;
3142 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
3143 # pragma ivdep
3144 #endif
3145 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
3146 # pragma unroll
3147 #endif
3148  for (local_ordinal_type k1=0;k1<blocksize;++k1)
3149  val += AA[offset+k1]*xx[k1];
3150  yy[k0] -= val;
3151  }
3152  }
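 // SerialGemv computes yy -= AA * xx for a single dense blocksize x blocksize
 // block stored row-major (AA[k0*blocksize + k1] is entry (k0,k1)), matching
 // the layout-right block access used for tpetra_values. For example, with
 // blocksize = 2:
 //   yy[0] -= AA[0]*xx[0] + AA[1]*xx[1];
 //   yy[1] -= AA[2]*xx[0] + AA[3]*xx[1];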
3153 
3154  template<typename bbViewType, typename yyViewType>
3155  KOKKOS_INLINE_FUNCTION
3156  void
3157  VectorCopy(const member_type &member,
3158  const local_ordinal_type &blocksize,
3159  const bbViewType &bb,
3160  const yyViewType &yy) const {
3161  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
3162  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
3163  });
3164  }
3165 
3166  template<typename AAViewType, typename xxViewType, typename yyViewType>
3167  KOKKOS_INLINE_FUNCTION
3168  void
3169  TeamVectorGemv(const member_type &member,
3170  const local_ordinal_type &blocksize,
3171  const AAViewType &AA,
3172  const xxViewType &xx,
3173  const yyViewType &yy) const {
3174  Kokkos::parallel_for
3175  (Kokkos::TeamThreadRange(member, blocksize),
3176  [&](const local_ordinal_type &k0) {
3177  impl_scalar_type val = 0;
3178  Kokkos::parallel_for
3179  (Kokkos::ThreadVectorRange(member, blocksize),
3180  [&](const local_ordinal_type &k1) {
3181  val += AA(k0,k1)*xx(k1);
3182  });
3183  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3184  });
3185  }
3186 
3187  template<typename AAViewType, typename xxViewType, typename yyViewType>
3188  KOKKOS_INLINE_FUNCTION
3189  void
3190  VectorGemv(const member_type &member,
3191  const local_ordinal_type &blocksize,
3192  const AAViewType &AA,
3193  const xxViewType &xx,
3194  const yyViewType &yy) const {
3195  Kokkos::parallel_for
3196  (Kokkos::ThreadVectorRange(member, blocksize),
3197  [&](const local_ordinal_type &k0) {
3198  impl_scalar_type val(0);
3199  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3200  val += AA(k0,k1)*xx(k1);
3201  }
3202  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3203  });
3204  }
3205 
3206  // template<typename AAViewType, typename xxViewType, typename yyViewType>
3207  // KOKKOS_INLINE_FUNCTION
3208  // void
3209  // VectorGemv(const member_type &member,
3210  // const local_ordinal_type &blocksize,
3211  // const AAViewType &AA,
3212  // const xxViewType &xx,
3213  // const yyViewType &yy) const {
3214  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3215  // impl_scalar_type val = 0;
3216  // Kokkos::parallel_for
3217  // (Kokkos::ThreadVectorRange(member, blocksize),
3218  // [&](const local_ordinal_type &k1) {
3219  // val += AA(k0,k1)*xx(k1);
3220  // });
3221  // Kokkos::atomic_fetch_add(&yy(k0), -val);
3222  // }
3223  // }
3224 
3225  struct SeqTag {};
3226 
3227  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3228  KOKKOS_INLINE_FUNCTION
3229  void
3230  operator() (const SeqTag &, const local_ordinal_type& i) const {
3231  const local_ordinal_type blocksize = blocksize_requested;
3232  const local_ordinal_type blocksize_square = blocksize*blocksize;
3233 
3234  // constants
3235  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3236  const local_ordinal_type num_vectors = y.extent(1);
3237  const local_ordinal_type row = i*blocksize;
3238  for (local_ordinal_type col=0;col<num_vectors;++col) {
3239  // y := b
3240  impl_scalar_type *yy = &y(row, col);
3241  const impl_scalar_type * const bb = &b(row, col);
3242  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3243 
3244  // y -= Rx
3245  const size_type A_k0 = A_rowptr[i];
3246  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3247  const size_type j = A_k0 + colindsub[k];
3248  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3249  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3250  SerialGemv(blocksize,AA,xx,yy);
3251  }
3252  }
3253  }
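 // Note: in the SeqTag kernels (this one and the team variant below), rowptr
 // and colindsub index only the entries of A that lie outside the packed block
 // tridiagonals, i.e. the "R = A - D" structure carried by AmD, so each pass
 // computes y = b - R x one block row at a time.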
3254 
3255  KOKKOS_INLINE_FUNCTION
3256  void
3257  operator() (const SeqTag &, const member_type &member) const {
3258 
3259  // constants
3260  const local_ordinal_type blocksize = blocksize_requested;
3261  const local_ordinal_type blocksize_square = blocksize*blocksize;
3262 
3263  const local_ordinal_type lr = member.league_rank();
3264  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3265  const local_ordinal_type num_vectors = y.extent(1);
3266 
3267  // subview pattern
3268  auto bb = Kokkos::subview(b, block_range, 0);
3269  auto xx = bb;
3270  auto yy = Kokkos::subview(y, block_range, 0);
3271  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3272 
3273  const local_ordinal_type row = lr*blocksize;
3274  for (local_ordinal_type col=0;col<num_vectors;++col) {
3275  // y := b
3276  yy.assign_data(&y(row, col));
3277  bb.assign_data(&b(row, col));
3278  if (member.team_rank() == 0)
3279  VectorCopy(member, blocksize, bb, yy);
3280  member.team_barrier();
3281 
3282  // y -= Rx
3283  const size_type A_k0 = A_rowptr[lr];
3284  Kokkos::parallel_for
3285  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3286  [&](const local_ordinal_type &k) {
3287  const size_type j = A_k0 + colindsub[k];
3288  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3289  xx.assign_data( &x(A_colind[j]*blocksize, col) );
3290  VectorGemv(member, blocksize, A_block, xx, yy);
3291  });
3292  }
3293  }
3294 
3295  template<int B>
3296  struct AsyncTag {};
3297 
3298  template<int B>
3299  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3300  KOKKOS_INLINE_FUNCTION
3301  void
3302  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3303  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3304  const local_ordinal_type blocksize_square = blocksize*blocksize;
3305 
3306  // constants
3307  const local_ordinal_type partidx = rowidx2part(rowidx);
3308  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3309  const local_ordinal_type v = partidx % vector_length;
3310 
3311  const local_ordinal_type num_vectors = y_packed.extent(2);
3312  const local_ordinal_type num_local_rows = lclrow.extent(0);
3313 
3314  // temporary buffer for y flat
3315  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3316 
3317  const local_ordinal_type lr = lclrow(rowidx);
3318  const local_ordinal_type row = lr*blocksize;
3319  for (local_ordinal_type col=0;col<num_vectors;++col) {
3320  // y := b
3321  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3322 
3323  // y -= Rx
3324  const size_type A_k0 = A_rowptr[lr];
3325  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3326  const size_type j = A_k0 + colindsub[k];
3327  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3328  const local_ordinal_type A_colind_at_j = A_colind[j];
3329  if (A_colind_at_j < num_local_rows) {
3330  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3331  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3332  SerialGemv(blocksize, AA,xx,yy);
3333  } else {
3334  const auto loc = A_colind_at_j - num_local_rows;
3335  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3336  SerialGemv(blocksize, AA,xx_remote,yy);
3337  }
3338  }
3339  // move yy to y_packed
3340  for (local_ordinal_type k=0;k<blocksize;++k)
3341  y_packed(pri, k, col)[v] = yy[k];
3342  }
3343  }
3344 
3345  template<int B>
3346  KOKKOS_INLINE_FUNCTION
3347  void
3348  operator() (const AsyncTag<B> &, const member_type &member) const {
3349  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3350  const local_ordinal_type blocksize_square = blocksize*blocksize;
3351 
3352  // constants
3353  const local_ordinal_type rowidx = member.league_rank();
3354  const local_ordinal_type partidx = rowidx2part(rowidx);
3355  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3356  const local_ordinal_type v = partidx % vector_length;
3357 
3358  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3359  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3360  const local_ordinal_type num_local_rows = lclrow.extent(0);
3361 
3362  // subview pattern
3363  auto bb = Kokkos::subview(b, block_range, 0);
3364  auto xx = Kokkos::subview(x, block_range, 0);
3365  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3366  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3367  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3368 
3369  const local_ordinal_type lr = lclrow(rowidx);
3370  const local_ordinal_type row = lr*blocksize;
3371  for (local_ordinal_type col=0;col<num_vectors;++col) {
3372  // y := b
3373  bb.assign_data(&b(row, col));
3374  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3375  if (member.team_rank() == 0)
3376  VectorCopy(member, blocksize, bb, yy);
3377  member.team_barrier();
3378 
3379  // y -= Rx
3380  const size_type A_k0 = A_rowptr[lr];
3381  Kokkos::parallel_for
3382  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3383  [&](const local_ordinal_type &k) {
3384  const size_type j = A_k0 + colindsub[k];
3385  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3386 
3387  const local_ordinal_type A_colind_at_j = A_colind[j];
3388  if (A_colind_at_j < num_local_rows) {
3389  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3390  xx.assign_data( &x(loc*blocksize, col) );
3391  VectorGemv(member, blocksize, A_block, xx, yy);
3392  } else {
3393  const auto loc = A_colind_at_j - num_local_rows;
3394  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3395  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3396  }
3397  });
3398  }
3399  }
3400 
3401  template <int P, int B> struct OverlapTag {};
3402 
3403  template<int P, int B>
3404  // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3405  KOKKOS_INLINE_FUNCTION
3406  void
3407  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3408  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3409  const local_ordinal_type blocksize_square = blocksize*blocksize;
3410 
3411  // constants
3412  const local_ordinal_type partidx = rowidx2part(rowidx);
3413  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3414  const local_ordinal_type v = partidx % vector_length;
3415 
3416  const local_ordinal_type num_vectors = y_packed.extent(2);
3417  const local_ordinal_type num_local_rows = lclrow.extent(0);
3418 
3419  // temporary buffer for y flat
3420  impl_scalar_type yy[max_blocksize] = {};
3421 
3422  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3423  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3424 
3425  const local_ordinal_type lr = lclrow(rowidx);
3426  const local_ordinal_type row = lr*blocksize;
3427  for (local_ordinal_type col=0;col<num_vectors;++col) {
3428  if (P == 0) {
3429  // y := b
3430  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3431  } else {
3432  // y (temporary) := 0
3433  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3434  }
3435 
3436  // y -= Rx
3437  const size_type A_k0 = A_rowptr[lr];
3438  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3439  const size_type j = A_k0 + colindsub_used[k];
3440  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3441  const local_ordinal_type A_colind_at_j = A_colind[j];
3442  if (P == 0) {
3443  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3444  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3445  SerialGemv(blocksize,AA,xx,yy);
3446  } else {
3447  const auto loc = A_colind_at_j - num_local_rows;
3448  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3449  SerialGemv(blocksize,AA,xx_remote,yy);
3450  }
3451  }
3452  // move yy to y_packed
3453  if (P == 0) {
3454  for (local_ordinal_type k=0;k<blocksize;++k)
3455  y_packed(pri, k, col)[v] = yy[k];
3456  } else {
3457  for (local_ordinal_type k=0;k<blocksize;++k)
3458  y_packed(pri, k, col)[v] += yy[k];
3459  }
3460  }
3461  }
3462 
3463  template<int P, int B>
3464  KOKKOS_INLINE_FUNCTION
3465  void
3466  operator() (const OverlapTag<P,B> &, const member_type &member) const {
3467  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3468  const local_ordinal_type blocksize_square = blocksize*blocksize;
3469 
3470  // constants
3471  const local_ordinal_type rowidx = member.league_rank();
3472  const local_ordinal_type partidx = rowidx2part(rowidx);
3473  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3474  const local_ordinal_type v = partidx % vector_length;
3475 
3476  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3477  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3478  const local_ordinal_type num_local_rows = lclrow.extent(0);
3479 
3480  // subview pattern
3481  auto bb = Kokkos::subview(b, block_range, 0);
3482  auto xx = bb; //Kokkos::subview(x, block_range, 0);
3483  auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3484  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3485  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3486  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3487  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3488 
3489  const local_ordinal_type lr = lclrow(rowidx);
3490  const local_ordinal_type row = lr*blocksize;
3491  for (local_ordinal_type col=0;col<num_vectors;++col) {
3492  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3493  if (P == 0) {
3494  // y := b
3495  bb.assign_data(&b(row, col));
3496  if (member.team_rank() == 0)
3497  VectorCopy(member, blocksize, bb, yy);
3498  member.team_barrier();
3499  }
3500 
3501  // y -= Rx
3502  const size_type A_k0 = A_rowptr[lr];
3503  Kokkos::parallel_for
3504  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3505  [&](const local_ordinal_type &k) {
3506  const size_type j = A_k0 + colindsub_used[k];
3507  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3508 
3509  const local_ordinal_type A_colind_at_j = A_colind[j];
3510  if (P == 0) {
3511  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3512  xx.assign_data( &x(loc*blocksize, col) );
3513  VectorGemv(member, blocksize, A_block, xx, yy);
3514  } else {
3515  const auto loc = A_colind_at_j - num_local_rows;
3516  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3517  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3518  }
3519  });
3520  }
3521  }
3522 
3523  // y = b - Rx; seq method
3524  template<typename MultiVectorLocalViewTypeY,
3525  typename MultiVectorLocalViewTypeB,
3526  typename MultiVectorLocalViewTypeX>
3527  void run(const MultiVectorLocalViewTypeY &y_,
3528  const MultiVectorLocalViewTypeB &b_,
3529  const MultiVectorLocalViewTypeX &x_) {
3530  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3531  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3532 
3533  y = y_; b = b_; x = x_;
3534  if (is_cuda<execution_space>::value) {
3535 #if defined(KOKKOS_ENABLE_CUDA)
3536  const local_ordinal_type blocksize = blocksize_requested;
3537  const local_ordinal_type team_size = 8;
3538  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3539  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3540  Kokkos::parallel_for
3541  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3542 #endif
3543  } else if(is_hip<execution_space>::value) {
3544 #if defined(KOKKOS_ENABLE_HIP)
3545  const local_ordinal_type blocksize = blocksize_requested;
3546  const local_ordinal_type team_size = 8;
3547  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3548  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3549  Kokkos::parallel_for
3550  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3551 #endif
3552  } else {
3553 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3554  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3555 #else
3556  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3557  Kokkos::parallel_for
3558  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3559 #endif
3560  }
3561  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3562  }
3563 
3564  // y = b - R (x , x_remote)
3565  template<typename MultiVectorLocalViewTypeB,
3566  typename MultiVectorLocalViewTypeX,
3567  typename MultiVectorLocalViewTypeX_Remote>
3568  void run(const vector_type_3d_view &y_packed_,
3569  const MultiVectorLocalViewTypeB &b_,
3570  const MultiVectorLocalViewTypeX &x_,
3571  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3572  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3573  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3574 
3575  b = b_; x = x_; x_remote = x_remote_;
3576  if (is_cuda<execution_space>::value) {
3577 #if defined(KOKKOS_ENABLE_CUDA)
3578  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3579  y_packed_.extent(0),
3580  y_packed_.extent(1),
3581  y_packed_.extent(2),
3582  vector_length);
3583 #endif
3584  } else if (is_hip<execution_space>::value) {
3585 #if defined(KOKKOS_ENABLE_HIP)
3586  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3587  y_packed_.extent(0),
3588  y_packed_.extent(1),
3589  y_packed_.extent(2),
3590  vector_length);
3591 #endif
3592  } else {
3593  y_packed = y_packed_;
3594  }
3595 
3596  if (is_cuda<execution_space>::value) {
3597 #if defined(KOKKOS_ENABLE_CUDA)
3598  const local_ordinal_type blocksize = blocksize_requested;
3599  const local_ordinal_type team_size = 8;
3600  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3601  // local_ordinal_type vl_power_of_two = 1;
3602  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3603  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3604  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3605 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3606  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3607  policy(rowidx2part.extent(0), team_size, vector_size); \
3608  Kokkos::parallel_for \
3609  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3610  policy, *this); } break
3611  switch (blocksize_requested) {
3612  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3613  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3614  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3615  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3616  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3617  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3618  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3619  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3620  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3621  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3622  }
3623 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3624 #endif
3625  } else if (is_hip<execution_space>::value) {
3626 #if defined(KOKKOS_ENABLE_HIP)
3627  const local_ordinal_type blocksize = blocksize_requested;
3628  const local_ordinal_type team_size = 8;
3629  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3630  // local_ordinal_type vl_power_of_two = 1;
3631  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3632  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3633  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3634 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3635  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3636  policy(rowidx2part.extent(0), team_size, vector_size); \
3637  Kokkos::parallel_for \
3638  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3639  policy, *this); } break
3640  switch (blocksize_requested) {
3641  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3642  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3643  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3644  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3645  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3646  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3647  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3648  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3649  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3650  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3651  }
3652 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3653 #endif
3654  } else {
3655 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3656  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3657 #else
3658 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3659  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3660  Kokkos::parallel_for \
3661  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3662  policy, *this); } break
3663  switch (blocksize_requested) {
3664  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3665  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3666  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3667  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3668  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3669  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3670  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3671  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3672  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3673  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3674  }
3675 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3676 #endif
3677  }
3678  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3679  }
3680 
3681  // y = b - R (y , y_remote)
3682  template<typename MultiVectorLocalViewTypeB,
3683  typename MultiVectorLocalViewTypeX,
3684  typename MultiVectorLocalViewTypeX_Remote>
3685  void run(const vector_type_3d_view &y_packed_,
3686  const MultiVectorLocalViewTypeB &b_,
3687  const MultiVectorLocalViewTypeX &x_,
3688  const MultiVectorLocalViewTypeX_Remote &x_remote_,
3689  const bool compute_owned) {
3690  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3691  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3692 
3693  b = b_; x = x_; x_remote = x_remote_;
3694  if (is_cuda<execution_space>::value) {
3695 #if defined(KOKKOS_ENABLE_CUDA)
3696  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3697  y_packed_.extent(0),
3698  y_packed_.extent(1),
3699  y_packed_.extent(2),
3700  vector_length);
3701 #endif
3702  } else if (is_hip<execution_space>::value) {
3703 #if defined(KOKKOS_ENABLE_HIP)
3704  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3705  y_packed_.extent(0),
3706  y_packed_.extent(1),
3707  y_packed_.extent(2),
3708  vector_length);
3709 #endif
3710  } else {
3711  y_packed = y_packed_;
3712  }
3713 
3714  if (is_cuda<execution_space>::value) {
3715 #if defined(KOKKOS_ENABLE_CUDA)
3716  const local_ordinal_type blocksize = blocksize_requested;
3717  const local_ordinal_type team_size = 8;
3718  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3719  // local_ordinal_type vl_power_of_two = 1;
3720  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3721  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3722  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3723 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3724  if (compute_owned) { \
3725  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3726  policy(rowidx2part.extent(0), team_size, vector_size); \
3727  Kokkos::parallel_for \
3728  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3729  } else { \
3730  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3731  policy(rowidx2part.extent(0), team_size, vector_size); \
3732  Kokkos::parallel_for \
3733  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3734  } break
3735  switch (blocksize_requested) {
3736  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3737  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3738  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3739  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3740  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3741  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3742  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3743  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3744  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3745  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3746  }
3747 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3748 #endif
3749  } else if (is_hip<execution_space>::value) {
3750 #if defined(KOKKOS_ENABLE_HIP)
3751  const local_ordinal_type blocksize = blocksize_requested;
3752  const local_ordinal_type team_size = 8;
3753  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3754  // local_ordinal_type vl_power_of_two = 1;
3755  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3756  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3757  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3758 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3759  if (compute_owned) { \
3760  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3761  policy(rowidx2part.extent(0), team_size, vector_size); \
3762  Kokkos::parallel_for \
3763  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3764  } else { \
3765  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3766  policy(rowidx2part.extent(0), team_size, vector_size); \
3767  Kokkos::parallel_for \
3768  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3769  } break
3770  switch (blocksize_requested) {
3771  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3772  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3773  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3774  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3775  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3776  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3777  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3778  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3779  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3780  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3781  }
3782 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3783 #endif
3784  } else {
3785 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3786  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3787 #else
3788 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3789  if (compute_owned) { \
3790  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3791  policy(0, rowidx2part.extent(0)); \
3792  Kokkos::parallel_for \
3793  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3794  } else { \
3795  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3796  policy(0, rowidx2part.extent(0)); \
3797  Kokkos::parallel_for \
3798  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3799  } break
3800 
3801  switch (blocksize_requested) {
3802  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3803  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3804  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3805  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3806  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3807  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3808  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3809  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3810  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3811  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3812  }
3813 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3814 #endif
3815  }
3816  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3817  }
3818  };
3819 
3820  template<typename MatrixType>
3821  void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3822  /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3823  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3824  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3825 
3826  using impl_type = ImplType<MatrixType>;
3827  using local_ordinal_type = typename impl_type::local_ordinal_type;
3828  using impl_scalar_type = typename impl_type::impl_scalar_type;
3829 #if 0
3830  const auto norm2 = KokkosBlas::nrm1(zz);
3831 #else
3832  impl_scalar_type norm2(0);
3833  Kokkos::parallel_reduce
3834  ("ReduceMultiVector::Device",
3835  Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3836  KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3837  update += zz(i);
3838  }, norm2);
3839 #endif
3840  vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3841 
3842  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3843  }
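 // reduceVector sums the entries of zz on device and stores the absolute value
 // of that sum in vals[0]. In applyInverseJacobi below, zz is the W vector
 // filled by SolveTridiags with per-team sums of squared update differences, so
 // vals[0] is the rank-local squared norm handed to NormManager::getBuffer().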
3844 
3848  template<typename MatrixType>
3849  struct NormManager {
3850  public:
3851  using impl_type = ImplType<MatrixType>;
3852  using host_execution_space = typename impl_type::host_execution_space;
3853  using magnitude_type = typename impl_type::magnitude_type;
3854 
3855  private:
3856  bool collective_;
3857  int sweep_step_, sweep_step_upper_bound_;
3858 #ifdef HAVE_IFPACK2_MPI
3859  MPI_Request mpi_request_;
3860  MPI_Comm comm_;
3861 #endif
3862  magnitude_type work_[3];
3863 
3864  public:
3865  NormManager() = default;
3866  NormManager(const NormManager &b) = default;
3867  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3868  sweep_step_ = 1;
3869  sweep_step_upper_bound_ = 1;
3870  collective_ = comm->getSize() > 1;
3871  if (collective_) {
3872 #ifdef HAVE_IFPACK2_MPI
3873  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3874  TEUCHOS_ASSERT( ! mpi_comm.is_null());
3875  comm_ = *mpi_comm->getRawMpiComm();
3876 #endif
3877  }
3878  const magnitude_type zero(0), minus_one(-1);
3879  work_[0] = zero;
3880  work_[1] = zero;
3881  work_[2] = minus_one;
3882  }
3883 
3884  // Check the norm every sweep_step sweeps.
3885  void setCheckFrequency(const int sweep_step) {
3886  TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3887  sweep_step_upper_bound_ = sweep_step;
3888  sweep_step_ = 1;
3889  }
3890 
3891  // Get the buffer into which to store rank-local squared norms.
3892  magnitude_type* getBuffer() { return &work_[0]; }
3893 
3894  // Call MPI_Iallreduce to find the global squared norms.
3895  void ireduce(const int sweep, const bool force = false) {
3896  if ( ! force && sweep % sweep_step_) return;
3897 
3898  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3899 
3900  work_[1] = work_[0];
3901 #ifdef HAVE_IFPACK2_MPI
3902  auto send_data = &work_[1];
3903  auto recv_data = &work_[0];
3904  if (collective_) {
3905 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3906  MPI_Iallreduce(send_data, recv_data, 1,
3907  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3908  MPI_SUM, comm_, &mpi_request_);
3909 # else
3910  MPI_Allreduce (send_data, recv_data, 1,
3911  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3912  MPI_SUM, comm_);
3913 # endif
3914  }
3915 #endif
3916  }
3917 
3918  // Check if the norm-based termination criterion is met. tol2 is the
3919  // tolerance squared. Sweep is the sweep index. If not every iteration is
3920  // being checked, this function immediately returns false. If a check must
3921  // be done at this iteration, it waits for the reduction triggered by
3922  // ireduce to complete, then checks the global norm against the tolerance.
3923  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3924  // early return
3925  if (sweep <= 0) return false;
3926 
3927  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3928 
3929  TEUCHOS_ASSERT(sweep >= 1);
3930  if ( ! force && (sweep - 1) % sweep_step_) return false;
3931  if (collective_) {
3932 #ifdef HAVE_IFPACK2_MPI
3933 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3934  MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3935 # else
3936  // Do nothing.
3937 # endif
3938 #endif
3939  }
3940  bool r_val = false;
3941  if (sweep == 1) {
3942  work_[2] = work_[0];
3943  } else {
3944  r_val = (work_[0] < tol2*work_[2]);
3945  }
3946 
3947  // adjust sweep step
3948  const auto adjusted_sweep_step = 2*sweep_step_;
3949  if (adjusted_sweep_step < sweep_step_upper_bound_) {
3950  sweep_step_ = adjusted_sweep_step;
3951  } else {
3952  sweep_step_ = sweep_step_upper_bound_;
3953  }
3954  return r_val;
3955  }
3956 
3957  // After termination has occurred, finalize the norms for use in
3958  // getNorms0 and getNormsFinal.
3959  void finalize () {
3960  work_[0] = std::sqrt(work_[0]); // after converged
3961  if (work_[2] >= 0)
3962  work_[2] = std::sqrt(work_[2]); // first norm
3963  // if work_[2] is minus one, then norm is not requested.
3964  }
3965 
3966  // Report norms to the caller.
3967  const magnitude_type getNorms0 () const { return work_[2]; }
3968  const magnitude_type getNormsFinal () const { return work_[0]; }
3969  };
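 // Typical NormManager usage, mirroring applyInverseJacobi below (sketch only):
 //   NormManager<MatrixType> nm(comm);
 //   nm.setCheckFrequency(check_tol_every);
 //   // per sweep: write the rank-local squared norm into nm.getBuffer()[0],
 //   // then nm.ireduce(sweep) starts the global reduction and
 //   // nm.checkDone(sweep, tol*tol) reports convergence once the global
 //   // squared norm falls below tol^2 times the first sweep's squared norm.
 //   nm.finalize(); // converts the stored squared norms to norms for getNorms0/getNormsFinal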
3970 
3974  template<typename MatrixType>
3975  int
3976  applyInverseJacobi(// importer
3977  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3978  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3979  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3980  const bool overlap_communication_and_computation,
3981  // tpetra interface
3982  const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3983  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3984  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3985  /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3986  // local object interface
3987  const PartInterface<MatrixType> &interf, // mesh interface
3988  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3989  const AmD<MatrixType> &amd, // R = A - D
3990  /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3991  /* */ NormManager<MatrixType> &norm_manager,
3992  // preconditioner parameters
3993  const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3994  /* */ bool is_y_zero,
3995  const int max_num_sweeps,
3996  const typename ImplType<MatrixType>::magnitude_type tol,
3997  const int check_tol_every) {
3998  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3999 
4000  using impl_type = ImplType<MatrixType>;
4001  using node_memory_space = typename impl_type::node_memory_space;
4002  using local_ordinal_type = typename impl_type::local_ordinal_type;
4003  using size_type = typename impl_type::size_type;
4004  using impl_scalar_type = typename impl_type::impl_scalar_type;
4005  using magnitude_type = typename impl_type::magnitude_type;
4006  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4007  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4008  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4009  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4010 
4011  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4012 
4013  // either tpetra importer or async importer must be active
4014  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4015  "Neither Tpetra importer nor Async importer is null.");
4016  // max number of sweeps should be positive number
4017  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4018  "Maximum number of sweeps must be >= 1.");
4019 
4020  // const parameters
4021  const bool is_seq_method_requested = !tpetra_importer.is_null();
4022  const bool is_async_importer_active = !async_importer.is_null();
4023  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4024  const magnitude_type tolerance = tol*tol;
4025  const local_ordinal_type blocksize = btdm.values.extent(1);
4026  const local_ordinal_type num_vectors = Y.getNumVectors();
4027  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4028 
4029  const impl_scalar_type zero(0.0);
4030 
4031  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4032  "The seq method for applyInverseJacobi, " <<
4033  "which in any case is for developer use only, " <<
4034  "does not support norm-based termination.");
4035  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4036  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4037  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4038  std::invalid_argument,
4039  "The seq method for applyInverseJacobi, " <<
4040  "which in any case is for developer use only, " <<
4041  "only supports memory spaces accessible from host.");
4042 
4043  // if workspace is needed more, resize it
4044  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4045  if (work.span() < work_span_required)
4046  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4047 
4048  // construct W
4049  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4050  if (local_ordinal_type(W.extent(0)) < W_size)
4051  W = impl_scalar_type_1d_view("W", W_size);
4052 
4053  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4054  {
4055  if (is_seq_method_requested) {
4056  if (Z.getNumVectors() != Y.getNumVectors())
4057  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4058  } else {
4059  if (is_async_importer_active) {
4060  // create comm data buffer and keep it here
4061  async_importer->createDataBuffer(num_vectors);
4062  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4063  }
4064  }
4065  }
4066 
4067  // wrap the workspace with 3d view
4068  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4069  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4070  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4071  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4072  if (is_y_zero) Kokkos::deep_copy(YY, zero);
4073 
4074  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4075  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4076  damping_factor, is_norm_manager_active);
4077 
4078  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4079  ComputeResidualVector<MatrixType>
4080  compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4081  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4082 
4083  // norm manager workspace resize
4084  if (is_norm_manager_active)
4085  norm_manager.setCheckFrequency(check_tol_every);
4086 
4087  // iterate
4088  int sweep = 0;
4089  for (;sweep<max_num_sweeps;++sweep) {
4090  {
4091  if (is_y_zero) {
4092  // pmv := x(lclrow)
4093  multivector_converter.run(XX);
4094  } else {
4095  if (is_seq_method_requested) {
4096  // SEQ METHOD IS TESTING ONLY
4097 
4098  // y := x - R y
4099  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4100  compute_residual_vector.run(YY, XX, ZZ);
4101 
4102  // pmv := y(lclrow).
4103  multivector_converter.run(YY);
4104  } else {
4105  // fused y := x - R y and pmv := y(lclrow);
4106  // real use case does not use overlap comp and comm
4107  if (overlap_communication_and_computation || !is_async_importer_active) {
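// Overlapped path (also taken when there is nothing to import): post the halo
// exchange, run the first residual pass on locally available data, and, once
// syncRecv completes, run the second pass that folds in the remote entries.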
4108  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4109  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4110  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4111  if (is_async_importer_active) async_importer->cancel();
4112  break;
4113  }
4114  if (is_async_importer_active) {
4115  async_importer->syncRecv();
4116  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4117  }
4118  } else {
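// Non-overlapped path: complete the halo exchange first, then compute the
// residual in a single pass.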
4119  if (is_async_importer_active)
4120  async_importer->syncExchange(YY);
4121  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4122  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4123  }
4124  }
4125  }
4126  }
4127 
4128  // pmv := inv(D) pmv.
4129  {
4130  solve_tridiags.run(YY, W);
4131  }
4132  {
4133  if (is_norm_manager_active) {
4134  // the damped update y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1, was applied in solve_tridiags; reduce its per-pack norm contributions from W here.
4135  reduceVector<MatrixType>(W, norm_manager.getBuffer());
4136  if (sweep + 1 == max_num_sweeps) {
4137  norm_manager.ireduce(sweep, true);
4138  norm_manager.checkDone(sweep + 1, tolerance, true);
4139  } else {
4140  norm_manager.ireduce(sweep);
4141  }
4142  }
4143  }
4144  is_y_zero = false;
4145  }
4146 
4147  // sqrt the norms for the caller's use.
4148  if (is_norm_manager_active) norm_manager.finalize();
4149 
4150  return sweep;
4151  }
4152 
4153 
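// ImplObject bundles the persistent state built during the symbolic/numeric setup
// phases and reused across applyInverseJacobi calls by the enclosing container.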
4154  template<typename MatrixType>
4155  struct ImplObject {
4156  using impl_type = ImplType<MatrixType>;
4157  using part_interface_type = PartInterface<MatrixType>;
4158  using block_tridiags_type = BlockTridiags<MatrixType>;
4159  using amd_type = AmD<MatrixType>;
4160  using norm_manager_type = NormManager<MatrixType>;
4161  using async_import_type = AsyncableImport<MatrixType>;
4162 
4163  // distributed objects
4164  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
4165  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
4166  Teuchos::RCP<async_import_type> async_importer;
4167  bool overlap_communication_and_computation;
4168 
4169  // copy of Y (mutable to penetrate const)
4170  mutable typename impl_type::tpetra_multivector_type Z;
4171  mutable typename impl_type::impl_scalar_type_1d_view W;
4172 
4173  // local objects
4174  part_interface_type part_interface;
4175  block_tridiags_type block_tridiags; // D
4176  amd_type a_minus_d; // R = A - D
4177  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4178  mutable norm_manager_type norm_manager;
4179  };
4180 
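// A rough, illustrative sketch of how the free functions in this namespace are
// typically strung together by the container that owns an ImplObject. The variable
// names and parameter values below are hypothetical; argument lists follow the
// declarations in this file:
//
//   auto tpetra_importer = createBlockCrsTpetraImporter<MatrixType>(A);
//   auto async_importer  = createBlockCrsAsyncImporter<MatrixType>(A);
//   auto interf = createPartInterface<MatrixType>(A, partitions);
//   auto btdm   = createBlockTridiags<MatrixType>(interf);
//   AmD<MatrixType> amd;
//   performSymbolicPhase<MatrixType>(A, interf, btdm, amd, overlap_communication_and_computation);
//   performNumericPhase<MatrixType>(A, interf, btdm, /*tiny =*/ tiny);
//   const int sweeps =
//     applyInverseJacobi<MatrixType>(A, tpetra_importer, async_importer,
//                                    overlap_communication_and_computation,
//                                    X, Y, Z, W, interf, btdm, amd, work, norm_manager,
//                                    damping_factor, /*is_y_zero =*/ true,
//                                    max_num_sweeps, tol, check_tol_every);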
4181  } // namespace BlockTriDiContainerDetails
4182 
4183 } // namespace Ifpack2
4184 
4185 #endif