Compadre  1.3.3
Compadre_LinearAlgebra.cpp
Go to the documentation of this file.
2 #include "Compadre_Functors.hpp"
3 
4 #include "KokkosBatched_Copy_Decl.hpp"
5 #include "KokkosBatched_ApplyPivot_Decl.hpp"
6 #include "KokkosBatched_Gemv_Decl.hpp"
7 #include "KokkosBatched_Trsv_Decl.hpp"
8 #include "KokkosBatched_UTV_Decl.hpp"
9 #include "KokkosBatched_SolveUTV_Decl_Compadre.hpp"
10 
11 using namespace KokkosBatched;
12 
13 namespace Compadre{
14 namespace GMLS_LinearAlgebra {
15 
16  template<typename DeviceType,
17  typename AlgoTagType,
18  typename MatrixViewType_A,
19  typename MatrixViewType_B,
20  typename MatrixViewType_X>
22  MatrixViewType_A _a;
23  MatrixViewType_B _b;
24 
27  int _M, _N, _NRHS;
28 
29  KOKKOS_INLINE_FUNCTION
31  const int M,
32  const int N,
33  const int NRHS,
34  const MatrixViewType_A &a,
35  const MatrixViewType_B &b)
36  : _a(a), _b(b), _M(M), _N(N), _NRHS(NRHS) { _pm_getTeamScratchLevel_0 = 0; _pm_getTeamScratchLevel_1 = 0; }
37 
38  template<typename MemberType>
39  KOKKOS_INLINE_FUNCTION
40  void operator()(const MemberType &member) const {
41 
42  const int k = member.league_rank();
43 
44  // workspace vectors
45  scratch_vector_type ww_fast(member.team_scratch(_pm_getTeamScratchLevel_0), 3*_M);
46  scratch_vector_type ww_slow(member.team_scratch(_pm_getTeamScratchLevel_1), _N*_NRHS);
47 
48  scratch_matrix_right_type aa(_a.data() + TO_GLOBAL(k)*TO_GLOBAL(_a.extent(1))*TO_GLOBAL(_a.extent(2)),
49  _a.extent(1), _a.extent(2));
50  scratch_matrix_right_type bb(_b.data() + TO_GLOBAL(k)*TO_GLOBAL(_b.extent(1))*TO_GLOBAL(_b.extent(2)),
51  _b.extent(1), _b.extent(2));
52  scratch_matrix_right_type xx(_b.data() + TO_GLOBAL(k)*TO_GLOBAL(_b.extent(1))*TO_GLOBAL(_b.extent(2)),
53  _b.extent(1), _b.extent(2));
54 
55  // if sizes don't match extents, then copy to a view with extents matching sizes
56  if ((size_t)_M!=_a.extent(1) || (size_t)_N!=_a.extent(2)) {
57  scratch_matrix_right_type tmp(ww_slow.data(), _M, _N);
58  auto aaa = scratch_matrix_right_type(_a.data() + TO_GLOBAL(k)*TO_GLOBAL(_a.extent(1))*TO_GLOBAL(_a.extent(2)), _M, _N);
59  // copy A to W, then back to A
60  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,_M),[&](const int &i) {
61  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member,0,_N),[&](const int &j) {
62  tmp(i,j) = aa(i,j);
63  });
64  });
65  member.team_barrier();
66  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,_M),[&](const int &i) {
67  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member,0,_N),[&](const int &j) {
68  aaa(i,j) = tmp(i,j);
69  });
70  });
71  member.team_barrier();
72  aa = aaa;
73  }
74 
75  if (std::is_same<typename MatrixViewType_B::array_layout, layout_left>::value) {
76  scratch_matrix_right_type tmp(ww_slow.data(), _N, _NRHS);
77  // coming from LU
78  // then copy B to W, then back to B
79  auto bb_left =
80  scratch_matrix_left_type(_b.data() + TO_GLOBAL(k)*TO_GLOBAL(_b.extent(1))*TO_GLOBAL(_b.extent(2)),
81  _b.extent(1), _b.extent(2));
82  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,_N),[&](const int &i) {
83  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member,0,_NRHS),[&](const int &j) {
84  tmp(i,j) = bb_left(i,j);
85  });
86  });
87  member.team_barrier();
88  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,_N),[&](const int &i) {
89  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member,0,_NRHS),[&](const int &j) {
90  bb(i,j) = tmp(i,j);
91  });
92  });
93  }
94 
95  scratch_matrix_right_type uu(member.team_scratch(_pm_getTeamScratchLevel_1), _M, _N /* only N columns of U are filled, maximum */);
96  scratch_matrix_right_type vv(member.team_scratch(_pm_getTeamScratchLevel_1), _N, _N);
97  scratch_local_index_type pp(member.team_scratch(_pm_getTeamScratchLevel_0), _N);
98 
99  bool do_print = false;
100  if (do_print) {
101  Kokkos::single(Kokkos::PerTeam(member), [&] () {
102  //print a
103  printf("a=zeros(%lu,%lu);\n", aa.extent(0), aa.extent(1));
104  for (size_t j=0; j<aa.extent(0); ++j) {
105  for (size_t k=0; k<aa.extent(1); ++k) {
106  printf("a(%lu,%lu)= %f;\n", j+1,k+1, aa(j,k));
107  }
108  }
109  //print b
110  printf("b=zeros(%lu,%lu);\n", bb.extent(0), bb.extent(1));
111  for (size_t j=0; j<bb.extent(0); ++j) {
112  for (size_t k=0; k<bb.extent(1); ++k) {
113  printf("b(%lu,%lu)= %f;\n", j+1,k+1, bb(j,k));
114  }
115  }
116  });
117  }
118  do_print = false;
119 
120  /// Solving Ax = b using UTV transformation
121  /// A P^T P x = b
122  /// UTV P x = b;
123 
124  /// UTV = A P^T
125  int matrix_rank(0);
126  member.team_barrier();
127  TeamVectorUTV<MemberType,AlgoTagType>
128  ::invoke(member, aa, pp, uu, vv, ww_fast, matrix_rank);
129  member.team_barrier();
130 
131  if (do_print) {
132  Kokkos::single(Kokkos::PerTeam(member), [&] () {
133  printf("matrix_rank: %d\n", matrix_rank);
134  //print u
135  printf("u=zeros(%lu,%lu);\n", uu.extent(0), uu.extent(1));
136  for (size_t j=0; j<uu.extent(0); ++j) {
137  for (size_t k=0; k<uu.extent(1); ++k) {
138  printf("u(%lu,%lu)= %f;\n", j+1,k+1, uu(j,k));
139  }
140  }
141  });
142  }
143  TeamVectorSolveUTVCompadre<MemberType,AlgoTagType>
144  ::invoke(member, matrix_rank, _M, _N, _NRHS, uu, aa, vv, pp, bb, xx, ww_slow, ww_fast);
145  member.team_barrier();
146 
147  }
148 
149  inline
150  void run(ParallelManager pm) {
151  typedef typename MatrixViewType_A::non_const_value_type value_type;
152  std::string name_region("KokkosBatched::Test::TeamVectorSolveUTVCompadre");
153  std::string name_value_type = ( std::is_same<value_type,float>::value ? "::Float" :
154  std::is_same<value_type,double>::value ? "::Double" :
155  std::is_same<value_type,Kokkos::complex<float> >::value ? "::ComplexFloat" :
156  std::is_same<value_type,Kokkos::complex<double> >::value ? "::ComplexDouble" : "::UnknownValueType" );
157  std::string name = name_region + name_value_type;
158  Kokkos::Profiling::pushRegion( name.c_str() );
159 
160  _pm_getTeamScratchLevel_0 = pm.getTeamScratchLevel(0);
161  _pm_getTeamScratchLevel_1 = pm.getTeamScratchLevel(1);
162 
163  int scratch_size = scratch_matrix_right_type::shmem_size(_N, _N); // V
164  scratch_size += scratch_matrix_right_type::shmem_size(_M, _N /* only N columns of U are filled, maximum */); // U
165  scratch_size += scratch_vector_type::shmem_size(_N*_NRHS); // W (for SolveUTV)
166 
167  int l0_scratch_size = scratch_vector_type::shmem_size(_N); // P (temporary)
168  l0_scratch_size += scratch_vector_type::shmem_size(3*_M); // W (for UTV)
169 
170  pm.clearScratchSizes();
171  pm.setTeamScratchSize(0, l0_scratch_size);
172  pm.setTeamScratchSize(1, scratch_size);
173 
174  pm.CallFunctorWithTeamThreadsAndVectors(*this, _a.extent(0));
175  Kokkos::fence();
176 
177  Kokkos::Profiling::popRegion();
178  }
179  };
180 
181 
182 
183 template <typename A_layout, typename B_layout, typename X_layout>
184 void batchQRPivotingSolve(ParallelManager pm, double *A, int lda, int nda, double *B, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices) {
185 
186  typedef Algo::UTV::Unblocked algo_tag_type;
187  typedef Kokkos::View<double***, A_layout, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
188  MatrixViewType_A;
189  typedef Kokkos::View<double***, B_layout, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
190  MatrixViewType_B;
191  typedef Kokkos::View<double***, X_layout, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
192  MatrixViewType_X;
193 
194  MatrixViewType_A mat_A(A, num_matrices, lda, nda);
195  MatrixViewType_B mat_B(B, num_matrices, ldb, ndb);
196 
198  <device_execution_space, algo_tag_type, MatrixViewType_A, MatrixViewType_B, MatrixViewType_X>(M,N,NRHS,mat_A,mat_B).run(pm);
199 
200 }
201 
202 template void batchQRPivotingSolve<layout_right, layout_right, layout_right>(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
203 template void batchQRPivotingSolve<layout_right, layout_right, layout_left >(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
204 template void batchQRPivotingSolve<layout_right, layout_left , layout_right>(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
205 template void batchQRPivotingSolve<layout_right, layout_left , layout_left >(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
206 template void batchQRPivotingSolve<layout_left , layout_right, layout_right>(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
207 template void batchQRPivotingSolve<layout_left , layout_right, layout_left >(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
208 template void batchQRPivotingSolve<layout_left , layout_left , layout_right>(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
209 template void batchQRPivotingSolve<layout_left , layout_left , layout_left >(ParallelManager,double*,int,int,double*,int,int,int,int,int,const int);
210 
211 } // GMLS_LinearAlgebra
212 } // Compadre
template void batchQRPivotingSolve< layout_right, layout_left, layout_right >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
template void batchQRPivotingSolve< layout_left, layout_left, layout_left >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
KOKKOS_INLINE_FUNCTION int getTeamScratchLevel(const int level) const
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const
Calls a parallel_for parallel_for will break out over loops over teams with each vector lane executin...
template void batchQRPivotingSolve< layout_right, layout_right, layout_left >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
template void batchQRPivotingSolve< layout_right, layout_right, layout_right >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
template void batchQRPivotingSolve< layout_left, layout_right, layout_right >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const
Kokkos::DefaultExecutionSpace device_execution_space
#define TO_GLOBAL(variable)
template void batchQRPivotingSolve< layout_left, layout_right, layout_left >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
Kokkos::View< double **, layout_right, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_matrix_right_type
Kokkos::View< double *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_vector_type
KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorSolveUTV(const int M, const int N, const int NRHS, const MatrixViewType_A &a, const MatrixViewType_B &b)
Kokkos::View< double **, layout_left, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_matrix_left_type
template void batchQRPivotingSolve< layout_left, layout_left, layout_right >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
void batchQRPivotingSolve(ParallelManager pm, double *A, int lda, int nda, double *B, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices)
Solves a batch of problems with QR+Pivoting.
void setTeamScratchSize(const int level, const int value)
template void batchQRPivotingSolve< layout_right, layout_left, layout_left >(ParallelManager, double *, int, int, double *, int, int, int, int, int, const int)
Kokkos::View< int *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_local_index_type