34 #include "Teuchos_BLAS.hpp" 37 #include "Teuchos_Time.hpp" 38 #include "Teuchos_CommandLineProcessor.hpp" 48 Teuchos::BLAS<int,double> blas;
50 std::vector<double>
A(m*k),
B(k*n),
C(m*n);
51 for (
unsigned int j=0; j<k; j++)
52 for (
unsigned int i=0;
i<m;
i++)
54 for (
unsigned int j=0; j<n; j++)
55 for (
unsigned int i=0;
i<k;
i++)
57 for (
unsigned int j=0; j<n; j++)
58 for (
unsigned int i=0;
i<m;
i++)
60 double alpha = urand.
number();
61 double beta = urand.
number();
63 Teuchos::Time timer(
"Teuchos Double GEMM",
false);
65 for (
unsigned int j=0; j<nloop; j++) {
66 blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, &
A[0], m,
67 &
B[0], k, beta, &
C[0], m);
71 return timer.totalElapsedTime() / nloop;
78 Teuchos::BLAS<int,double> blas;
80 std::vector<double>
A(m*n),
B(n),
C(m);
81 for (
unsigned int j=0; j<n; j++) {
82 for (
unsigned int i=0;
i<m;
i++)
86 for (
unsigned int i=0;
i<m;
i++)
88 double alpha = urand.
number();
89 double beta = urand.
number();
91 Teuchos::Time timer(
"Teuchos Double GEMV",
false);
93 for (
unsigned int j=0; j<nloop; j++) {
94 blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &
A[0], m, &
B[0], 1, beta, &
C[0], 1);
98 return timer.totalElapsedTime() / nloop;
105 Teuchos::BLAS<int,double> blas;
107 std::vector<double> X(m), Y(m);
108 for (
unsigned int i=0;
i<m;
i++) {
113 Teuchos::Time timer(
"Teuchos Double DOT",
false);
116 for (
unsigned int j=0; j<nloop; j++) {
117 z += blas.DOT(m, &X[0], 1, &Y[0], 1);
121 return timer.totalElapsedTime() / nloop;
124 template <
typename FadType>
127 unsigned int ndot,
unsigned int nloop)
130 Teuchos::BLAS<int,FadType> blas;
132 std::vector<FadType>
A(m*k),
B(k*n),
C(m*n);
133 for (
unsigned int j=0; j<k; j++) {
134 for (
unsigned int i=0;
i<m;
i++) {
136 for (
unsigned int l=0; l<ndot; l++)
140 for (
unsigned int j=0; j<n; j++) {
141 for (
unsigned int i=0;
i<k;
i++) {
143 for (
unsigned int l=0; l<ndot; l++)
147 for (
unsigned int j=0; j<n; j++) {
148 for (
unsigned int i=0;
i<m;
i++) {
150 for (
unsigned int l=0; l<ndot; l++)
156 for (
unsigned int l=0; l<ndot; l++) {
157 alpha.fastAccessDx(l) = urand.
number();
158 beta.fastAccessDx(l) = urand.
number();
161 Teuchos::Time timer(
"Teuchos Fad GEMM",
false);
163 for (
unsigned int j=0; j<nloop; j++) {
164 blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, &
A[0], m,
165 &
B[0], k, beta, &
C[0], m);
169 return timer.totalElapsedTime() / nloop;
172 template <
typename FadType>
178 Teuchos::BLAS<int,FadType> blas;
180 std::vector<FadType>
A(m*n),
B(n),
C(m);
181 for (
unsigned int j=0; j<n; j++) {
182 for (
unsigned int i=0;
i<m;
i++) {
185 for (
unsigned int k=0; k<ndot; k++)
189 for (
unsigned int k=0; k<ndot; k++)
192 for (
unsigned int i=0;
i<m;
i++) {
194 for (
unsigned int k=0; k<ndot; k++)
199 for (
unsigned int k=0; k<ndot; k++) {
200 alpha.fastAccessDx(k) = urand.
number();
201 beta.fastAccessDx(k) = urand.
number();
204 Teuchos::Time timer(
"Teuchos Fad GEMV",
false);
206 for (
unsigned int j=0; j<nloop; j++) {
207 blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &
A[0], m, &
B[0], 1, beta, &
C[0], 1);
211 return timer.totalElapsedTime() / nloop;
214 template <
typename FadType>
219 Teuchos::BLAS<int,FadType> blas;
221 std::vector<FadType> X(m), Y(m);
222 for (
unsigned int i=0;
i<m;
i++) {
225 for (
unsigned int k=0; k<ndot; k++) {
226 X[
i].fastAccessDx(k) = urand.
number();
227 Y[
i].fastAccessDx(k) = urand.
number();
231 Teuchos::Time timer(
"Teuchos Fad DOT",
false);
233 for (
unsigned int j=0; j<nloop; j++) {
234 FadType z = blas.DOT(m, &X[0], 1, &Y[0], 1);
238 return timer.totalElapsedTime() / nloop;
241 template <
typename FadType>
244 unsigned int ndot,
unsigned int nloop,
bool use_dynamic)
247 unsigned int sz = (m*k+k*n+m*n)*(1+ndot);
248 Teuchos::BLAS<int,FadType> blas(
false,use_dynamic,sz);
252 for (
unsigned int j=0; j<k; j++) {
253 for (
unsigned int i=0;
i<m;
i++) {
255 for (
unsigned int l=0; l<ndot; l++)
259 for (
unsigned int j=0; j<n; j++) {
260 for (
unsigned int i=0;
i<k;
i++) {
262 for (
unsigned int l=0; l<ndot; l++)
266 for (
unsigned int j=0; j<n; j++) {
267 for (
unsigned int i=0;
i<m;
i++) {
269 for (
unsigned int l=0; l<ndot; l++)
275 for (
unsigned int l=0; l<ndot; l++) {
276 alpha.fastAccessDx(l) = urand.
number();
277 beta.fastAccessDx(l) = urand.
number();
280 Teuchos::Time timer(
"Teuchos Fad GEMM",
false);
282 for (
unsigned int j=0; j<nloop; j++) {
283 blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, &
A[0], m,
284 &
B[0], k, beta, &
C[0], m);
288 return timer.totalElapsedTime() / nloop;
291 template <
typename FadType>
294 unsigned int nloop,
bool use_dynamic)
297 unsigned int sz = m*n*(1+ndot) + 2*n*(1+ndot);
298 Teuchos::BLAS<int,FadType> blas(
false,use_dynamic,sz);
301 for (
unsigned int j=0; j<n; j++) {
302 for (
unsigned int i=0;
i<m;
i++) {
305 for (
unsigned int k=0; k<ndot; k++)
309 for (
unsigned int k=0; k<ndot; k++)
312 for (
unsigned int i=0;
i<m;
i++) {
314 for (
unsigned int k=0; k<ndot; k++)
319 for (
unsigned int k=0; k<ndot; k++) {
320 alpha.fastAccessDx(k) = urand.
number();
321 beta.fastAccessDx(k) = urand.
number();
324 Teuchos::Time timer(
"Teuchos Fad GEMV",
false);
326 for (
unsigned int j=0; j<nloop; j++) {
327 blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &
A[0], m, &
B[0], 1, beta, &
C[0], 1);
331 return timer.totalElapsedTime() / nloop;
334 template <
typename FadType>
337 unsigned int nloop,
bool use_dynamic)
340 unsigned int sz = 2*m*(1+ndot);
341 Teuchos::BLAS<int,FadType> blas(
false,use_dynamic,sz);
344 for (
unsigned int i=0;
i<m;
i++) {
347 for (
unsigned int k=0; k<ndot; k++) {
348 X[
i].fastAccessDx(k) = urand.
number();
349 Y[
i].fastAccessDx(k) = urand.
number();
353 Teuchos::Time timer(
"Teuchos Fad DOT",
false);
355 for (
unsigned int j=0; j<nloop; j++) {
356 FadType z = blas.DOT(m, &X[0], 1, &Y[0], 1);
360 return timer.totalElapsedTime() / nloop;
363 int main(
int argc,
char* argv[]) {
372 Teuchos::CommandLineProcessor clp;
373 clp.setDocString(
"This program tests the speed of differentiating BLAS routines using Fad");
375 clp.setOption(
"m", &m,
"Number of rows");
377 clp.setOption(
"n", &n,
"Number of columns");
379 clp.setOption(
"k", &k,
"Number of columns for GEMM");
381 clp.setOption(
"ndot", &ndot,
"Number of derivative components");
383 clp.setOption(
"nloop", &nloop,
"Number of loops");
385 clp.setOption(
"dynamic", &dynamic,
"Use dynamic allocation");
388 Teuchos::CommandLineProcessor::EParseCommandLineReturn
389 parseReturn= clp.parse(argc, argv);
390 if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
392 bool use_dynamic = (dynamic != 0);
394 std::cout.setf(std::ios::scientific);
395 std::cout.precision(
p);
396 std::cout <<
"Times (sec) for m = " << m <<
", n = " << n
397 <<
", ndot = " << ndot <<
", nloop = " << nloop
398 <<
", dynamic = " << use_dynamic <<
": " 402 std::cout <<
"GEMM: " << std::setw(w) << tb << std::endl;
404 t = do_time_sacado_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop,use_dynamic);
405 std::cout <<
"Sacado DVFad GEMM: " << std::setw(w) << t <<
"\t" 406 << std::setw(w) << t/tb << std::endl;
408 t = do_time_sacado_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop,use_dynamic);
409 std::cout <<
"Sacado DFad GEMM: " << std::setw(w) << t <<
"\t" 410 << std::setw(w) << t/tb << std::endl;
412 t = do_time_teuchos_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop);
413 std::cout <<
"Teuchos DFad GEMM: " << std::setw(w) << t <<
"\t" 414 << std::setw(w) << t/tb << std::endl;
420 t = do_time_teuchos_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop);
421 std::cout <<
"Teuchos DVFad GEMM: " << std::setw(w) << t <<
"\t" 422 << std::setw(w) << t/tb << std::endl;
424 std::cout << std::endl;
427 std::cout <<
"GEMV: " << std::setw(w) << tb << std::endl;
429 t = do_time_sacado_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10,use_dynamic);
430 std::cout <<
"Sacado DVFad GEMV: " << std::setw(w) << t <<
"\t" 431 << std::setw(w) << t/tb << std::endl;
433 t = do_time_sacado_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10,use_dynamic);
434 std::cout <<
"Sacado DFad GEMV: " << std::setw(w) << t <<
"\t" 435 << std::setw(w) << t/tb << std::endl;
437 t = do_time_teuchos_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10);
438 std::cout <<
"Teuchos DFad GEMV: " << std::setw(w) << t <<
"\t" 439 << std::setw(w) << t/tb << std::endl;
445 t = do_time_teuchos_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10);
446 std::cout <<
"Teuchos DVFad GEMV: " << std::setw(w) << t <<
"\t" 447 << std::setw(w) << t/tb << std::endl;
449 std::cout << std::endl;
452 std::cout <<
"DOT: " << std::setw(w) << tb << std::endl;
454 t = do_time_sacado_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100,use_dynamic);
455 std::cout <<
"Sacado DVFad DOT: " << std::setw(w) << t <<
"\t" 456 << std::setw(w) << t/tb << std::endl;
458 t = do_time_sacado_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100,use_dynamic);
459 std::cout <<
"Sacado DFad DOT: " << std::setw(w) << t <<
"\t" 460 << std::setw(w) << t/tb << std::endl;
462 t = do_time_teuchos_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100);
463 std::cout <<
"Teuchos DFad DOT: " << std::setw(w) << t <<
"\t" 464 << std::setw(w) << t/tb << std::endl;
470 t = do_time_teuchos_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100);
471 std::cout <<
"Teuchos DVFad DOT: " << std::setw(w) << t <<
"\t" 472 << std::setw(w) << t/tb << std::endl;
475 catch (std::exception& e) {
476 std::cout << e.what() << std::endl;
479 catch (
const char *s) {
480 std::cout << s << std::endl;
484 std::cout <<
"Caught unknown exception!" << std::endl;
int main(int argc, char *argv[])
double do_time_teuchos_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, unsigned int nloop)
Sacado::Fad::DFad< double > FadType
double do_time_teuchos_double_gemv(unsigned int m, unsigned int n, unsigned int nloop)
ScalarT number()
Get random number.
A class for storing a contiguously allocated array of Fad objects. This is a general definition that ...
double do_time_teuchos_double_gemm(unsigned int m, unsigned int n, unsigned int k, unsigned int nloop)
double do_time_sacado_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, unsigned int nloop, bool use_dynamic)
double do_time_sacado_fad_gemm(unsigned int m, unsigned int n, unsigned int k, unsigned int ndot, unsigned int nloop, bool use_dynamic)
double do_time_teuchos_double_dot(unsigned int m, unsigned int nloop)
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
double do_time_sacado_fad_dot(unsigned int m, unsigned int ndot, unsigned int nloop, bool use_dynamic)
double do_time_teuchos_fad_dot(unsigned int m, unsigned int ndot, unsigned int nloop)
double do_time_teuchos_fad_gemm(unsigned int m, unsigned int n, unsigned int k, unsigned int ndot, unsigned int nloop)