30 #ifndef SACADO_DYNAMICARRAYTRAITS_HPP 31 #define SACADO_DYNAMICARRAYTRAITS_HPP 38 #if defined(HAVE_SACADO_KOKKOSCORE) 39 #include "Kokkos_Core.hpp" 40 #if defined(KOKKOS_ENABLE_CUDA) 41 #include "Cuda/Kokkos_Cuda_Vectorization.hpp" 43 #if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) 44 #include "Kokkos_MemoryPool.hpp" 50 template <
typename ExecSpace>
52 ,
const size_t min_total_alloc_size
53 ,
const uint32_t min_block_alloc_size
54 ,
const uint32_t max_block_alloc_size
55 ,
const uint32_t min_superblock_size
58 template <
typename ExecSpace>
61 #if 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(KOKKOS_ENABLE_OPENMP) 63 extern const Kokkos::MemoryPool<Kokkos::OpenMP>* global_sacado_openmp_memory_pool;
68 ,
const size_t min_total_alloc_size
69 ,
const uint32_t min_block_alloc_size
70 ,
const uint32_t max_block_alloc_size
71 ,
const uint32_t min_superblock_size
74 typedef Kokkos::MemoryPool<Kokkos::OpenMP> pool_t;
75 Impl::global_sacado_openmp_memory_pool =
76 new pool_t(
typename Kokkos::OpenMP::memory_space(),
85 delete Impl::global_sacado_openmp_memory_pool;
89 #if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__) 93 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_host;
94 extern const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_device;
95 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE 96 extern __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device;
98 __device__
const Kokkos::MemoryPool<Kokkos::Cuda>* global_sacado_cuda_memory_pool_on_device = 0;
101 struct SetMemoryPoolPtr {
102 Kokkos::MemoryPool<Kokkos::Cuda>* pool_device;
103 __device__
inline void operator()(
int)
const {
104 global_sacado_cuda_memory_pool_on_device = pool_device;
114 ,
const size_t min_total_alloc_size
115 ,
const uint32_t min_block_alloc_size
116 ,
const uint32_t max_block_alloc_size
117 ,
const uint32_t min_superblock_size
120 typedef Kokkos::MemoryPool<Kokkos::Cuda> pool_t;
122 new pool_t(
typename Kokkos::Cuda::memory_space(),
123 min_total_alloc_size,
124 min_block_alloc_size,
125 max_block_alloc_size,
126 min_superblock_size);
127 Impl::SetMemoryPoolPtr f;
128 CUDA_SAFE_CALL( cudaMalloc( &f.pool_device,
sizeof(pool_t) ) );
129 CUDA_SAFE_CALL( cudaMemcpy( f.pool_device, pool,
131 cudaMemcpyHostToDevice ) );
132 Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,1),f);
133 Impl::global_sacado_cuda_memory_pool_host = pool;
134 Impl::global_sacado_cuda_memory_pool_device = f.pool_device;
139 CUDA_SAFE_CALL( cudaFree( (
void*) Impl::global_sacado_cuda_memory_pool_device ) );
140 delete Impl::global_sacado_cuda_memory_pool_host;
145 #if !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__) 150 __device__
inline int warpLane(
const int warp_size = 32) {
151 return ( threadIdx.x + (threadIdx.y + threadIdx.z*blockDim.y)*blockDim.x ) % warp_size;
155 template <
typename T>
156 __device__
inline T warpReduce(
T y,
const int warp_size = 32) {
157 for (
int i=1;
i<warp_size;
i*=2) {
158 y += Kokkos::shfl_down(
y,
i, warp_size);
160 y = Kokkos::shfl(
y, 0, warp_size);
165 template <
typename T>
166 __device__
inline int warpScan(
T y,
const int warp_size = 32) {
167 const int lane = warpLane();
168 y = Kokkos::shfl_up(
y, 1, warp_size);
171 for (
int i=1;
i<warp_size;
i*=2) {
172 T t = Kokkos::shfl_up(
y,
i, warp_size);
179 template <
typename T>
180 __device__
inline T warpBcast(
T y,
int id,
const int warp_size = 32) {
181 return Kokkos::shfl(
y,
id, warp_size);
190 template <
typename T>
193 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ ) 196 CUDA_SAFE_CALL( cudaMallocManaged( (
void**) &m, sz*
sizeof(
T), cudaMemAttachGlobal ) );
197 #elif defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__) 200 const int total_sz = warpReduce(sz);
201 const int lane = warpLane();
202 if (total_sz > 0 && lane == 0) {
203 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
205 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
209 #elif 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP) 212 if (global_sacado_openmp_memory_pool != 0) {
213 m =
static_cast<T*
>(global_sacado_openmp_memory_pool->allocate(sz*
sizeof(
T)));
215 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
218 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
223 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
224 #if defined(HAVE_SACADO_KOKKOSCORE) 226 Kokkos::abort(
"Allocation failed.");
233 template <
typename T>
236 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_ENABLE_CUDA_UVM) && !defined( __CUDA_ARCH__ ) 238 CUDA_SAFE_CALL( cudaFree(m) );
239 #elif defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__) 240 const int total_sz = warpReduce(sz);
241 const int lane = warpLane();
242 if (total_sz > 0 && lane == 0) {
243 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
245 #elif 0 && defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) && defined(KOKKOS_ENABLE_OPENMP) 247 if (global_sacado_openmp_memory_pool != 0)
248 global_sacado_openmp_memory_pool->deallocate((
void*) m, sz*
sizeof(
T));
250 operator delete((
void*) m);
254 operator delete((
void*) m);
268 static T*
get(
int sz) {
269 T* m = Impl::ds_alloc<T>(sz);
271 for (
int i=0;
i<sz; ++
i)
279 T* m = Impl::ds_alloc<T>(sz);
281 for (
int i=0;
i<sz; ++
i)
292 T* m = Impl::ds_alloc<T>(sz);
294 for (
int i=0;
i<sz; ++
i)
295 new (
p++)
T(*(src++));
305 T* m = Impl::ds_alloc<T>(sz);
307 for (
int i=0;
i<sz; ++
i) {
316 static void copy(
const T* src,
T* dest,
int sz) {
317 for (
int i=0;
i<sz; ++
i)
318 *(dest++) = *(src++);
324 T* dest,
int dest_stride,
int sz) {
325 for (
int i=0;
i<sz; ++
i) {
335 for (
int i=0;
i<sz; ++
i)
342 for (
int i=0;
i<sz; ++
i) {
352 for (
T* b = m; b!=e; b++)
358 #if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__) 362 template <
typename T>
364 static T* ds_strided_alloc(
const int sz) {
370 if (blockDim.x == 32) {
372 const int lane = threadIdx.x;
373 if (sz > 0 && lane == 0) {
374 #if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) 375 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(sz*
sizeof(
T)));
377 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
379 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
380 #if defined(HAVE_SACADO_KOKKOSCORE) 382 Kokkos::abort(
"Allocation failed.");
386 m = warpBcast(m,0,blockDim.x);
390 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
391 #if defined(HAVE_SACADO_KOKKOSCORE) 393 Kokkos::abort(
"Allocation failed.");
401 template <
typename T>
403 static void ds_strided_free(
T* m,
int sz) {
404 if (blockDim.x == 32) {
406 const int lane = threadIdx.x;
407 if (sz > 0 && lane == 0) {
408 #if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) 409 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, sz*
sizeof(
T));
411 operator delete((
void*) m);
417 operator delete((
void*) m);
428 template <
typename T>
429 struct ds_array<
T,
true> {
433 static T*
get(
int sz) {
434 T* m = Impl::ds_strided_alloc<T>(sz);
441 T* m = Impl::ds_strided_alloc<T>(sz);
442 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
453 T* m = Impl::ds_strided_alloc<T>(sz);
454 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
465 T* m = Impl::ds_strided_alloc<T>(sz);
466 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
467 m[
i] = src[
i*stride];
473 static void copy(
const T* src,
T* dest,
int sz) {
474 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
481 T* dest,
int dest_stride,
int sz) {
482 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
483 dest[
i*dest_stride] = src[
i*src_stride];
489 static void zero(
T* dest,
int sz) {
490 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x)
497 for (
int i=threadIdx.x;
i<sz;
i+=blockDim.x) {
498 dest[
i*stride] =
T(0.);
505 Impl::ds_strided_free(m, sz);
509 #elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__) 513 template <
typename T>
515 static T* ds_strided_alloc(
const int sz) {
521 if (blockDim.x == 32) {
524 const int total_sz = warpReduce(sz, blockDim.x);
525 const int lane = threadIdx.x;
526 if (total_sz > 0 && lane == 0) {
527 #if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) 528 m =
static_cast<T*
>(global_sacado_cuda_memory_pool_on_device->allocate(total_sz*
sizeof(
T)));
530 Kokkos::abort(
"Allocation failed. Kokkos memory pool is out of memory");
532 m =
static_cast<T*
>(
operator new(total_sz*
sizeof(
T)));
533 #if defined(HAVE_SACADO_KOKKOSCORE) 535 Kokkos::abort(
"Allocation failed.");
539 m = warpBcast(m,0,blockDim.x);
544 m =
static_cast<T*
>(
operator new(sz*
sizeof(
T)));
545 #if defined(HAVE_SACADO_KOKKOSCORE) 547 Kokkos::abort(
"Allocation failed.");
555 template <
typename T>
557 static void ds_strided_free(
T* m,
int sz) {
558 if (blockDim.x == 32) {
561 const int total_sz = warpReduce(sz, blockDim.x);
562 const int lane = threadIdx.x;
563 if (total_sz > 0 && lane == 0) {
564 #if defined(HAVE_SACADO_KOKKOSCORE) && defined(SACADO_KOKKOS_USE_MEMORY_POOL) 565 global_sacado_cuda_memory_pool_on_device->deallocate((
void*) m, total_sz*
sizeof(
T));
567 operator delete((
void*) m);
573 operator delete((
void*) m);
582 template <
typename T>
583 struct ds_array<
T,
true> {
587 static T*
get(
int sz) {
588 T* m = Impl::ds_strided_alloc<T>(sz);
595 T* m = Impl::ds_strided_alloc<T>(sz);
596 for (
int i=0;
i<sz; ++
i)
597 m[
i*blockDim.x] = 0.0;
607 T* m = Impl::ds_strided_alloc<T>(sz);
608 for (
int i=0;
i<sz; ++
i)
609 m[
i*blockDim.x] = src[
i*blockDim.x];
619 T* m = Impl::ds_strided_alloc<T>(sz);
620 for (
int i=0;
i<sz; ++
i)
621 m[
i*blockDim.x] = src[
i*stride];
627 static void copy(
const T* src,
T* dest,
int sz) {
628 for (
int i=0;
i<sz; ++
i)
629 dest[
i*blockDim.x] = src[
i*blockDim.x];
635 T* dest,
int dest_stride,
int sz) {
636 for (
int i=0;
i<sz; ++
i) {
645 static void zero(
T* dest,
int sz) {
646 for (
int i=0;
i<sz; ++
i)
647 dest[
i*blockDim.x] =
T(0.);
653 for (
int i=0;
i<sz; ++
i) {
662 Impl::ds_strided_free(m, sz);
672 template <
typename T>
677 static T*
get(
int sz) {
678 T* m = Impl::ds_alloc<T>(sz);
685 T* m = Impl::ds_alloc<T>(sz);
687 for (
int i=0;
i<sz; ++
i)
691 std::memset(m,0,sz*
sizeof(
T));
702 T* m = Impl::ds_alloc<T>(sz);
703 for (
int i=0;
i<sz; ++
i)
714 T* m = Impl::ds_alloc<T>(sz);
715 for (
int i=0;
i<sz; ++
i)
716 m[
i] = src[
i*stride];
722 static void copy(
const T* src,
T* dest,
int sz) {
723 if (sz > 0 && dest != NULL && src != NULL)
725 for (
int i=0;
i<sz; ++
i)
728 std::memcpy(dest,src,sz*
sizeof(
T));
735 T* dest,
int dest_stride,
int sz) {
736 for (
int i=0;
i<sz; ++
i) {
746 if (sz > 0 && dest != NULL)
748 for (
int i=0;
i<sz; ++
i)
751 std::memset(dest,0,sz*
sizeof(
T));
758 for (
int i=0;
i<sz; ++
i) {
775 #endif // SACADO_DYNAMICARRAY_HPP
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void zero(T *dest, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
static SACADO_INLINE_FUNCTION void strided_copy(const T *src, int src_stride, T *dest, int dest_stride, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION void ds_free(T *m, int sz)
static SACADO_INLINE_FUNCTION void destroy_and_release(T *m, int sz)
Destroy array elements and release memory.
static SACADO_INLINE_FUNCTION T * ds_alloc(const int sz)
static SACADO_INLINE_FUNCTION void copy(const T *src, T *dest, int sz)
Copy array from src to dest of length sz.
static SACADO_INLINE_FUNCTION T * strided_get_and_fill(const T *src, int stride, int sz)
Get memory for new array of length sz and fill with entries from src.
void destroyGlobalMemoryPool(const ExecSpace &space)
#define SACADO_INLINE_FUNCTION
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.
static SACADO_INLINE_FUNCTION T * get_and_fill(const T *src, int sz)
Get memory for new array of length sz and fill with entries from src.
Dynamic array allocation class that works for any type.
static SACADO_INLINE_FUNCTION void strided_zero(T *dest, int stride, int sz)
Zero out array dest of length sz.
static SACADO_INLINE_FUNCTION T * get_and_fill(int sz)
Get memory for new array of length sz and fill with zeros.