#ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
#define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP

#if defined( __CUDACC__ )

#include "Kokkos_CrsMatrix.hpp"
#include "Kokkos_MV.hpp"
#include "Kokkos_Core.hpp"

#include "Teuchos_TestForException.hpp"

// Use warp shuffles for the intra-warp reduction when the architecture
// supports them (Kepler and later); otherwise fall back to shared memory.
#if defined( __CUDA_ARCH__ )
#  if ( __CUDA_ARCH__ >= 300 )
#    define HAVE_CUDA_SHUFFLE 1
#  else
#    define HAVE_CUDA_SHUFFLE 0
#  endif
#else
#  define HAVE_CUDA_SHUFFLE 0
#endif

//----------------------------------------------------------------------------
// Specialization of Multiply for a CrsMatrix of Sacado::UQ::PCE scalars
// times a rank-1 vector of Sacado::UQ::PCE scalars on Cuda.
//----------------------------------------------------------------------------

template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                   MatrixOrdinal,
                                   Kokkos::Cuda,
                                   MatrixMemory,
                                   MatrixSize >,
                Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage>  MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage>   InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage>  OutputVectorValue;

  typedef Kokkos::Cuda MatrixDevice;
  typedef MatrixDevice execution_space;
  typedef execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix< MatrixValue,
                             MatrixOrdinal,
                             MatrixDevice,
                             MatrixMemory,
                             MatrixSize > matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef Kokkos::View< InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;
  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;
  const size_type           BlockSize ;
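  // Kernel outline:  each CUDA block owns one FEM row of the block-CRS
  // matrix.  Strips of at most BlockSize matrix entries and the matching
  // input-vector PCE coefficients are staged in shared memory, and the
  // sparse Cijk product tensor is contracted against each strip, with
  // threadIdx.x striding over tensor entries and threadIdx.y over
  // stochastic rows of the result.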
  Multiply( const matrix_type &        A ,
            const input_vector_type &  x ,
            const output_vector_type & y ,
            const input_scalar &       a ,
            const output_scalar &      b ,
            const size_type            block_size )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )  // Cijk tensor attached to the matrix values
    , m_a( a )
    , m_b( b )
    , BlockSize( block_size )
    {}
  __device__
  void operator()(void) const
  {
    // Number of PCE coefficients (stochastic dimension)
    const size_type dim = m_tensor.dimension();

    // Partition shared memory:
    //   sh_x : BlockSize*dim input-vector coefficients
    //   sh_A : BlockSize*dim matrix coefficients
    //   sh_y : dim partial results for this row
    //   sh_t : per-thread reduction scratch (only without shuffle)
    volatile input_scalar * const sh_x =
      kokkos_impl_cuda_shared_memory<input_scalar>();
    volatile matrix_scalar * const sh_A = sh_x + BlockSize*dim;
    volatile output_scalar * const sh_y = sh_A + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
    volatile output_scalar * const sh_t = sh_y + dim;
#endif

    const size_type nid = blockDim.x * blockDim.y;
    const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
    // Zero this row's PCE accumulator in shared memory
    for ( size_type i = tid; i < dim; i += nid ) {
      sh_y[i] = 0.0;
    }
    const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
    for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
         iBlockEntry += BlockSize) {
      const size_type block_size =
        (iBlockEntryEnd-iBlockEntry < BlockSize) ?
          iBlockEntryEnd-iBlockEntry : BlockSize;
      // Wait for the previous strip's sh_x / sh_A to be consumed before
      // overwriting them.
      __syncthreads();

      // Coalesced read of this strip of x and A into shared memory
      for ( size_type col = 0; col < block_size; ++col ) {

        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        const input_scalar  * const x = & m_x( 0, iBlockColumn );
        const matrix_scalar * const A = & m_A_values( iBlockEntry + col, 0 );

        for ( size_type i = tid; i < dim; i += nid ) {
          sh_x[col + i * BlockSize] = x[i];   // m_x( i, iBlockColumn )
          sh_A[col + i * BlockSize] = A[i];   // m_A_values( iBlockEntry + col, i )
        }
      }

      __syncthreads(); // wait for x and A to be read in
      // Each warp (threadIdx.y) computes one stochastic row of the result
      // for this strip of matrix columns.
      for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
        output_scalar y = 0;

        // Product-tensor entries for stochastic row i
        const size_type lBeg = m_tensor.entry_begin( i );
        const size_type lEnd = m_tensor.entry_end( i );

        // Loop over tensor entries with the threads of a warp
        for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

          // Tensor entry: coordinates (j,k) packed into one word, plus value
          const size_type     kj = m_tensor.coord( l );
          const tensor_scalar v  = m_tensor.value( l );
          const size_type j = ( kj & 0x0ffff ) * BlockSize ;
          const size_type k = ( kj >> 16     ) * BlockSize ;

          for ( size_type col = 0; col < block_size; ++col ) {
            y += v * ( sh_A[col+j] * sh_x[col+k] +
                       sh_A[col+k] * sh_x[col+j] );
          }
        }
        // Reduce y across the warp lanes (threadIdx.x) and accumulate the
        // result for stochastic row i.
#if HAVE_CUDA_SHUFFLE
        // Warp-shuffle reduction; assumes blockDim.x is a power of two no
        // larger than the warp size.
        if ( blockDim.x >=  2 ) y += shfl_down( y,  1, blockDim.x );
        if ( blockDim.x >=  4 ) y += shfl_down( y,  2, blockDim.x );
        if ( blockDim.x >=  8 ) y += shfl_down( y,  4, blockDim.x );
        if ( blockDim.x >= 16 ) y += shfl_down( y,  8, blockDim.x );
        if ( blockDim.x >= 32 ) y += shfl_down( y, 16, blockDim.x );
        if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
        // Shared-memory reduction within the warp
        sh_t[ tid ] = y;
        if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
        if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
        if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
        if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
        if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
        if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif
      } // end loop over stochastic rows i
    }   // end loop over block entries
    // Wait for all contributions to sh_y to be complete
    __syncthreads();

    // Store the result back to global memory:  y = a * (A*x) + b * y
    if ( m_b == output_scalar(0) )
      for ( size_type i = tid; i < dim; i += nid )
        m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
    else
      for ( size_type i = tid; i < dim; i += nid )
        m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
  }
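  // Candidate launch configuration evaluated by apply():  for each admissible
  // column block size it records the shared-memory footprint, resident blocks
  // and warps, and an estimate of global tensor reads per thread; apply()
  // selects the candidate with the fewest estimated reads.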
  struct TensorReadEntry {
    size_type block_size, shmem, num_blocks, num_warp;
    double reads;
  };
  static void apply( const matrix_type &        A ,
                     const input_vector_type &  x ,
                     const output_vector_type & y ,
                     const input_scalar &       a = input_scalar(1) ,
                     const output_scalar &      b = output_scalar(0) )
  {
    const tensor_type tensor = Kokkos::cijk(A.values);

    const size_type row_count = A.graph.row_map.dimension_0() - 1;
    const size_type tensor_dimension = tensor.dimension();
    const size_type tensor_align = tensor_dimension;
    const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();

    // Assumed nonzeros per FEM row (e.g. a 27-point 3-D stencil); ideally
    // this would be computed from the FEM graph.
    const size_type fem_nnz_per_row = 27;
    DeviceProp device_prop;
    const size_type shcap = device_prop.shared_memory_capacity;
    const size_type sh_granularity = device_prop.shared_memory_granularity;
    const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
    const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
    const size_type warp_size = device_prop.warp_size;
    const size_type warp_granularity = device_prop.warp_granularity;
    const size_type max_warps_per_block =
      std::min(device_prop.max_threads_per_block / warp_size,
               device_prop.max_warps_per_sm);
    const size_type min_warps_per_block = 1;
    const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
    const size_type max_regs_per_block = device_prop.max_regs_per_block;
    const size_type reg_bank_size = device_prop.reg_bank_size;
    const size_type regs_per_thread =
      device_prop.get_kernel_registers(
        Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
    const size_type regs_per_warp =
      (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
    const size_type warps_per_sm =
      (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
    const size_type warps_per_block =
      (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
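    // The register tallies above bound how many warps can be resident per SM
    // and per thread block; together with the shared-memory sizing below they
    // drive the block-size search that follows.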
    const size_type threads_per_row =
      avg_tensor_entries_per_row >= 88 ? 32 : 16;
    const size_type rows_per_warp = warp_size / threads_per_row;

    const size_type in_vec_scalar_size  = sizeof(input_scalar);
    const size_type out_vec_scalar_size = sizeof(output_scalar);
    const size_type mat_scalar_size     = sizeof(matrix_scalar);
#define USE_FIXED_BLOCKSIZE 0

#if USE_FIXED_BLOCKSIZE

    const size_type num_blocks = 3;
    size_type nw = warps_per_sm / num_blocks;
    while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
    const size_type num_warp = nw;
    const size_type sh_per_block = shcap / num_blocks;
    const size_type sr =
      device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
    size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
      (in_vec_scalar_size+mat_scalar_size);
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    const size_type shmem =
      ( ((in_vec_scalar_size+mat_scalar_size)*block_size +
         out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 )
      & ~(sh_granularity-1);
#else

    // Candidate block sizes are odd values from block_size_min up to roughly
    // half the FEM nonzeros per row; each candidate that fits in shared
    // memory is scored by estimated global tensor reads per thread.
    const size_type block_size_min = 3;
    const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
    const size_type block_size_max =
      half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
    Teuchos::Array<TensorReadEntry> reads_per_thread;
    for (size_type bs = block_size_min; bs <= block_size_max; bs += 2) {
      // Shared memory for the reduction scratch (only needed without shuffle)
      const size_type sr =
        device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
      size_type shmem =
        ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
      shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
      if (shmem <= max_shmem_per_block) {
        size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
        size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
        size_type num_warp =
          std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
                            min_warps_per_block),
                   max_warps_per_block);
        while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
          --num_warp;
        TensorReadEntry entry;
        entry.block_size = bs;
        entry.shmem = shmem;
        entry.num_blocks = num_blocks;
        entry.num_warp = num_warp;

        // Prefer at least 3 resident blocks per SM
        size_type factor = std::min(num_blocks,3u);
        entry.reads = ( static_cast<double>(tensor_reads) /
                        static_cast<double>(factor*num_blocks*num_warp) );
        reads_per_thread.push_back(entry);
      }
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      reads_per_thread.size() == 0, std::logic_error,
      "Stochastic problem dimension is too large to fit in shared memory");
    // Pick the candidate with the fewest estimated tensor reads per thread
    size_type idx = 0;
    double min_reads = reads_per_thread[0].reads;
    for (int i=1; i<reads_per_thread.size(); ++i) {
      if (reads_per_thread[i].reads < min_reads) {
        idx = i;
        min_reads = reads_per_thread[i].reads;
      }
    }

    const size_type block_size = reads_per_thread[idx].block_size;
    const size_type shmem      = reads_per_thread[idx].shmem;
    const size_type num_blocks = reads_per_thread[idx].num_blocks;
    const size_type num_warp   = reads_per_thread[idx].num_warp;

#endif
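    // Launch configuration:  one CUDA block per FEM row of A; within a block,
    // threads_per_row lanes cooperate on each tensor row and
    // rows_per_warp*num_warp tensor rows are processed concurrently.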
    const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
    const dim3 dGrid( row_count, 1, 1 );

#if 0
    // Debug output of the chosen launch parameters (disabled)
    std::cout << "block_size = " << block_size
              << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
              << " regs_per_thread = " << regs_per_thread
              << " num blocks = " << num_blocks
              << " num warps = " << num_warp
              << " num rows = " << tensor_dimension
              << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
              << " avg entries/row = " << avg_tensor_entries_per_row
              << std::endl;
#endif

    // Launch the kernel through Kokkos' Cuda local-memory launch mechanism
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( Multiply( A, x, y, a, b, block_size ) );
  }
};
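//----------------------------------------------------------------------------
// Usage sketch (illustrative comment only):  given a CRS matrix and vectors
// of Sacado::UQ::PCE scalars whose views carry a Cijk tensor, the
// specialization above is reached through Multiply<...>::apply, e.g.
//
//   Multiply<matrix_type, input_vector_type, output_vector_type>::
//     apply( A, x, y );              // y = A*x
//   Multiply<matrix_type, input_vector_type, output_vector_type>::
//     apply( A, x, y, a, b );        // y = a*A*x + b*y
//
// Higher-level wrappers (e.g. a Stokhos/Kokkos multiply() front end) that
// dispatch to this class are assumed and not defined in this header.
//----------------------------------------------------------------------------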
//----------------------------------------------------------------------------
// Specialization of Multiply for rank-2 (multi-vector) input and output.
// Each column is dispatched to the rank-1 specialization above.
//----------------------------------------------------------------------------

template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                   MatrixOrdinal,
                                   Kokkos::Cuda,
                                   MatrixMemory,
                                   MatrixSize >,
                Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
                >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage>  MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage>   InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage>  OutputVectorValue;

  typedef Kokkos::Cuda MatrixDevice;
  typedef MatrixDevice execution_space;
  typedef execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix< MatrixValue,
                             MatrixOrdinal,
                             MatrixDevice,
                             MatrixMemory,
                             MatrixSize > matrix_type;
  typedef Kokkos::View< InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  static void apply( const matrix_type &        A ,
                     const input_vector_type &  x ,
                     const output_vector_type & y ,
                     const input_scalar &       a = input_scalar(1) ,
                     const output_scalar &      b = output_scalar(0) )
  {
    typedef Kokkos::View< InputVectorValue*, InputP... > input_vector_type_1D;
    typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
    typedef Multiply< matrix_type, input_vector_type_1D,
                      output_vector_type_1D > multiply_type_1D;

    // Apply the rank-1 kernel to each column of the multi-vector
    const size_type num_col = y.dimension_1();
    for (size_type col=0; col<num_col; ++col)
      multiply_type_1D::apply(
        A,
        Kokkos::subview( x, Kokkos::ALL(), col),
        Kokkos::subview( y, Kokkos::ALL(), col),
        a, b );
  }
};
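//----------------------------------------------------------------------------
// Note:  the rank-2 specialization above issues one kernel launch per column
// on the default stream, so columns are processed one after another; the
// parallelism is within each column's multiply.
//----------------------------------------------------------------------------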
// Kernel launcher for the mean-based multiply below.
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
// Launch bounds for full occupancy; the specific values are an assumption
// in this reconstruction.
__launch_bounds__(1024,2)
#endif
MeanFullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
//----------------------------------------------------------------------------
// Specialization of MeanMultiply:  multiply by a matrix that stores only the
// mean (PCE size 1) coefficient of each entry, on Cuda.
//----------------------------------------------------------------------------

template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                       MatrixOrdinal,
                                       Kokkos::Cuda,
                                       MatrixMemory,
                                       MatrixSize >,
                    Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                    >
{
public:

  typedef Sacado::UQ::PCE<MatrixStorage>  MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage>   InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage>  OutputVectorValue;

  typedef Kokkos::Cuda MatrixDevice;
  typedef MatrixDevice::size_type size_type;

  typedef Kokkos::CrsMatrix< MatrixValue,
                             MatrixOrdinal,
                             MatrixDevice,
                             MatrixMemory,
                             MatrixSize > matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef Kokkos::View< InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  template <int BlockSize>
  struct Kernel {
    // Flattened (scalar) view of the mean matrix values; the flat-array
    // trait used here is an assumption of this reconstruction.
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;
    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type           m_row_count;
    const size_type           dim ;
    Kernel( const matrix_type &        A ,
            const input_vector_type &  x ,
            const output_vector_type & y ,
            const input_scalar &       a ,
            const output_scalar &      b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , m_row_count( A.graph.row_map.dimension_0()-1 )
      , dim( Kokkos::dimension_scalar(x) )
      {}
    __device__
    void operator()(void) const
    {
      // Shared memory holds a strip of mean matrix values and the matching
      // column indices for the rows handled by this CUDA block.
      volatile matrix_scalar * const sh_A =
        kokkos_impl_cuda_shared_memory<matrix_scalar>();
      volatile size_type * const sh_col =
        reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
      // Each threadIdx.y handles one matrix row
      const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
      if (iBlockRow < m_row_count) {

        const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
        const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

        // Initialize the output:  y = b*y (zero when b == 0)
        if (m_b == output_scalar(0))
          for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
            v_y(pce, iBlockRow) = 0.0;
        else
          for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
            v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
        // Loop over this row's entries in strips of BlockSize columns
        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=BlockSize) {
          const size_type num_col = col_block+BlockSize <= iEntryEnd ?
            BlockSize : iEntryEnd-col_block;

          // Stage column indices and mean matrix values in shared memory
          for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
            sh_col[col*blockDim.y+threadIdx.y] =
              m_A_graph.entries(col_block+col);
            sh_A[col*blockDim.y+threadIdx.y] =
              m_A_values(col_block+col);
          }
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          // Accumulate this strip's contribution for each PCE coefficient
          for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
            output_scalar s = 0.0;
            for ( size_type col = 0; col < num_col; ++col ) {
              const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
              const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
              s += aA*v_x(pce, iCol);
            }
            v_y(pce, iBlockRow) += s;
          }
        }
      }
    }
  };
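  //--------------------------------------------------------------------------
  // apply() below launches Kernel<32>:  32 column entries are staged per
  // shared-memory strip, threadIdx.x strides over the PCE coefficients of x
  // and y, and the shared-memory bank width is matched to the matrix scalar
  // size before the launch.
  //--------------------------------------------------------------------------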
  static void apply( const matrix_type &        A ,
                     const input_vector_type &  x ,
                     const output_vector_type & y ,
                     const input_scalar &       a = input_scalar(1) ,
                     const output_scalar &      b = output_scalar(0) )
  {
    const size_t row_count = A.graph.row_map.dimension_0() - 1;
    const size_type dim = Kokkos::dimension_scalar(x);

    // Choose the thread layout:  threads_per_row lanes cover the PCE
    // coefficients of each row, with enough rows per block for a
    // 256-thread block (the specific split is reconstructed here).
    size_type threads_per_row;
    size_type rows_per_block;
    if (dim >= 32) {
      threads_per_row = 32;
      rows_per_block = 8;
    }
    else {
      threads_per_row = 16;
      rows_per_block = 16;
    }
    const size_type num_blocks =
      (row_count + rows_per_block - 1) / rows_per_block;

    const dim3 dBlock( threads_per_row , rows_per_block , 1 );
    const dim3 dGrid( num_blocks, 1, 1 );
    // Stage 32 column entries per shared-memory load and match the
    // shared-memory bank width to the matrix scalar size.
    const int BlockSize = 32;
    if (sizeof(matrix_scalar) > 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    const size_t shared =
      (BlockSize*dBlock.y) * ( sizeof(size_type) + sizeof(matrix_scalar) );

    // Launch the kernel
    MeanFullOccupancyKernelLaunch<<< dGrid, dBlock, shared >>>
      ( Kernel<BlockSize>( A, x, y, a, b ) );
  }
};

#endif /* #if defined( __CUDACC__ ) */

#endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP */