Kokkos_CrsMatrix_UQ_PCE_Cuda.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
44 
45 #if defined( __CUDACC__)
46 
47 #include "Sacado_UQ_PCE.hpp"
48 #include "Kokkos_View_UQ_PCE.hpp"
50 #include "Kokkos_CrsMatrix.hpp"
51 #include "Kokkos_MV.hpp" // for some utilities
52 
53 #include "Stokhos_Multiply.hpp"
55 
56 #include "Kokkos_Core.hpp"
57 
59 //#include "Stokhos_Cuda_WarpShuffle.hpp"
60 
61 #include "Teuchos_TestForException.hpp"
62 
63 //#include "cuda_profiler_api.h"
64 
65 #ifdef __CUDA_ARCH__
66 # if (__CUDA_ARCH__ >= 300)
67 # define HAVE_CUDA_SHUFFLE 1
68 # else
69 # define HAVE_CUDA_SHUFFLE 0
70 # endif
71 #else
72 # define HAVE_CUDA_SHUFFLE 0
73 #endif
74 
75 namespace Stokhos {
76 
77 //----------------------------------------------------------------------------
78 // Specialization of Kokkos CrsMatrix math functions
79 //----------------------------------------------------------------------------
80 
81 // Kernel implementing y = A * x where
82 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
83 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
84 // x and y are rank 1
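//
// A minimal sketch of what this specialization computes (illustrative only):
// for each deterministic row r, ignoring basis normalization,
//   y_r(i) = a * sum_{c in row r} sum_{j,k} c(i,j,k) * A_rc(j) * x_c(k)  [ + b * y_r(i) ]
// where i, j, k index the PCE basis and c(i,j,k) is the sparse triple-product
// (Cijk) tensor attached to the matrix values. One CUDA block handles one
// deterministic row, and the inner loop below exploits the j/k symmetry of
// the tensor, hence the A(j)*x(k) + A(k)*x(j) term.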
85 template <typename MatrixStorage,
86  typename MatrixOrdinal,
87  typename MatrixMemory,
88  typename MatrixSize,
89  typename InputStorage,
90  typename ... InputP,
91  typename OutputStorage,
92  typename ... OutputP>
93 class Multiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
94  MatrixOrdinal,
95  Kokkos::Cuda,
96  MatrixMemory,
97  MatrixSize>,
98  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
99  InputP... >,
100  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
101  OutputP... >
102  >
103 {
104 public:
105  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
106  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
107  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
108 
109  typedef Kokkos::Cuda MatrixDevice;
110  typedef MatrixDevice execution_space;
111  typedef execution_space::size_type size_type;
112 
113  typedef Kokkos::CrsMatrix< MatrixValue,
114  MatrixOrdinal,
115  MatrixDevice,
116  MatrixMemory,
117  MatrixSize> matrix_type;
118  typedef typename matrix_type::values_type matrix_values_type;
119  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
120  typedef Kokkos::View< InputVectorValue*,
121  InputP... > input_vector_type;
122  typedef Kokkos::View< OutputVectorValue*,
123  OutputP... > output_vector_type;
124 
125 private:
126 
127  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
128  typedef typename matrix_values_type::array_type matrix_array_type;
129  typedef typename input_vector_type::array_type input_array_type;
130  typedef typename output_vector_type::array_type output_array_type;
131 
132  typedef typename MatrixValue::value_type matrix_scalar;
133  typedef typename InputVectorValue::value_type input_scalar;
134  typedef typename OutputVectorValue::value_type output_scalar;
135  typedef typename tensor_type::value_type tensor_scalar;
136 
137  const matrix_array_type m_A_values ;
138  const matrix_graph_type m_A_graph ;
139  const input_array_type m_x ;
140  const output_array_type m_y ;
141  const tensor_type m_tensor ;
142  const input_scalar m_a ;
143  const output_scalar m_b ;
144  const size_type BlockSize;
145 
146  Multiply( const matrix_type & A ,
147  const input_vector_type & x ,
148  const output_vector_type & y ,
149  const input_scalar & a ,
150  const output_scalar & b ,
151  const size_type block_size )
152  : m_A_values( A.values )
153  , m_A_graph( A.graph )
154  , m_x( x )
155  , m_y( y )
156  , m_tensor( Kokkos::cijk(A.values) )
157  , m_a( a )
158  , m_b( b )
159  , BlockSize(block_size)
160  {}
161 
162 public:
163 
164  __device__ void operator()(void) const
165  {
166  // Number of bases in the stochastic system:
167  const size_type dim = m_tensor.dimension();
168 
169  // Get shared memory for loading x, A, and y
170  volatile input_scalar * const sh_x =
171  kokkos_impl_cuda_shared_memory<input_scalar>();
172  volatile matrix_scalar * const sh_A = sh_x + BlockSize*dim;
173  volatile output_scalar * const sh_y = sh_A + BlockSize*dim;
174 #if !HAVE_CUDA_SHUFFLE
175  volatile output_scalar * const sh_t = sh_y + dim;
176 #endif
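 // A note on the shared-memory layout assumed here (sized by apply() below):
 // sh_x holds BlockSize*dim input coefficients, sh_A holds BlockSize*dim
 // matrix coefficients, sh_y holds the dim output coefficients for this row,
 // and, when warp shuffles are unavailable, sh_t provides one reduction slot
 // per thread (blockDim.x * blockDim.y values).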
177 
178  const size_type nid = blockDim.x * blockDim.y;
179  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
180 
181  // Zero y
182  for ( size_type i = tid; i < dim; i += nid ) {
183  sh_y[i] = 0.0;
184  }
185 
186  // Loop over columns in the discrete (finite element) system.
187  // blockIdx.x == row in the deterministic (finite element) system
188  const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
189  const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
190  for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
191  iBlockEntry += BlockSize) {
192  const size_type block_size =
193  (iBlockEntryEnd-iBlockEntry < BlockSize) ?
194  iBlockEntryEnd-iBlockEntry : BlockSize;
195 
196  // Wait for X and A to be used in the previous iteration
197  // before reading new values.
198  __syncthreads();
199 
200  // Coalesced read blocks of X and A into shared memory
201  for ( size_type col = 0; col < block_size; ++col ) {
202 
203  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
204  const input_scalar * const x = & m_x( 0, iBlockColumn );
205  const matrix_scalar * const A = & m_A_values( iBlockEntry + col, 0 );
206 
207  // Coalesced read by the whole block from global memory:
208  for ( size_type i = tid; i < dim; i += nid ) {
209  sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
210  sh_A[col + i * BlockSize] = A[i]; // m_A_values( iBlockEntry + col , i );
211  }
212 
213  }
214 
215  __syncthreads(); // wait for X and A to be read before using them
216 
217  // This cuda block is responsible for computing all values of 'y'
218  for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
219  output_scalar y = 0;
220 
221  // Product tensor entries which this warp will iterate:
222  const size_type lBeg = m_tensor.entry_begin( i );
223  const size_type lEnd = m_tensor.entry_end( i );
224 
225  // Loop through sparse tensor contributions with coalesced reads.
226  for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
227 
228  // Read 'blockDim.x' entries from the tensor (coalesced)
229  const size_type kj = m_tensor.coord( l );
230  const tensor_scalar v = m_tensor.value( l );
231  const size_type j = ( kj & 0x0ffff ) * BlockSize ;
232  const size_type k = ( kj >> 16 ) * BlockSize ;
233 
234  for ( size_type col = 0; col < block_size; ++col ) {
235  y += v * ( sh_A[col+j] * sh_x[col+k] +
236  sh_A[col+k] * sh_x[col+j] );
237  }
238 
239  }
240 
241  // Reduction of 'y' within 'blockDim.x'
242 #if HAVE_CUDA_SHUFFLE
243  if (blockDim.x >= 2) y += Kokkos::shfl_down(y, 1, blockDim.x);
244  if (blockDim.x >= 4) y += Kokkos::shfl_down(y, 2, blockDim.x);
245  if (blockDim.x >= 8) y += Kokkos::shfl_down(y, 4, blockDim.x);
246  if (blockDim.x >= 16) y += Kokkos::shfl_down(y, 8, blockDim.x);
247  if (blockDim.x >= 32) y += Kokkos::shfl_down(y, 16, blockDim.x);
248  if ( threadIdx.x == 0 ) sh_y[i] += y;
249 #else
250  sh_t[ tid ] = y;
251  if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
252  if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
253  if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
254  if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
255  if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
256  if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
257 #endif
258 
259  }
260 
261  }
262 
263  // Wait for all contributions of y to be completed
264  __syncthreads();
265 
266  // Store result back in global memory
267  if ( m_b == output_scalar(0) )
268  for ( size_type i = tid; i < dim; i += nid )
269  m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
270  else
271  for ( size_type i = tid; i < dim; i += nid )
272  m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
273  }
274 
275  struct TensorReadEntry {
276  size_type block_size, shmem, num_blocks, num_warp;
277  double reads;
278  };
279 
280  static void apply( const matrix_type & A ,
281  const input_vector_type & x ,
282  const output_vector_type & y ,
283  const input_scalar & a = input_scalar(1) ,
284  const output_scalar & b = output_scalar(0) )
285  {
286  const tensor_type tensor = Kokkos::cijk(A.values);
287  const size_type row_count = A.graph.row_map.dimension_0() - 1;
288  const size_type tensor_dimension = tensor.dimension();
289  const size_type tensor_align = tensor_dimension;
290  const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();
291 
292  // Should compute this from FEM graph
293  const size_type fem_nnz_per_row = 27;
294 
295  // Get device properties we need for whatever device is currently selected
296  DeviceProp device_prop;
297  const size_type shcap = device_prop.shared_memory_capacity;
298  const size_type sh_granularity = device_prop.shared_memory_granularity;
299  const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
300  const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
301  const size_type warp_size = device_prop.warp_size;
302  const size_type warp_granularity = device_prop.warp_granularity;
303  const size_type max_warps_per_block =
304  std::min(device_prop.max_threads_per_block / warp_size,
305  device_prop.max_warps_per_sm);
306  const size_type min_warps_per_block = 1;
307  const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
308  const size_type max_regs_per_block = device_prop.max_regs_per_block;
309  const size_type reg_bank_size = device_prop.reg_bank_size;
310 
311  // Compute number of warps we can fit on each SM based on register limits
312  // Use Cuda introspection to determine number of registers per thread
313  //const size_type regs_per_thread = 46;
314  const size_type regs_per_thread =
315  device_prop.get_kernel_registers(
316  Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
317  const size_type regs_per_warp =
318  (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
319  const size_type warps_per_sm =
320  (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
321  const size_type warps_per_block =
322  (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
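 // Illustrative arithmetic (device numbers are assumptions; the real values
 // come from DeviceProp): with regs_per_thread = 46, warp_size = 32 and a
 // register bank size of 256, regs_per_warp rounds 32*46 = 1472 up to 1536;
 // an SM with 65536 registers then fits 65536/1536 = 42 warps before the
 // final rounding down to the warp allocation granularity.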
323 
324  // Compute number of threads per stochastic row based on number of
325  // nonzero entries per row.
326  // For double, 16 threads/row is still coalesced, but not for float.
327  // We should reorder the tensor values for a given vector width to
328  // maintain coalesced reads. This would help smaller problems too by
329  // allowing fewer threads/row.
330  const size_type threads_per_row =
331  avg_tensor_entries_per_row >= 88 ? 32 : 16;
332  const size_type rows_per_warp = warp_size / threads_per_row;
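 // For example, with warp_size = 32: a tensor averaging >= 88 entries per
 // stochastic row uses 32 threads/row and 1 row/warp, otherwise 16
 // threads/row and 2 rows/warp.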
333 
334  const size_type in_vec_scalar_size = sizeof(input_scalar);
335  const size_type out_vec_scalar_size = sizeof(output_scalar);
336  const size_type mat_scalar_size = sizeof(matrix_scalar);
337 
338 #define USE_FIXED_BLOCKSIZE 0
339 
340 #if USE_FIXED_BLOCKSIZE
341 
342  const size_type num_blocks = 3;
343  size_type nw = warps_per_sm / num_blocks;
344  while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
345  const size_type num_warp = nw;
346  const size_type sh_per_block = shcap / num_blocks;
347  const size_type sr =
348  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
349  size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
350  (in_vec_scalar_size+mat_scalar_size);
351  if (bs % 2 == 0) --bs;
352  const size_type block_size_max = 31;
353  const size_type block_size = std::min(bs, block_size_max);
354  //const size_type block_size = 7;
355  const size_type shmem =
356  ( ((in_vec_scalar_size+mat_scalar_size)*block_size+out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
357 
358 #else
359 
360  // We want to maximize the number of blocks per SM (to maximize throughput)
361  // as well as the block_size (to minimize tensor reads), subject to
362  // shared memory and register constraints. Here we try to do this computing
363  // the number of tensor reads per block per thread for each choice of
364  // block_size, and then choose the configuration with the smallest value.
365  // This isn't perfect, but seems to generally work OK. It could be
366  // improved with a better model of:
367  // * Number of blocks versus warps per block (to minimize synchronization)
368  // * Thread efficiency for small numbers of rows per thread
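 // Illustrative sizing under assumed values (double scalars, tensor
 // dimension 64, candidate block_size bs = 5): the footprint computed below
 // is ((8+8)*5 + 8)*64 = 5632 bytes, plus the reduction scratch sr when
 // shuffles are unavailable, rounded up to the shared-memory granularity.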
369  const size_type block_size_min = 3;
370  const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
371  const size_type block_size_max =
372  half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
373  Teuchos::Array<TensorReadEntry> reads_per_thread;
374  for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
375  // We don't know the number of warps yet, so we just have to bound
376  // sr by the maximum number possible (which is all warps in 1 block)
377  const size_type sr =
378  device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
379  size_type shmem =
380  ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
381  shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
382  if (shmem <= max_shmem_per_block) {
383  size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
384  size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
385  size_type num_warp =
386  std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
387  min_warps_per_block),
388  max_warps_per_block);
389  while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
390  --num_warp;
391  TensorReadEntry entry;
392  entry.block_size = bs;
393  entry.shmem = shmem;
394  entry.num_blocks = num_blocks;
395  entry.num_warp = num_warp;
396 
397  // Prefer at least 3 blocks
398  size_type factor = std::min(num_blocks,3u);
399  entry.reads = (static_cast<double>(tensor_reads) /
400  static_cast<double>(factor*num_blocks*num_warp));
401  reads_per_thread.push_back(entry);
402  }
403  }
404  TEUCHOS_TEST_FOR_EXCEPTION(
405  reads_per_thread.size() == 0, std::logic_error,
406  "Stochastic problem dimension is too large to fit in shared memory");
407  size_type idx = 0;
408  double min_reads = reads_per_thread[0].reads;
409  for (int i=1; i<reads_per_thread.size(); ++i) {
410  if (reads_per_thread[i].reads < min_reads) {
411  idx = i;
412  min_reads = reads_per_thread[i].reads;
413  }
414  }
415 
416  const size_type block_size = reads_per_thread[idx].block_size;
417  const size_type shmem = reads_per_thread[idx].shmem;
418  const size_type num_blocks = reads_per_thread[idx].num_blocks;
419  const size_type num_warp = reads_per_thread[idx].num_warp;
420 
421 #endif
422 
423  // Setup thread blocks and grid
424  const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
425  const dim3 dGrid( row_count, 1, 1 );
426 
427 #if 0
428  std::cout << "block_size = " << block_size
429  << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
430  << " regs_per_thread = " << regs_per_thread
431  << " num blocks = " << num_blocks
432  << " num warps = " << num_warp
433  << " num rows = " << tensor_dimension
434  << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
435  << " avg entries/row = " << avg_tensor_entries_per_row
436  << std::endl;
437 #endif
438 
439  // Finally launch our kernel
440  //cudaProfilerStart();
441  Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
442  ( Multiply( A, x, y, a, b, block_size ) );
443  //cudaProfilerStop();
444  }
445 };
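// Minimal usage sketch (illustrative; in practice this specialization is
// normally reached through the generic Stokhos multiply interface rather
// than named directly):
//
//   typedef Stokhos::Multiply<matrix_type, input_vector_type,
//                             output_vector_type> multiply_type;
//   multiply_type::apply(A, x, y);        // y = A*x
//   multiply_type::apply(A, x, y, a, b);  // y = a*(A*x) + b*y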
446 
447 // Kernel implementing y = A * x where
448 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>,
449 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
450 // x and y are rank 2
451 //
452 // Note: Unlike the rank-1 version, this version has not been
453 // optimized, and doesn't even include the block-column implementation
454 template <typename MatrixStorage,
455  typename MatrixOrdinal,
456  typename MatrixMemory,
457  typename MatrixSize,
458  typename InputStorage,
459  typename ... InputP,
460  typename OutputStorage,
461  typename ... OutputP>
462 class Multiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
463  MatrixOrdinal,
464  Kokkos::Cuda,
465  MatrixMemory,
466  MatrixSize>,
467  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
468  InputP... >,
469  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
470  OutputP... >
471  >
472 {
473 public:
474  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
475  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
476  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
477 
478  typedef Kokkos::Cuda MatrixDevice;
479  typedef MatrixDevice execution_space;
480  typedef execution_space::size_type size_type;
481 
482  typedef Kokkos::CrsMatrix< MatrixValue,
483  MatrixOrdinal,
484  MatrixDevice,
485  MatrixMemory,
486  MatrixSize> matrix_type;
487  typedef Kokkos::View< InputVectorValue**,
488  InputP... > input_vector_type;
489  typedef Kokkos::View< OutputVectorValue**,
490  OutputP... > output_vector_type;
491  typedef typename InputVectorValue::value_type input_scalar;
492  typedef typename OutputVectorValue::value_type output_scalar;
493 
494 public:
495 
496  static void apply( const matrix_type & A ,
497  const input_vector_type & x ,
498  const output_vector_type & y ,
499  const input_scalar & a = input_scalar(1) ,
500  const output_scalar & b = output_scalar(0) )
501  {
502  typedef Kokkos::View< InputVectorValue*, InputP... > input_vector_type_1D;
503  typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
504  typedef Multiply< matrix_type, input_vector_type_1D,
505  output_vector_type_1D > multiply_type_1D;
506 
507  const size_type num_col = y.dimension_1();
508  for (size_type col=0; col<num_col; ++col)
509 
510  multiply_type_1D::apply(
511  A,
512  Kokkos::subview( x, Kokkos::ALL(), col),
513  Kokkos::subview(y, Kokkos::ALL(), col),
514  a, b );
515  }
516 };
517 
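// Launch helper for the mean-only kernel below: on compute capability >= 3.0
// the __launch_bounds__(1024, 2) qualifier asks the compiler to limit
// register usage so that a 1024-thread block can still run with at least two
// resident blocks per SM.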
518 template <typename Kernel>
519 __global__ void
520 #if __CUDA_ARCH__ >= 300
521 __launch_bounds__(1024,2)
522 #endif
523 MeanFullOccupancyKernelLaunch(Kernel kernel) {
524  kernel();
525 }
526 
527 // Kernel implementing y = A * x where PCE size of A is 1
528 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
529 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
530 // x and y are rank 1
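//
// Illustrative note: a PCE size of 1 means each matrix entry carries only its
// mean (zeroth) coefficient, so no triple-product tensor is needed; the
// kernel below applies the same scalar SpMV to every PCE coefficient of x,
// reading the matrix through a flattened (FlatArrayType) view of A.values.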
531 template <typename MatrixStorage,
532  typename MatrixOrdinal,
533  typename MatrixMemory,
534  typename MatrixSize,
535  typename InputStorage,
536  typename ... InputP,
537  typename OutputStorage,
538  typename ... OutputP>
539 class MeanMultiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
540  MatrixOrdinal,
541  Kokkos::Cuda,
542  MatrixMemory,
543  MatrixSize >,
544  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
545  InputP... >,
546  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
547  OutputP... >
548  >
549 {
550 public:
551  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
552  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
553  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
554 
555  typedef Kokkos::Cuda MatrixDevice;
556  typedef MatrixDevice execution_space;
557  typedef Kokkos::CrsMatrix< MatrixValue,
558  MatrixOrdinal,
559  MatrixDevice,
560  MatrixMemory,
561  MatrixSize> matrix_type;
562  typedef typename matrix_type::values_type matrix_values_type;
563  typedef typename MatrixValue::ordinal_type size_type;
564  typedef Kokkos::View< InputVectorValue*,
565  InputP... > input_vector_type;
566  typedef Kokkos::View< OutputVectorValue*,
567  OutputP... > output_vector_type;
568 
569  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
570  typedef typename MatrixValue::value_type matrix_scalar;
571  typedef typename InputVectorValue::value_type input_scalar;
572  typedef typename OutputVectorValue::value_type output_scalar;
573 
574  template <int BlockSize>
575  struct Kernel {
576  typedef MatrixDevice execution_space;
577  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
578  typedef typename input_vector_type::array_type input_array_type;
579  typedef typename output_vector_type::array_type output_array_type;
580 
581  const matrix_array_type m_A_values ;
582  const matrix_graph_type m_A_graph ;
583  const output_array_type v_y ;
584  const input_array_type v_x ;
585  const input_scalar m_a ;
586  const output_scalar m_b ;
587  const size_type m_row_count;
588  const size_type dim ;
589 
590  Kernel( const matrix_type & A ,
591  const input_vector_type & x ,
592  const output_vector_type & y ,
593  const input_scalar & a ,
594  const output_scalar & b )
595  : m_A_values( A.values )
596  , m_A_graph( A.graph )
597  , v_y( y )
598  , v_x( x )
599  , m_a( a )
600  , m_b( b )
601  , m_row_count( A.graph.row_map.dimension_0()-1 )
602  , dim( dimension_scalar(x) )
603  {}
604 
605  __device__ void operator()(void) const
606  {
607  // Store matrix values and column indices in shared memory
608  // to reduce global memory reads
609  volatile matrix_scalar * const sh_A =
610  kokkos_impl_cuda_shared_memory<matrix_scalar>();
611  volatile size_type * const sh_col =
612  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
613 
614  // Check for valid row
615  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
616  if (iBlockRow < m_row_count) {
617 
618  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
619  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
620 
621  // Initialize result
622  if (m_b == output_scalar(0))
623  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
624  v_y(pce, iBlockRow) = 0.0;
625  else
626  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
627  v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
628 
629  // Loop over columns in chunks of size BlockSize
630  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
631  col_block+=BlockSize) {
632  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
633  BlockSize : iEntryEnd-col_block;
634 
635  // Read BlockSize column indices and matrix values at a time to maintain
636  // coalesced accesses
637  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
638  sh_col[col*blockDim.y+threadIdx.y] =
639  m_A_graph.entries(col_block+col);
640  sh_A[col*blockDim.y+threadIdx.y] =
641  m_A_values(col_block+col);
642  }
643  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
644  __syncthreads();
645 
646  // Do the portion of the mat-vec for each PCE coefficient over the columns
647  // within this block
648  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
649  output_scalar s = 0.0;
650  for ( size_type col = 0; col < num_col; ++col ) {
651  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
652  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
653  s += aA*v_x(pce, iCol);
654  }
655  v_y(pce, iBlockRow) += s;
656  }
657  }
658  }
659  }
660  };
661 
662  static void apply( const matrix_type & A ,
663  const input_vector_type & x ,
664  const output_vector_type & y ,
665  const input_scalar & a = input_scalar(1) ,
666  const output_scalar & b = output_scalar(0) )
667  {
668  const size_t row_count = A.graph.row_map.dimension_0() - 1;
669  const size_type dim = dimension_scalar(x);
670 
671  // Compute number of threads for PCE coefficients and number of
672  // matrix rows processed by each CUDA block. A total of 256 threads
673  // gives best occupancy
674  size_type threads_per_row;
675  size_type rows_per_block;
676  if (dim >= 32) {
677  threads_per_row = 32;
678  rows_per_block = 8;
679  }
680  else {
681  threads_per_row = 16;
682  rows_per_block = 16;
683  }
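 // Both choices give 256 threads per block: 32*8 when the PCE dimension fills
 // a warp, 16*16 otherwise.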
684  const size_type num_blocks =
685  (row_count + rows_per_block -1 ) / rows_per_block;
686 
687  // Setup thread blocks and grid
688  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
689  const dim3 dGrid( num_blocks, 1, 1 );
690 
691  // Setup shared memory for storing matrix values and column indices
692  // Number of columns we process at a time -- making this bigger
693  // requires more shared memory and reduces occupancy
694  const int BlockSize = 32;
695  if (sizeof(matrix_scalar) > 4)
696  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
697  else
698  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
699  const size_t shared =
700  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
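 // Illustrative footprint, assuming a 4-byte ordinal and 8-byte (double)
 // matrix values with dim >= 32: 32 columns * 8 rows/block * (4 + 8) bytes
 // = 3072 bytes of shared memory per block.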
701 
702  // Launch kernel
703  MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
704  ( Kernel<BlockSize>( A, x, y, a, b ) );
705  }
706 };
707 
708 /*
709  * Disable this specialization as it is actually slower than the default
710  * implementation of launching the single column kernel, one column at a time.
711  * It looks like it uses a few more registers which is reducing occupancy.
712 
713 // Kernel implementing y = A * x where PCE size of A is 1
714 // A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
715 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
716 // x and y are rank 2
717 template <typename MatrixStorage,
718  typename MatrixOrdinal,
719  typename MatrixMemory,
720  typename MatrixSize,
721  typename InputStorage,
722  typename ... InputP,
723  typename OutputStorage,
724  typename ... OutputP>
725 class MeanMultiply< Kokkos::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
726  MatrixOrdinal,
727  Kokkos::Cuda,
728  MatrixMemory,
729  MatrixSize >,
730  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
731  InputP... >,
732  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
733  OutputP... >
734  >
735 {
736 public:
737  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
738  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
739  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
740 
741  typedef Kokkos::Cuda MatrixDevice;
742  typedef MatrixDevice execution_space;
743  typedef Kokkos::CrsMatrix< MatrixValue,
744  MatrixOrdinal,
745  MatrixDevice,
746  MatrixMemory,
747  MatrixSize> matrix_type;
748  typedef typename matrix_type::values_type matrix_values_type;
749  typedef typename MatrixValue::ordinal_type size_type;
750  typedef Kokkos::View< InputVectorValue**,
751  InputP... > input_vector_type;
752  typedef Kokkos::View< OutputVectorValue**,
753  OutputP... > output_vector_type;
754 
755  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
756  typedef typename MatrixValue::value_type matrix_scalar;
757  typedef typename InputVectorValue::value_type input_scalar;
758  typedef typename OutputVectorValue::value_type output_scalar;
759 
760  template <int BlockSize>
761  struct Kernel {
762  typedef MatrixDevice execution_space;
763  typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
764  typedef typename input_vector_type::array_type input_array_type;
765  typedef typename output_vector_type::array_type output_array_type;
766 
767  const matrix_array_type m_A_values ;
768  const matrix_graph_type m_A_graph ;
769  const output_array_type v_y ;
770  const input_array_type v_x ;
771  const input_scalar m_a ;
772  const output_scalar m_b ;
773  const size_type m_row_count;
774  const size_type m_num_col;
775  const size_type dim ;
776 
777  Kernel( const matrix_type & A ,
778  const input_vector_type & x ,
779  const output_vector_type & y ,
780  const input_scalar & a ,
781  const output_scalar & b )
782  : m_A_values( A.values )
783  , m_A_graph( A.graph )
784  , v_y( y )
785  , v_x( x )
786  , m_a( a )
787  , m_b( b )
788  , m_row_count( A.graph.row_map.dimension_0()-1 )
789  , m_num_col( x.dimension_1() )
790  , dim( dimension_scalar(x) )
791  {}
792 
793  __device__ void operator()(void) const
794  {
795  // Store matrix values and column indices in shared memory
796  // to reduce global memory reads
797  volatile matrix_scalar * const sh_A =
798  kokkos_impl_cuda_shared_memory<matrix_scalar>();
799  volatile size_type * const sh_col =
800  reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
801 
802  // Check for valid row
803  const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
804  if (iBlockRow < m_row_count) {
805 
806  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
807  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
808 
809  // Initialize result
810  if (m_b == output_scalar(0))
811  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
812  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
813  v_y(pce, iBlockRow, xcol) = 0.0;
814  else
815  for ( size_type xcol = 0; xcol < m_num_col; xcol++)
816  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
817  v_y(pce, iBlockRow, xcol) = m_b*v_y(pce, iBlockRow, xcol);
818 
819  // Loop over columns in chunks of size BlockSize
820  for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
821  col_block+=BlockSize) {
822  const size_type num_col = col_block+BlockSize <= iEntryEnd ?
823  BlockSize : iEntryEnd-col_block;
824 
825  // Read BlockSize column indices and matrix values at a time to maintain
826  // coalesced accesses
827  for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
828  sh_col[col*blockDim.y+threadIdx.y] =
829  m_A_graph.entries(col_block+col);
830  sh_A[col*blockDim.y+threadIdx.y] =
831  m_A_values(col_block+col);
832  }
833  if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
834  __syncthreads();
835 
836  // Do the portion of the mat-vec for each PCE coefficient over the columns
837  // within this block
838  for ( size_type xcol = 0; xcol < m_num_col; xcol++) {
839  for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
840  output_scalar s = 0.0;
841  for ( size_type col = 0; col < num_col; ++col ) {
842  const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
843  const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
844  s += aA*v_x(pce, iCol, xcol);
845  }
846  v_y(pce, iBlockRow, xcol) += s;
847  }
848  }
849  }
850  }
851  }
852  };
853 
854  static void apply( const matrix_type & A ,
855  const input_vector_type & x ,
856  const output_vector_type & y ,
857  const input_scalar & a = input_scalar(1) ,
858  const output_scalar & b = output_scalar(0) )
859  {
860  const size_t row_count = A.graph.row_map.dimension_0() - 1;
861  const size_type dim = dimension_scalar(x);
862 
863  // Compute number of threads for PCE coefficients and number of
864  // matrix rows processed by each CUDA block. A total of 256 threads
865  // gives best occupancy
866  size_type threads_per_row;
867  size_type rows_per_block;
868  if (dim >= 32) {
869  threads_per_row = 32;
870  rows_per_block = 6;
871  }
872  else {
873  threads_per_row = 16;
874  rows_per_block = 12;
875  }
876  const size_type num_blocks =
877  (row_count + rows_per_block -1 ) / rows_per_block;
878 
879  // Setup thread blocks and grid
880  const dim3 dBlock( threads_per_row , rows_per_block , 1 );
881  const dim3 dGrid( num_blocks, 1, 1 );
882 
883  // Setup shared memory for storing matrix values and column indices
884  // Number of columns we process at a time -- making this bigger
885  // requires more shared memory and reduces occupancy
886  const int BlockSize = 32;
887  if (sizeof(matrix_scalar) > 4)
888  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
889  else
890  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
891  const size_t shared =
892  (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
893 
894  // Launch kernel
895  // MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
896  // ( Kernel<BlockSize>( A, x, y, a, b ) );
897 
898  Kokkos::Impl::cuda_parallel_launch_local_memory<<<dGrid, dBlock, shared >>>
899  ( Kernel<BlockSize>( A, x, y, a, b ) );
900  }
901 };
902 */
903 
904 } // namespace Stokhos
905 
906 #endif /* #if defined( __CUDACC__) */
907 
908 #endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP */