Stokhos_Cuda_CrsProductTensor.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_CrsProductTensor.hpp"

#include "Stokhos_Cuda_DeviceProp.hpp"
#include "Stokhos_Cuda_WarpShuffle.hpp"

#include "Teuchos_TestForException.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

// Matrix-vector product specialization for the CrsProductTensor layout.
// To do:
//   * Incorporate the texture/read-only cache
//   * Change the tensor layout to allow coalesced reads with fewer
//     threads/row (will probably only help smaller problem sizes)
//   * Get the average FEM entries/row from the FEM graph (hardcoded to 27)
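//
// Kernel organization (as implemented below): the launch assigns one CUDA
// block per row of the deterministic (finite element) graph. Within a pass,
// up to 'block_size' FE column blocks of x and A are staged in shared
// memory; threadIdx.y strides over the stochastic (tensor) rows while
// threadIdx.x strides over that row's sparse Cijk entries, and a per-row
// reduction across threadIdx.x accumulates the result into shared y before
// it is written back to global memory.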
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< CrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CrsProductTensor< TensorScalar, execution_space > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

#define USE_LDG 0

#if USE_LDG == 0

  // The multiply kernel
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      // Get shared memory for loading x, A, and y
      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile MatrixScalar * const sh_A = sh_x + BlockSize*dim;
      volatile VectorScalar * const sh_y = sh_A + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif
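
      // Shared-memory layout (in units of the scalar types):
      //   sh_x : BlockSize*dim entries of x, interleaved by FE column
      //   sh_A : BlockSize*dim entries of A, interleaved by FE column
      //   sh_y : dim partial results for this FE row
      //   sh_t : one slot per thread for the tree reduction (only when
      //          warp shuffles are unavailable)
      // This matches the 'shmem' size computed in apply() below.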

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
            iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read of blocks of X and A into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );
          const MatrixScalar * const A = & m_A.values( 0, iBlockEntry + col );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn )
            sh_A[col + i * BlockSize] = A[i]; // m_A.values( i, iBlockEntry + col )
          }

        }
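
        // The global loads above are contiguous in the basis index i, so
        // consecutive threads read consecutive addresses (coalesced). The
        // interleaved shared layout 'col + i*BlockSize' keeps the staged
        // copies of basis entry i for all FE columns adjacent, which the
        // inner column loop below walks sequentially.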

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) * BlockSize ;
            const size_type k = ( kj >> 16 ) * BlockSize ;
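
            // coord(l) packs the two stochastic indices of this Cijk entry
            // into the low and high 16 bits of one word; scaling by
            // BlockSize converts them to offsets into the interleaved
            // shared arrays. The loop below adds the symmetric contribution
            // v * ( A[j]*x[k] + A[k]*x[j] ) for each staged FE column.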

            for ( size_type col = 0; col < block_size; ++col ) {
              y += v * ( sh_A[col+j] * sh_x[col+k] +
                         sh_A[col+k] * sh_x[col+j] );
            }

          }

          // Reduction of 'y' within 'blockDim.x'
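          // Each lane of threadIdx.x holds a partial sum over a strided
          // subset of row i's tensor entries; sum across the lanes with
          // warp shuffles when available (otherwise through shared memory)
          // and let lane 0 accumulate the row result into sh_y[i].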
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2)  y += shfl_down(y, 1, blockDim.x);
          if (blockDim.x >= 4)  y += shfl_down(y, 2, blockDim.x);
          if (blockDim.x >= 8)  y += shfl_down(y, 4, blockDim.x);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#elif USE_LDG == 1

  // The multiply kernel -- using the read-only data cache for A.
  // Currently slower than the variant above.
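  // This variant stages only x in shared memory and reads the matrix block
  // values straight from global memory, through __ldg() (the read-only /
  // texture cache path) on compute capability 3.5 and newer. That roughly
  // halves the shared memory needed per block, at the cost of re-reading
  // A for every tensor entry.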
  class MultiplyKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type BlockSize;

    MultiplyKernel( const matrix_type & A,
                    const vector_type & x,
                    const vector_type & y,
                    const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), BlockSize(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y = sh_x + BlockSize*dim;
#if !HAVE_CUDA_SHUFFLE
      volatile VectorScalar * const sh_t = sh_y + dim;
#endif

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Zero y
      for ( size_type i = tid; i < dim; i += nid ) {
        sh_y[i] = 0.0;
      }

      // Loop over columns in the discrete (finite element) system.
      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           iBlockEntry += BlockSize) {
        const size_type block_size =
          (iBlockEntryEnd-iBlockEntry < BlockSize) ?
            iBlockEntryEnd-iBlockEntry : BlockSize;

        // Wait for X to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read of blocks of X into shared memory
        for ( size_type col = 0; col < block_size; ++col ) {

          const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
          const VectorScalar * const x = & m_x( 0, iBlockColumn );

          // Coalesced read by the whole block from global memory:
          for ( size_type i = tid; i < dim; i += nid ) {
            sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn )
          }

        }

        __syncthreads(); // wait for X to be read before using it

        // This cuda block is responsible for computing all values of 'y'
        for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
          VectorScalar y = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin( i );
          const size_type lEnd = m_A.block.entry_end( i );

          // Loop through sparse tensor contributions with coalesced reads.
          for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {

            // Read 'blockDim.x' entries from the tensor (coalesced)
            const size_type kj = m_A.block.coord( l );
            const TensorScalar v = m_A.block.value( l );
            const size_type j = ( kj & 0x0ffff ) ;
            const size_type k = ( kj >> 16 ) ;

            for ( size_type col = 0; col < block_size; ++col ) {
              const size_type bCol = iBlockEntry + col;
#if (__CUDA_ARCH__ >= 350)
              y += v * ( __ldg(&m_A.values(j,bCol)) * sh_x[col+k*BlockSize] +
                         __ldg(&m_A.values(k,bCol)) * sh_x[col+j*BlockSize] );
#else
              y += v * ( m_A.values(j,bCol) * sh_x[col+k*BlockSize] +
                         m_A.values(k,bCol) * sh_x[col+j*BlockSize] );
#endif
            }

          }

          // Reduction of 'y' within 'blockDim.x'
#if HAVE_CUDA_SHUFFLE
          if (blockDim.x >= 2)  y += shfl_down(y, 1, blockDim.x);
          if (blockDim.x >= 4)  y += shfl_down(y, 2, blockDim.x);
          if (blockDim.x >= 8)  y += shfl_down(y, 4, blockDim.x);
          if (blockDim.x >= 16) y += shfl_down(y, 8, blockDim.x);
          if (blockDim.x >= 32) y += shfl_down(y, 16, blockDim.x);
          if ( threadIdx.x == 0 ) sh_y[i] += y;
#else
          sh_t[ tid ] = y;
          if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
          if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
          if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
          if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
          if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
          if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
#endif

        }

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for ( size_type i = tid; i < dim; i += nid ) {
        m_y( i, blockIdx.x ) = sh_y[ i ];
      }
    }
  };

#endif

  //------------------------------------

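  // One candidate launch configuration considered by apply(): the FE block
  // size staged per pass, the dynamic shared memory it needs, the resulting
  // resident blocks per SM and warps per block, and the tensor-reads-per-
  // thread metric used to pick among candidates.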
  struct TensorReadEntry {
    size_type block_size, shmem, num_blocks, num_warp;
    double reads;
  };

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.dimension_0() - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tensor_align = tensor_dimension;
    const size_type avg_tensor_entries_per_row = A.block.avg_entries_per_row();

    // Should compute this from FEM graph
    const size_type fem_nnz_per_row = 27;

    // Get device properties we need for whatever device is currently selected
    DeviceProp device_prop;
    const size_type shcap = device_prop.shared_memory_capacity;
    const size_type sh_granularity = device_prop.shared_memory_granularity;
    const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
    const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
    const size_type warp_size = device_prop.warp_size;
    const size_type warp_granularity = device_prop.warp_granularity;
    const size_type max_warps_per_block =
      std::min(device_prop.max_threads_per_block / warp_size,
               device_prop.max_warps_per_sm);
    const size_type min_warps_per_block = 1;
    const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
    const size_type max_regs_per_block = device_prop.max_regs_per_block;
    const size_type reg_bank_size = device_prop.reg_bank_size;

    // Compute number of warps we can fit on each SM based on register limits
    // Use Cuda introspection to determine number of registers per thread
    //const size_type regs_per_thread = 46;
    const size_type regs_per_thread =
      device_prop.get_kernel_registers(
        Kokkos::Impl::cuda_parallel_launch_local_memory<MultiplyKernel>);
    const size_type regs_per_warp =
      (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
    const size_type warps_per_sm =
      (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
    const size_type warps_per_block =
      (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
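
    // The register usage per warp is rounded up to the register allocation
    // granularity (reg_bank_size), and the resulting warp counts are rounded
    // down to the warp allocation granularity; these bound how many warps
    // the register file allows per SM and per block, independent of shared
    // memory.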

    // Compute number of threads per stochastic row based on number of
    // nonzero entries per row.
    // For double, 16 threads/row is still coalesced, but not for float.
    // We should reorder the tensor values for a given vector width to
    // maintain coalesced reads. This would help smaller problems too by
    // allowing fewer threads/row.
    const size_type threads_per_row =
      avg_tensor_entries_per_row >= 88 ? 32 : 16;
    const size_type rows_per_warp = warp_size / threads_per_row;
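
    // With these choices, threads_per_row lanes of a warp cooperate on one
    // stochastic row and each warp covers rows_per_warp rows; the thread
    // block built below is threads_per_row x (rows_per_warp*num_warp).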

    const size_type vec_scalar_size = sizeof(VectorScalar);
#if USE_LDG == 0
    const size_type mat_scalar_size = sizeof(MatrixScalar);
#endif

#define USE_FIXED_BLOCKSIZE 0
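
    // When USE_FIXED_BLOCKSIZE is enabled, the launch uses a hard-coded
    // three blocks per SM and derives the largest (odd) block size that
    // fits in the per-block share of shared memory. Otherwise the search
    // below enumerates feasible block sizes and picks the one that
    // minimizes the estimated tensor reads per thread.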

#if USE_FIXED_BLOCKSIZE

    const size_type num_blocks = 3;
    size_type nw = warps_per_sm / num_blocks;
    while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
    const size_type num_warp = nw;
    const size_type sh_per_block = shcap / num_blocks;
    const size_type sr =
      device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*num_warp;
#if USE_LDG == 1
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      vec_scalar_size;
#else
    size_type bs = ((sh_per_block - sr) / tensor_align - vec_scalar_size) /
      (vec_scalar_size+mat_scalar_size);
#endif
    if (bs % 2 == 0) --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    //const size_type block_size = 7;
#if USE_LDG == 1
    const size_type shmem =
      ( (vec_scalar_size*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#else
    const size_type shmem =
      ( ((vec_scalar_size+mat_scalar_size)*block_size+vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
#endif

#else

    // We want to maximize the number of blocks per SM (to maximize
    // throughput) as well as the block_size (to minimize tensor reads),
    // subject to shared memory and register constraints. Here we try to do
    // this by computing the number of tensor reads per block per thread for
    // each choice of block_size, and then choosing the configuration with
    // the smallest value. This isn't perfect, but it seems to generally
    // work OK. It could be improved with a better model of:
    //   * Number of blocks versus warps per block (to minimize
    //     synchronization)
    //   * Thread efficiency for small numbers of rows per thread
    const size_type block_size_min = 3;
    const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
    const size_type block_size_max =
      half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
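
    // Candidate block sizes below are odd (block_size_min is 3 and the loop
    // advances by 2), presumably so that the stride-BlockSize interleaved
    // shared arrays avoid shared-memory bank conflicts; block_size_max is
    // sized so that roughly two passes cover the assumed FE row length.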
    Teuchos::Array<TensorReadEntry> reads_per_thread;
    for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
      // We don't know the number of warps yet, so we just have to bound
      // sr by the maximum number possible (which is all warps in 1 block)
      const size_type sr =
        device_prop.has_shuffle ? 0 : vec_scalar_size*warp_size*warps_per_block;
#if USE_LDG == 1
      size_type shmem =
        (vec_scalar_size*bs+vec_scalar_size)*tensor_align+sr;
#else
      size_type shmem =
        ((vec_scalar_size+mat_scalar_size)*bs+vec_scalar_size)*tensor_align+sr;
#endif
      shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
      if (shmem <= max_shmem_per_block) {
        size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
        size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
        size_type num_warp =
          std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
                            min_warps_per_block),
                   max_warps_per_block);
        while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
          --num_warp;
        TensorReadEntry entry;
        entry.block_size = bs;
        entry.shmem = shmem;
        entry.num_blocks = num_blocks;
        entry.num_warp = num_warp;

        // Prefer at least 3 blocks
        size_type factor = std::min(num_blocks,3u);
        entry.reads = (static_cast<double>(tensor_reads) /
                       static_cast<double>(factor*num_blocks*num_warp));
        reads_per_thread.push_back(entry);
      }
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      reads_per_thread.size() == 0, std::logic_error,
      "Stochastic problem dimension is too large to fit in shared memory");
    size_type idx = 0;
    double min_reads = reads_per_thread[0].reads;
    for (int i=1; i<reads_per_thread.size(); ++i) {
      if (reads_per_thread[i].reads < min_reads) {
        idx = i;
        min_reads = reads_per_thread[i].reads;
      }
    }

    const size_type block_size = reads_per_thread[idx].block_size;
    const size_type shmem = reads_per_thread[idx].shmem;
    const size_type num_blocks = reads_per_thread[idx].num_blocks;
    const size_type num_warp = reads_per_thread[idx].num_warp;

#endif

    // Setup thread blocks and grid
    const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
    const dim3 dGrid( row_count, 1, 1 );
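
    // Launch geometry: one block per FE row (dGrid.x == row_count);
    // threadIdx.x indexes the lanes cooperating on a stochastic row and
    // threadIdx.y indexes the stochastic rows handled concurrently, so the
    // block holds rows_per_warp*num_warp rows' worth of threads. The kernel
    // below is launched with 'shmem' bytes of dynamic shared memory.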

#if 0
    std::cout << "block_size = " << block_size
              << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
              << " regs_per_thread = " << regs_per_thread
              << " num blocks = " << num_blocks
              << " num warps = " << num_warp
              << " num rows = " << tensor_dimension
              << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
              << " avg entries/row = " << avg_tensor_entries_per_row
              << std::endl;
#endif

    // Finally launch our kernel
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( MultiplyKernel( A, x, y, block_size ) );
    //cudaProfilerStop();
  }

};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_CRS_PRODUCT_TENSOR_HPP */