Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
Kokkos_Parallel_MP_Vector.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43 #define KOKKOS_PARALLEL_MP_VECTOR_HPP
44 
45 #include "Sacado_MP_Vector.hpp"
46 #include "Kokkos_Core.hpp"
47 
48 //----------------------------------------------------------------------------
49 // Kokkos execution policies useful for Sacado::MP::Vector scalar type
50 //----------------------------------------------------------------------------
51 
52 namespace Kokkos {
53 
/// Team-based parallel work configuration for Sacado::MP::Vector.
///
/// Plain value type bundling the three sizes consumed by the ParallelFor<>
/// specializations for this configuration type.
///
/// \tparam ExecSpace  Execution space the work is dispatched on; exported
///                    as the nested type \c execution_space.
template< class ExecSpace >
struct MPVectorWorkConfig {

  // NOTE(review): the listing this declaration was recovered from skips one
  // source line just before this typedef -- confirm against upstream that
  // no further member typedef is missing here.
  typedef ExecSpace execution_space ;

  size_t range;   ///< Extent of the work range; work items are [0, range).
  size_t team;    ///< Team size requested for each work item.
  size_t shared;  ///< Shared-memory size requested for the launch.

  /// Store the three sizes verbatim.
  ///
  /// \param range_   Extent of the work range.
  /// \param team_    Team size.
  /// \param shared_  Shared-memory size; defaults to 0.
  MPVectorWorkConfig( const size_t range_,
                      const size_t team_,
                      const size_t shared_ = 0 ) :
    range(range_), team(team_), shared(shared_) {}
};
72 
73 namespace Impl {
74 
75 #if defined( KOKKOS_HAVE_PTHREAD )
76 // Specialization of ParallelFor<> for MPVectorWorkConfig and Threads
77 // The default implementation ignores the team size and uses the standard
78 // work-range implementation. In the future maybe we should try and use
79 // hyperthreads in a useful way. That would require:
80 // -- interpreting the team-size differently, rather as the sacado size
81 // -- determining the vector size of the architecture
82 // -- laying out the threads differently to use hyperthreads across the
83 // the sacado dimension
84 template< class FunctorType >
85 class ParallelFor< FunctorType , MPVectorWorkConfig< Threads > > :
86  public ParallelFor< FunctorType , Kokkos::RangePolicy< Threads > > {
87  typedef Kokkos::RangePolicy< Threads > Policy ;
88 public:
89  ParallelFor( const FunctorType & functor ,
90  const MPVectorWorkConfig< Threads > & work_config ) :
91  ParallelFor< FunctorType , Policy >( functor ,
92  Policy( 0, work_config.range ) ) {}
93 };
94 #endif
95 
96 #if defined( KOKKOS_HAVE_OPENMP )
97 // Specialization of ParallelFor<> for MPVectorWorkConfig and OpenMP
98 // The default implementation ignores the team size and uses the standard
99 // work-range implementation. In the future maybe we should try and use
100 // hyperthreads in a useful way. That would require:
101 // -- interpreting the team-size differently, rather as the sacado size
102 // -- determining the vector size of the architecture
103 // -- laying out the threads differently to use hyperthreads across the
104 // the sacado dimension
105 template< class FunctorType >
106 class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP > > :
107  public ParallelFor< FunctorType , Kokkos::RangePolicy< OpenMP > > {
108  typedef Kokkos::RangePolicy< OpenMP > Policy ;
109 public:
110  ParallelFor( const FunctorType & functor ,
111  const MPVectorWorkConfig< OpenMP > & work_config ) :
112  ParallelFor< FunctorType , Policy >( functor ,
113  Policy( 0, work_config.range ) ) {}
114 };
115 #endif
116 
117 #if defined(KOKKOS_HAVE_SERIAL)
118 // Specialization of ParallelFor<> for MPVectorWorkConfig and Serial
119 // The default implementation ignores the team size and uses the standard
120 // work-range implementation. In the future maybe we should try and use
121 // hyperthreads in a useful way. That would require:
122 // -- interpreting the team-size differently, rather as the sacado size
123 // -- determining the vector size of the architecture
124 // -- laying out the threads differently to use hyperthreads across the
125 // the sacado dimension
126 template< class FunctorType >
127 class ParallelFor< FunctorType , MPVectorWorkConfig< Serial > > :
128  public ParallelFor< FunctorType , Kokkos::RangePolicy< Serial > > {
129  typedef Kokkos::RangePolicy< Serial > Policy ;
130 public:
131  ParallelFor( const FunctorType & functor ,
132  const MPVectorWorkConfig< Serial > & work_config ) :
133  ParallelFor< FunctorType , Policy >( functor ,
134  Policy( 0, work_config.range ) ) {}
135 };
136 #endif // defined(KOKKOS_HAVE_SERIAL)
137 
138 #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
139 
140 // Specialization of ParallelFor<> for MPVectorWorkConfig on Cuda
141 // Here we use threadIdx.x for each entry in the specified team-size
142 template< class FunctorType >
143 class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda > > {
144 public:
145 
146  const FunctorType m_functor ;
147  const MPVectorWorkConfig< Cuda > m_config;
148  const Cuda::size_type m_work ;
149 
150  inline
151  __device__
152  void operator()(void) const
153  {
154  const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
155 
156  for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
157  iwork < m_work ;
158  iwork += work_stride ) {
159  m_functor( iwork , threadIdx.x );
160  }
161  }
162 
163  ParallelFor( const FunctorType & functor ,
164  const MPVectorWorkConfig< Cuda > & work_config )
165  : m_functor( functor ) ,
166  m_config( work_config ) ,
167  m_work( work_config.range )
168  {
169  }
170 
171  inline
172  void execute() const
173  {
174  // To do: query number of registers used by functor and adjust
175  // nwarp accordingly to get maximum occupancy
176 
177  Cuda::size_type nwarp = 0;
178  if (m_config.team > CudaTraits::WarpSize) {
179  const Cuda::size_type warps_per_team =
180  ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
181  nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
182  }
183  else {
184  const Cuda::size_type teams_per_warp =
185  CudaTraits::WarpSize / m_config.team ;
186  nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
187  }
188  const dim3 block( m_config.team , nwarp , 1 );
189 
190  Cuda::size_type nblock =
191  std::min( (m_work + block.y - 1 ) / block.y ,
192  cuda_internal_maximum_grid_count() );
193  const dim3 grid( nblock , 1 , 1 );
194 
195  const Cuda::size_type shared = m_config.shared;
196  CudaParallelLaunch< ParallelFor >( *this , grid , block , shared );
197  }
198 };
199 
200 #endif
201 
202 } // namespace Impl
203 
204 } // namespace Kokkos
205 
206 //----------------------------------------------------------------------------
207 //----------------------------------------------------------------------------
208 
209 #endif /* #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP */
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Team-based parallel work configuration for Sacado::MP::Vector.