// Kokkos Core Kernels Package  Version of the Day
// Kokkos_MemoryPool.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 2.0
6 // Copyright (2014) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_MEMORYPOOL_HPP
45 #define KOKKOS_MEMORYPOOL_HPP
46 
47 #include <Kokkos_Core_fwd.hpp>
48 #include <Kokkos_Parallel.hpp>
49 #include <Kokkos_Atomic.hpp>
50 #include <impl/Kokkos_BitOps.hpp>
51 #include <impl/Kokkos_Error.hpp>
52 #include <impl/Kokkos_SharedAlloc.hpp>
53 
54 #include <limits>
55 #include <algorithm>
56 #include <chrono>
57 
58 // How should errors be handled? In general, production code should return a
59 // value indicating failure so the user can decide how the error is handled.
60 // While experimental, code can abort instead. If KOKKOS_MEMPOOL_PRINTERR is
61 // defined, the code will abort with an error message. Otherwise, the code will
62 // return with a value indicating failure when possible, or do nothing instead.
63 //#define KOKKOS_MEMPOOL_PRINTERR
64 
65 //#define KOKKOS_MEMPOOL_PRINT_INFO
66 //#define KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
67 //#define KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
68 //#define KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
69 //#define KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
70 //#define KOKKOS_MEMPOOL_PRINT_PAGE_INFO
71 //#define KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
72 
73 //----------------------------------------------------------------------------
74 
75 namespace Kokkos {
76 namespace Experimental {
77 
78 namespace MempoolImpl {
79 
// Parallel-for functor that fills an array of T with a single value on the
// device.  Constructing the functor launches the fill and fences, so the
// array is fully initialized by the time the constructor returns.
template < typename T, typename ExecutionSpace >
struct initialize_array {
  typedef ExecutionSpace execution_space;
  typedef typename ExecutionSpace::size_type size_type;

  T * m_data;   // Destination array (device memory, not owned).
  T m_value;    // Value written to every element.

  // Launches the parallel_for over [0, size) on *this and fences the
  // execution space before returning.
  initialize_array( T * d, size_t size, T v ) : m_data( d ), m_value( v )
  {
    Kokkos::parallel_for( size, *this );

    execution_space::fence();
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( size_type i ) const { m_data[i] = m_value; }
};
98 
// Parallel-reduce functor that counts the total number of set bits in a
// bitset's word array.  The final count is delivered through the
// caller-supplied reference passed to the constructor.
template <typename Bitset>
struct bitset_count
{
  typedef typename Bitset::execution_space execution_space;
  typedef typename execution_space::size_type size_type;
  typedef typename Bitset::size_type value_type;
  typedef typename Bitset::word_type word_type;

  word_type * m_words;    // Word array of the bitset (device memory, not owned).
  value_type & m_result;  // Reduction target owned by the caller.

  // Launches the reduction over the num_words words immediately.
  // NOTE(review): unlike initialize_array there is no explicit fence here;
  // this relies on the parallel_reduce into a host scalar completing before
  // return -- confirm against the Kokkos version in use.
  bitset_count( word_type * w, value_type num_words, value_type & r )
    : m_words( w ), m_result( r )
  {
    parallel_reduce( num_words, *this, m_result );
  }

  KOKKOS_INLINE_FUNCTION
  void init( value_type & v ) const
  { v = 0; }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & dst, volatile value_type const & src ) const
  { dst += src; }

  // Accumulate the population count of one word.
  KOKKOS_INLINE_FUNCTION
  void operator()( size_type i, value_type & count ) const
  {
    count += Kokkos::Impl::bit_count( m_words[i] );
  }
};
130 
131 template < typename Device >
132 class Bitset {
133 public:
134  typedef typename Device::execution_space execution_space;
135  typedef typename Device::memory_space memory_space;
136  typedef unsigned word_type;
137  typedef unsigned size_type;
138 
139  typedef Kokkos::Impl::DeepCopy< memory_space, Kokkos::HostSpace > raw_deep_copy;
140 
141  // Define some constants.
142  enum {
143  // Size of bitset word. Should be 32.
144  WORD_SIZE = sizeof(word_type) * CHAR_BIT,
145  LG_WORD_SIZE = Kokkos::Impl::integral_power_of_two( WORD_SIZE ),
146  WORD_MASK = WORD_SIZE - 1
147  };
148 
149 private:
150  word_type * m_words;
151  size_type m_size;
152  size_type m_num_words;
153  word_type m_last_word_mask;
154 
155 public:
156  ~Bitset() = default;
157  Bitset() = default;
158  Bitset( Bitset && ) = default;
159  Bitset( const Bitset & ) = default;
160  Bitset & operator = ( Bitset && ) = default;
161  Bitset & operator = ( const Bitset & ) = default;
162 
163  void init( void * w, size_type s )
164  {
165  // Assumption: The size of the memory pointed to by w is a multiple of
166  // sizeof(word_type).
167 
168  m_words = reinterpret_cast<word_type*>( w );
169  m_size = s;
170  m_num_words = ( s + WORD_SIZE - 1 ) >> LG_WORD_SIZE;
171  m_last_word_mask = m_size & WORD_MASK ? ( word_type(1) << ( m_size & WORD_MASK ) ) - 1 : 0;
172 
173  reset();
174  }
175 
176  size_type size() const { return m_size; }
177 
178  size_type count() const
179  {
180  size_type val = 0;
181  bitset_count< Bitset > bc( m_words, m_num_words, val );
182  return val;
183  }
184 
185  void set()
186  {
187  // Set all the bits.
188  initialize_array< word_type, execution_space > ia( m_words, m_num_words, ~word_type(0) );
189 
190  if ( m_last_word_mask ) {
191  // Clear the unused bits in the last block.
192  raw_deep_copy( m_words + ( m_num_words - 1 ), &m_last_word_mask, sizeof(word_type) );
193  }
194  }
195 
196  void reset()
197  {
198  initialize_array< word_type, execution_space > ia( m_words, m_num_words, word_type(0) );
199  }
200 
201  KOKKOS_FORCEINLINE_FUNCTION
202  bool test( size_type i ) const
203  {
204  size_type word_pos = i >> LG_WORD_SIZE;
205  word_type word = volatile_load( &m_words[ word_pos ] );
206  word_type mask = word_type(1) << ( i & WORD_MASK );
207 
208  return word & mask;
209  }
210 
211  KOKKOS_FORCEINLINE_FUNCTION
212  bool set( size_type i ) const
213  {
214  size_type word_pos = i >> LG_WORD_SIZE;
215  word_type mask = word_type(1) << ( i & WORD_MASK );
216 
217  return !( atomic_fetch_or( &m_words[ word_pos ], mask ) & mask );
218  }
219 
220  KOKKOS_FORCEINLINE_FUNCTION
221  bool reset( size_type i ) const
222  {
223  size_type word_pos = i >> LG_WORD_SIZE;
224  word_type mask = word_type(1) << ( i & WORD_MASK );
225 
226  return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask;
227  }
228 
229  KOKKOS_FORCEINLINE_FUNCTION
231  fetch_word_set( size_type i ) const
232  {
233  size_type word_pos = i >> LG_WORD_SIZE;
234  word_type mask = word_type(1) << ( i & WORD_MASK );
235 
237  result.second = atomic_fetch_or( &m_words[ word_pos ], mask );
238  result.first = !( result.second & mask );
239 
240  return result;
241  }
242 
243  KOKKOS_FORCEINLINE_FUNCTION
245  fetch_word_reset( size_type i ) const
246  {
247  size_type word_pos = i >> LG_WORD_SIZE;
248  word_type mask = word_type(1) << ( i & WORD_MASK );
249 
251  result.second = atomic_fetch_and( &m_words[ word_pos ], ~mask );
252  result.first = result.second & mask;
253 
254  return result;
255  }
256 
257  KOKKOS_FORCEINLINE_FUNCTION
259  set_any_in_word( size_type & pos ) const
260  {
261  size_type word_pos = pos >> LG_WORD_SIZE;
262  word_type word = volatile_load( &m_words[ word_pos ] );
263 
264  // Loop until there are no more unset bits in the word.
265  while ( ~word ) {
266  // Find the first unset bit in the word.
267  size_type bit = Kokkos::Impl::bit_scan_forward( ~word );
268 
269  // Try to set the bit.
270  word_type mask = word_type(1) << bit;
271  word = atomic_fetch_or( &m_words[ word_pos ], mask );
272 
273  if ( !( word & mask ) ) {
274  // Successfully set the bit.
275  pos = ( word_pos << LG_WORD_SIZE ) + bit;
276 
277  return Kokkos::pair<bool, word_type>( true, word );
278  }
279  }
280 
281  // Didn't find a free bit in this word.
282  return Kokkos::pair<bool, word_type>( false, word_type(0) );
283  }
284 
285  KOKKOS_FORCEINLINE_FUNCTION
287  set_any_in_word( size_type & pos, word_type word_mask ) const
288  {
289  size_type word_pos = pos >> LG_WORD_SIZE;
290  word_type word = volatile_load( &m_words[ word_pos ] );
291  word = ( ~word ) & word_mask;
292 
293  // Loop until there are no more unset bits in the word.
294  while ( word ) {
295  // Find the first unset bit in the word.
296  size_type bit = Kokkos::Impl::bit_scan_forward( word );
297 
298  // Try to set the bit.
299  word_type mask = word_type(1) << bit;
300  word = atomic_fetch_or( &m_words[ word_pos ], mask );
301 
302  if ( !( word & mask ) ) {
303  // Successfully set the bit.
304  pos = ( word_pos << LG_WORD_SIZE ) + bit;
305 
306  return Kokkos::pair<bool, word_type>( true, word );
307  }
308 
309  word = ( ~word ) & word_mask;
310  }
311 
312  // Didn't find a free bit in this word.
313  return Kokkos::pair<bool, word_type>( false, word_type(0) );
314  }
315 
316  KOKKOS_FORCEINLINE_FUNCTION
318  reset_any_in_word( size_type & pos ) const
319  {
320  size_type word_pos = pos >> LG_WORD_SIZE;
321  word_type word = volatile_load( &m_words[ word_pos ] );
322 
323  // Loop until there are no more set bits in the word.
324  while ( word ) {
325  // Find the first unset bit in the word.
326  size_type bit = Kokkos::Impl::bit_scan_forward( word );
327 
328  // Try to reset the bit.
329  word_type mask = word_type(1) << bit;
330  word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
331 
332  if ( word & mask ) {
333  // Successfully reset the bit.
334  pos = ( word_pos << LG_WORD_SIZE ) + bit;
335 
336  return Kokkos::pair<bool, word_type>( true, word );
337  }
338  }
339 
340  // Didn't find a free bit in this word.
341  return Kokkos::pair<bool, word_type>( false, word_type(0) );
342  }
343 
344  KOKKOS_FORCEINLINE_FUNCTION
346  reset_any_in_word( size_type & pos, word_type word_mask ) const
347  {
348  size_type word_pos = pos >> LG_WORD_SIZE;
349  word_type word = volatile_load( &m_words[ word_pos ] );
350  word = word & word_mask;
351 
352  // Loop until there are no more set bits in the word.
353  while ( word ) {
354  // Find the first unset bit in the word.
355  size_type bit = Kokkos::Impl::bit_scan_forward( word );
356 
357  // Try to reset the bit.
358  word_type mask = word_type(1) << bit;
359  word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
360 
361  if ( word & mask ) {
362  // Successfully reset the bit.
363  pos = ( word_pos << LG_WORD_SIZE ) + bit;
364 
365  return Kokkos::pair<bool, word_type>( true, word );
366  }
367 
368  word = word & word_mask;
369  }
370 
371  // Didn't find a free bit in this word.
372  return Kokkos::pair<bool, word_type>( false, word_type(0) );
373  }
374 };
375 
// Parallel-reduce functor that, for superblocks in [start, end), builds a
// histogram of per-page allocated-block counts (into m_page_histogram) and
// reduces a pair { sum of per-superblock fill fractions, total block
// capacity of non-empty superblocks }.
template < typename UInt32View, typename BSHeaderView, typename SBHeaderView,
           typename MempoolBitset >
struct create_histogram {
  typedef typename UInt32View::execution_space execution_space;
  typedef typename execution_space::size_type size_type;
  typedef Kokkos::pair< double, uint32_t > value_type;

  size_t m_start;                 // First superblock ID examined.
  UInt32View m_page_histogram;    // page-allocated-count -> number of pages.
  BSHeaderView m_blocksize_info;  // Per-block-size header info.
  SBHeaderView m_sb_header;       // Per-superblock header info.
  MempoolBitset m_sb_blocks;      // Allocation bitset for all superblocks.
  size_t m_lg_max_sb_blocks;      // Log2 of max blocks per superblock.
  uint32_t m_lg_min_block_size;   // Log2 of the minimum block size.
  uint32_t m_blocks_per_page;     // Blocks represented by one bitset word.
  value_type & m_result;          // Caller-owned reduction target.

  // Launches the reduction over [start, end) and fences so the caller can
  // read m_result and m_page_histogram as soon as construction returns.
  create_histogram( size_t start, size_t end, UInt32View ph, BSHeaderView bsi,
                    SBHeaderView sbh, MempoolBitset sbb, size_t lmsb,
                    uint32_t lmbs, uint32_t bpp, value_type & r )
    : m_start( start ), m_page_histogram( ph ), m_blocksize_info( bsi ),
      m_sb_header( sbh ), m_sb_blocks( sbb ), m_lg_max_sb_blocks( lmsb ),
      m_lg_min_block_size( lmbs ), m_blocks_per_page( bpp ), m_result( r )
  {
    Kokkos::parallel_reduce( end - start, *this, m_result );

    execution_space::fence();
  }

  KOKKOS_INLINE_FUNCTION
  void init( value_type & v ) const
  {
    v.first = 0.0;
    v.second = 0;
  }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & dst, volatile value_type const & src ) const
  {
    dst.first += src.first;
    dst.second += src.second;
  }

  // Processes one superblock (index i relative to m_start).
  KOKKOS_INLINE_FUNCTION
  void operator()( size_type i, value_type & r ) const
  {
    size_type i2 = i + m_start;

    uint32_t lg_block_size = m_sb_header(i2).m_lg_block_size;

    // A superblock only has a block size of 0 when it is empty.
    if ( lg_block_size != 0 ) {
      uint32_t block_size_id = lg_block_size - m_lg_min_block_size;
      uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
      uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;

      uint32_t total_allocated_blocks = 0;

      for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
        // Count the allocated blocks in page j of this superblock by
        // testing each block's bit in the allocation bitset.
        unsigned start_pos = ( i2 << m_lg_max_sb_blocks ) + j * m_blocks_per_page;
        unsigned end_pos = start_pos + m_blocks_per_page;
        uint32_t page_allocated_blocks = 0;

        for ( unsigned k = start_pos; k < end_pos; ++k ) {
          page_allocated_blocks += m_sb_blocks.test( k );
        }

        total_allocated_blocks += page_allocated_blocks;

        // Histogram bin update is atomic: multiple superblocks may record
        // pages with the same allocated-block count concurrently.
        atomic_increment( &m_page_histogram(page_allocated_blocks) );
      }

      r.first += double(total_allocated_blocks) / blocks_per_sb;
      r.second += blocks_per_sb;
    }
  }
};
453 
454 #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
// Parallel-for functor that records, for every non-empty superblock, the
// number of currently allocated blocks into m_num_allocated_blocks(i).
// Entries for empty superblocks are left untouched.
template < typename UInt32View, typename SBHeaderView, typename MempoolBitset >
struct count_allocated_blocks {
  typedef typename UInt32View::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  UInt32View m_num_allocated_blocks;  // Output: per-superblock allocated count.
  SBHeaderView m_sb_header;           // Per-superblock header info.
  MempoolBitset m_sb_blocks;          // Allocation bitset for all superblocks.
  size_t m_sb_size;                   // Superblock size in bytes.
  size_t m_lg_max_sb_blocks;          // Log2 of max blocks per superblock.

  // Launches the parallel_for over all superblocks and fences so the output
  // view is complete when construction returns.
  count_allocated_blocks( size_t num_sb, UInt32View nab, SBHeaderView sbh,
                          MempoolBitset sbb, size_t sbs, size_t lmsb )
    : m_num_allocated_blocks( nab ), m_sb_header( sbh ),
      m_sb_blocks( sbb ), m_sb_size( sbs ), m_lg_max_sb_blocks( lmsb )
  {
    Kokkos::parallel_for( num_sb, *this );

    execution_space::fence();
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( size_type i ) const
  {
    uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;

    // A superblock only has a block size of 0 when it is empty.
    if ( lg_block_size != 0 ) {
      // Count the allocated blocks in the superblock.
      // (The lg_block_size > 0 test is redundant inside this branch; it is
      // kept to match the guarded shift expression as written.)
      uint32_t blocks_per_sb = lg_block_size > 0 ? m_sb_size >> lg_block_size : 0;
      unsigned start_pos = i << m_lg_max_sb_blocks;
      unsigned end_pos = start_pos + blocks_per_sb;
      uint32_t count = 0;

      for ( unsigned j = start_pos; j < end_pos; ++j ) {
        count += m_sb_blocks.test( j );
      }

      m_num_allocated_blocks(i) = count;
    }
  }
};
497 #endif
498 
499 }
500 
514 template < typename Device >
515 class MemoryPool {
516 private:
517  // The allocator uses superblocks. A superblock is divided into pages, and a
518  // page is divided into blocks. A block is the chunk of memory that is given
519  // out by the allocator. A page always has a number of blocks equal to the
520  // size of the word used by the bitset. Thus, the pagesize can vary between
521  // superblocks as it is based on the block size of the superblock. The
522  // allocator supports all powers of 2 from MIN_BLOCK_SIZE to the size of a
523  // superblock as block sizes.
524 
525  // Superblocks are divided into 4 categories:
526  // 1. empty - is completely empty; there are no active allocations
527  // 2. partfull - partially full; there are some active allocations
528  // 3. full - full enough with active allocations that new allocations
529  // will likely fail
530  // 4. active - is currently the active superblock for a block size
531  //
532  // An inactive superblock is one that is empty, partfull, or full.
533  //
534  // New allocations occur only from an active superblock. If a superblock is
535  // made inactive after an allocation request is made to it but before the
536  // allocation request is fulfilled, the allocation will still be attempted
537  // from that superblock. Deallocations can occur to partfull, full, or
538  // active superblocks. Superblocks move between categories as allocations
539  // and deallocations happen. Superblocks all start empty.
540  //
541  // Here are the possible moves between categories:
542  // empty -> active During allocation, there is no active superblock
543  // or the active superblock is full.
544  // active -> full During allocation, the full threshold of the
545  // superblock is reached when increasing the fill
546  // level.
547  // full -> partfull During deallocation, the full threshold of the
548  // superblock is crossed when decreasing the fill
549  // level.
550  // partfull -> empty Deallocation of the last allocated block of an
551  // inactive superblock.
552  // partfull -> active During allocation, the active superblock is full.
553  //
554  // When a new active superblock is needed, partfull superblocks of the same
555  // block size are chosen over empty superblocks.
556  //
557  // The empty and partfull superblocks are tracked using bitsets that represent
558  // the superblocks in those repsective categories. Empty superblocks use a
559  // single bitset, while partfull superblocks use a bitset per block size
560  // (contained sequentially in a single bitset). Active superblocks are
561  // tracked by the active superblocks array. Full superblocks aren't tracked
562  // at all.
563 
564  typedef typename Device::execution_space execution_space;
565  typedef typename Device::memory_space backend_memory_space;
566  typedef Device device_type;
567  typedef MempoolImpl::Bitset< device_type > MempoolBitset;
568 
569  // Define some constants.
570  enum {
571  MIN_BLOCK_SIZE = 64,
572  LG_MIN_BLOCK_SIZE = Kokkos::Impl::integral_power_of_two( MIN_BLOCK_SIZE ),
573  MAX_BLOCK_SIZES = 31 - LG_MIN_BLOCK_SIZE + 1,
574 
575  // Size of bitset word.
576  BLOCKS_PER_PAGE = MempoolBitset::WORD_SIZE,
577  LG_BLOCKS_PER_PAGE = MempoolBitset::LG_WORD_SIZE,
578 
579  INVALID_SUPERBLOCK = ~uint32_t(0),
580  SUPERBLOCK_LOCK = ~uint32_t(0) - 1,
581 
582  MAX_TRIES = 32 // Cap on the number of pages searched
583  // before an allocation returns empty.
584  };
585 
586 public:
  // Stores information about each superblock.
  struct SuperblockHeader {
    uint32_t m_full_pages;     // Presumably the count of pages at/above the page
                               // full level -- confirm against allocate/deallocate.
    uint32_t m_empty_pages;    // Presumably the count of pages with no allocated
                               // blocks -- confirm against allocate/deallocate.
    uint32_t m_lg_block_size;  // Log2 of this superblock's block size;
                               // 0 means the superblock is empty.
    uint32_t m_is_active;      // Nonzero while this superblock is active
                               // (stored as uint32_t, initialized from bool).

    // All counters start at zero: a fresh superblock is empty and inactive.
    KOKKOS_FUNCTION
    SuperblockHeader() :
      m_full_pages(0), m_empty_pages(0), m_lg_block_size(0), m_is_active(false) {}
  };
598 
  // Stores information about each block size supported by the pool.
  struct BlockSizeHeader {
    uint32_t m_blocks_per_sb;    // Number of blocks of this size per superblock.
    uint32_t m_pages_per_sb;     // Number of bitset pages per superblock.
    uint32_t m_sb_full_level;    // Full-page count at which a superblock is
                                 // considered full (>= 1).
    uint32_t m_page_full_level;  // Allocated-block count at which a page is
                                 // considered full (>= 1).

    // Zero-initialized; real values are filled in by the MemoryPool constructor.
    KOKKOS_FUNCTION
    BlockSizeHeader() :
      m_blocks_per_sb(0), m_pages_per_sb(0), m_sb_full_level(0), m_page_full_level(0) {}
  };
610 
611 private:
612  typedef Kokkos::Impl::SharedAllocationTracker Tracker;
615 
616  // The letters 'sb' used in any variable name mean superblock.
617 
618  size_t m_lg_sb_size; // Log2 of superblock size.
619  size_t m_sb_size; // Superblock size.
620  size_t m_lg_max_sb_blocks; // Log2 of the number of blocks of the
621  // minimum block size in a superblock.
622  size_t m_num_sb; // Number of superblocks.
623  size_t m_ceil_num_sb; // Number of superblocks rounded up to the smallest
624  // multiple of the bitset word size. Used by
625  // bitsets representing superblock categories to
626  // ensure different block sizes never share a word
627  // in the bitset.
628  size_t m_num_block_size; // Number of block sizes supported.
629  size_t m_data_size; // Amount of memory available to the allocator.
630  size_t m_sb_blocks_size; // Amount of memory for free / empty blocks bitset.
631  size_t m_empty_sb_size; // Amount of memory for empty superblocks bitset.
632  size_t m_partfull_sb_size; // Amount of memory for partfull superblocks bitset.
633  size_t m_total_size; // Total amount of memory allocated.
634  char * m_data; // Beginning device memory location used for
635  // superblocks.
636  UInt32View m_active; // Active superblocks IDs.
637  SBHeaderView m_sb_header; // Header info for superblocks.
638  MempoolBitset m_sb_blocks; // Bitsets representing free / allocated status
639  // of blocks in superblocks.
640  MempoolBitset m_empty_sb; // Bitset representing empty superblocks.
641  MempoolBitset m_partfull_sb; // Bitsets representing partially full superblocks.
642  Tracker m_track; // Tracker for superblock memory.
643  BlockSizeHeader m_blocksize_info[MAX_BLOCK_SIZES]; // Header info for block sizes.
644 
645  // There were several methods tried for storing the block size header info: in a View,
646  // in a View of const data, and in a RandomAccess View. All of these were slower than
647  // storing it in a static array that is a member variable to the class. In the latter
648  // case, the block size info gets copied into the constant memory on the GPU along with
649  // the class when it is copied there for exeucting a parallel loop. Instead of storing
650  // the values, computing the values every time they were needed was also tried. This
651  // method was slightly slower than storing them in the static array.
652 
653 public:
656 
657  ~MemoryPool() = default;
658  MemoryPool() = default;
659  MemoryPool( MemoryPool && ) = default;
660  MemoryPool( const MemoryPool & ) = default;
661  MemoryPool & operator = ( MemoryPool && ) = default;
662  MemoryPool & operator = ( const MemoryPool & ) = default;
663 
  //! \brief Construct a memory pool backed by memory from \c memspace.
  //!
  //! \param memspace             Memory space the superblock memory and the
  //!                             bookkeeping bitsets are allocated in.
  //! \param total_size           Requested pool size in bytes; rounded up to
  //!                             a whole number of superblocks.
  //! \param log2_superblock_size Log2 of the superblock size
  //!                             (default 20, i.e. 1 MiB superblocks).
  //!
  //! Aborts (after printing a message) if the superblock size is too small,
  //! too large (> 2^31), or the requested pool exceeds what the unsigned
  //! bitset index type can address.
  inline
  MemoryPool( const backend_memory_space & memspace,
              size_t total_size, size_t log2_superblock_size = 20 )
    : m_lg_sb_size( log2_superblock_size ),
      m_sb_size( size_t(1) << m_lg_sb_size ),
      m_lg_max_sb_blocks( m_lg_sb_size - LG_MIN_BLOCK_SIZE ),
      m_num_sb( ( total_size + m_sb_size - 1 ) >> m_lg_sb_size ),
      // Round the superblock count up to a multiple of the bitset word size
      // so different block sizes never share a word in the category bitsets.
      m_ceil_num_sb( ( ( m_num_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE ) <<
                     LG_BLOCKS_PER_PAGE ),
      m_num_block_size( m_lg_sb_size - LG_MIN_BLOCK_SIZE + 1 ),
      m_data_size( m_num_sb * m_sb_size ),
      m_sb_blocks_size( ( m_num_sb << m_lg_max_sb_blocks ) / CHAR_BIT ),
      m_empty_sb_size( m_ceil_num_sb / CHAR_BIT ),
      m_partfull_sb_size( m_ceil_num_sb * m_num_block_size / CHAR_BIT ),
      m_total_size( m_data_size + m_sb_blocks_size + m_empty_sb_size + m_partfull_sb_size ),
      m_data(0),
      m_active( "Active superblocks" ),
      m_sb_header( "Superblock headers" ),
      m_track()
  {
    // Assumption. The minimum block size must be a power of 2.
    static_assert( Kokkos::Impl::is_integral_power_of_two( MIN_BLOCK_SIZE ), "" );

    // Assumption. Require a superblock be large enough so it takes at least 1
    // whole bitset word to represent it using the minimum blocksize.
    if ( m_sb_size < MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ) {
      printf( "\n** MemoryPool::MemoryPool() Superblock size must be >= %u **\n",
              MIN_BLOCK_SIZE * BLOCKS_PER_PAGE );
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      fflush( stdout );
#endif
      Kokkos::abort( "" );
    }

    // Assumption. A superblock's size can be at most 2^31. Verify this.
    if ( m_lg_sb_size > 31 ) {
      printf( "\n** MemoryPool::MemoryPool() Superblock size must be < %u **\n",
              ( uint32_t(1) << 31 ) );
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      fflush( stdout );
#endif
      Kokkos::abort( "" );
    }

    // Assumption. The Bitset only uses unsigned for size types which limits
    // the amount of memory the allocator can manage. Verify the memory size
    // is below this limit.
    if ( m_data_size > size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max() ) {
      printf( "\n** MemoryPool::MemoryPool() Allocator can only manage %lu bytes of memory; requested %lu **\n",
              size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max(), total_size );
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      fflush( stdout );
#endif
      Kokkos::abort( "" );
    }

    // Allocate memory for Views. This is done here instead of at construction
    // so that the runtime checks can be performed before allocating memory.
    resize( m_active, m_num_block_size );
    resize( m_sb_header, m_num_sb );

    // Allocate superblock memory.
    typedef Kokkos::Impl::SharedAllocationRecord< backend_memory_space, void > SharedRecord;
    SharedRecord * rec =
      SharedRecord::allocate( memspace, "mempool", m_total_size );

    m_track.assign_allocated_record_to_uninitialized( rec );
    m_data = reinterpret_cast<char *>( rec->data() );

    // The bookkeeping bitsets live in the same allocation, laid out after
    // the superblock data region in the order: block bitset, empty-superblock
    // bitset, partfull-superblock bitsets.

    // Set and initialize the free / empty block bitset memory.
    m_sb_blocks.init( m_data + m_data_size, m_num_sb << m_lg_max_sb_blocks );

    // Set and initialize the empty superblock block bitset memory.
    m_empty_sb.init( m_data + m_data_size + m_sb_blocks_size, m_num_sb );

    // Start with all superblocks in the empty category.
    m_empty_sb.set();

    // Set and initialize the partfull superblock block bitset memory.
    m_partfull_sb.init( m_data + m_data_size + m_sb_blocks_size + m_empty_sb_size,
                        m_ceil_num_sb * m_num_block_size );

    // Initialize all active superblocks to be invalid.
    typename UInt32View::HostMirror host_active = create_mirror_view( m_active );
    for ( size_t i = 0; i < m_num_block_size; ++i ) host_active(i) = INVALID_SUPERBLOCK;
    deep_copy( m_active, host_active );

    // A superblock is considered full when this percentage of its pages are full.
    const double superblock_full_fraction = .8;

    // A page is considered full when this percentage of its blocks are full.
    const double page_full_fraction = .875;

    // Initialize the blocksize info.
    for ( size_t i = 0; i < m_num_block_size; ++i ) {
      uint32_t lg_block_size = i + LG_MIN_BLOCK_SIZE;
      uint32_t blocks_per_sb = m_sb_size >> lg_block_size;
      uint32_t pages_per_sb = ( blocks_per_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE;

      m_blocksize_info[i].m_blocks_per_sb = blocks_per_sb;
      m_blocksize_info[i].m_pages_per_sb = pages_per_sb;

      // Set the full level for the superblock.
      m_blocksize_info[i].m_sb_full_level =
        static_cast<uint32_t>( pages_per_sb * superblock_full_fraction );

      // Full levels are clamped to at least 1 so that a superblock / page
      // with very few pages / blocks can still become full.
      if ( m_blocksize_info[i].m_sb_full_level == 0 ) {
        m_blocksize_info[i].m_sb_full_level = 1;
      }

      // Set the full level for the page.
      uint32_t blocks_per_page =
        blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE;

      m_blocksize_info[i].m_page_full_level =
        static_cast<uint32_t>( blocks_per_page * page_full_fraction );

      if ( m_blocksize_info[i].m_page_full_level == 0 ) {
        m_blocksize_info[i].m_page_full_level = 1;
      }
    }

#ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
    printf( "\n" );
    printf( "      m_lg_sb_size: %12lu\n", m_lg_sb_size );
    printf( "         m_sb_size: %12lu\n", m_sb_size );
    printf( "   m_max_sb_blocks: %12lu\n", size_t(1) << m_lg_max_sb_blocks );
    printf( "m_lg_max_sb_blocks: %12lu\n", m_lg_max_sb_blocks );
    printf( "          m_num_sb: %12lu\n", m_num_sb );
    printf( "     m_ceil_num_sb: %12lu\n", m_ceil_num_sb );
    printf( "  m_num_block_size: %12lu\n", m_num_block_size );
    printf( "        data bytes: %12lu\n", m_data_size );
    printf( "   sb_blocks bytes: %12lu\n", m_sb_blocks_size );
    printf( "    empty_sb bytes: %12lu\n", m_empty_sb_size );
    printf( " partfull_sb bytes: %12lu\n", m_partfull_sb_size );
    printf( "       total bytes: %12lu\n", m_total_size );
    printf( "   m_empty_sb size: %12u\n", m_empty_sb.size() );
    printf( "m_partfull_sb size: %12u\n", m_partfull_sb.size() );
    printf( "\n" );
    fflush( stdout );
#endif

#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
    // Print the blocksize info for all the block sizes.
    printf( "SIZE    BLOCKS_PER_SB    PAGES_PER_SB    SB_FULL_LEVEL    PAGE_FULL_LEVEL\n" );
    for ( size_t i = 0; i < m_num_block_size; ++i ) {
      printf( "%4zu    %13u    %12u    %13u    %15u\n", i + LG_MIN_BLOCK_SIZE,
              m_blocksize_info[i].m_blocks_per_sb, m_blocksize_info[i].m_pages_per_sb,
              m_blocksize_info[i].m_sb_full_level, m_blocksize_info[i].m_page_full_level );
    }
    printf( "\n" );
#endif
  }
824 
826  KOKKOS_INLINE_FUNCTION
827  size_t allocate_block_size( const size_t alloc_size ) const
828  { return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE ); }
829 
835  KOKKOS_FUNCTION
836  void * allocate( size_t alloc_size ) const
837  {
838  void * p = 0;
839 
840  // Only support allocations up to the superblock size. Just return 0
841  // (failed allocation) for any size above this.
842  if ( alloc_size <= m_sb_size )
843  {
844  int block_size_id = get_block_size_index( alloc_size );
845  uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
846  uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
847 
848 #ifdef KOKKOS_CUDA_CLANG_WORKAROUND
849  // Without this test it looks like pages_per_sb might come back wrong.
850  if ( pages_per_sb == 0 ) return NULL;
851 #endif
852 
853  unsigned word_size = blocks_per_sb > 32 ? 32 : blocks_per_sb;
854  unsigned word_mask = ( uint64_t(1) << word_size ) - 1;
855 
856  // Instead of forcing an atomic read to guarantee the updated value,
857  // reading the old value is actually beneficial because more threads will
858  // attempt allocations on the old active superblock instead of waiting on
859  // the new active superblock. This will help hide the latency of
860  // switching the active superblock.
861  uint32_t sb_id = volatile_load( &m_active(block_size_id) );
862 
863  // If the active is locked, keep reading it atomically until the lock is
864  // released.
865  while ( sb_id == SUPERBLOCK_LOCK ) {
866  sb_id = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) );
867  }
868 
869  load_fence();
870 
871  bool allocation_done = false;
872 
873  while ( !allocation_done ) {
874  bool need_new_sb = false;
875 
876  if ( sb_id != INVALID_SUPERBLOCK ) {
877  // Use the value from the clock register as the hash value.
878  uint64_t hash_val = get_clock_register();
879 
880  // Get the starting position for this superblock's bits in the bitset.
881  uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
882 
883  // Mod the hash value to choose a page in the superblock. The
884  // initial block searched is the first block of that page.
885  uint32_t pos_rel = uint32_t( hash_val & ( pages_per_sb - 1 ) ) << LG_BLOCKS_PER_PAGE;
886 
887  // Get the absolute starting position for this superblock's bits in the bitset.
888  uint32_t pos = pos_base + pos_rel;
889 
890  // Keep track of the number of pages searched. Pages in the superblock are
891  // searched linearly from the starting page. All pages in the superblock are
892  // searched until either a location is found, or it is proven empty.
893  uint32_t pages_searched = 0;
894 
895  bool search_done = false;
896 
897  while ( !search_done ) {
898  bool success = false;
899  unsigned prev_val = 0;
900 
901  Kokkos::tie( success, prev_val ) = m_sb_blocks.set_any_in_word( pos, word_mask );
902 
903  if ( !success ) {
904  if ( ++pages_searched >= pages_per_sb ) {
905  // Searched all the pages in this superblock. Look for a new superblock.
906  //
907  // The previous method tried limiting the number of pages searched, but
908  // that caused a huge performance issue in CUDA where the outer loop
909  // executed massive numbers of times. Threads weren't able to find a
910  // free location when the superblock wasn't full and were able to execute
911  // the outer loop many times before the superblock was switched for a new
912  // one. Switching to an exhaustive search eliminated this possiblity and
913  // didn't slow anything down for the tests.
914  need_new_sb = true;
915  search_done = true;
916  }
917  else {
918  // Move to the next page making sure the new search position
919  // doesn't go past this superblock's bits.
920  pos += BLOCKS_PER_PAGE;
921  pos = ( pos < pos_base + blocks_per_sb ) ? pos : pos_base;
922  }
923  }
924  else {
925  // Reserved a memory location to allocate.
926  memory_fence();
927 
928  search_done = true;
929  allocation_done = true;
930 
931  uint32_t lg_block_size = block_size_id + LG_MIN_BLOCK_SIZE;
932 
933  p = m_data + ( size_t(sb_id) << m_lg_sb_size ) +
934  ( ( pos - pos_base ) << lg_block_size );
935 
936  uint32_t used_bits = Kokkos::Impl::bit_count( prev_val );
937 
938  if ( used_bits == 0 ) {
939  // This page was empty. Decrement the number of empty pages for
940  // the superblock.
941  atomic_decrement( &m_sb_header(sb_id).m_empty_pages );
942  }
943  else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 )
944  {
945  // This page is full. Increment the number of full pages for
946  // the superblock.
947  uint32_t full_pages = atomic_fetch_add( &m_sb_header(sb_id).m_full_pages, 1 );
948 
949  // This allocation made the superblock full, so a new one needs to be found.
950  if ( full_pages == m_blocksize_info[block_size_id].m_sb_full_level - 1 ) {
951  need_new_sb = true;
952  }
953  }
954  }
955  }
956  }
957  else {
958  // This is the first allocation for this block size. A superblock needs
959  // to be set as the active one. If this point is reached any other time,
960  // it is an error.
961  need_new_sb = true;
962  }
963 
964  if ( need_new_sb ) {
965  uint32_t new_sb_id = find_superblock( block_size_id, sb_id );
966 
967  if ( new_sb_id == sb_id ) {
968  allocation_done = true;
969 #ifdef KOKKOS_MEMPOOL_PRINT_INFO
970  printf( "** No superblocks available. **\n" );
971 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
972  fflush( stdout );
973 #endif
974 #endif
975  }
976  else {
977  sb_id = new_sb_id;
978  }
979  }
980  }
981  }
982 #ifdef KOKKOS_MEMPOOL_PRINT_INFO
983  else {
984  printf( "** Requested allocation size (%zu) larger than superblock size (%lu). **\n",
985  alloc_size, m_sb_size );
986 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
987  fflush( stdout );
988 #endif
989  }
990 #endif
991 
992  return p;
993  }
994 
  // Release a chunk previously returned by allocate() back to the pool.
  //
  //   alloc_ptr:  address returned by allocate()
  //   alloc_size: size (in bytes) that was requested from allocate()
  //
  // Addresses outside the pool's managed range are ignored (or reported when
  // KOKKOS_MEMPOOL_PRINTERR is defined). A double-deallocation is detected by
  // the bitset reset failing and is silently ignored.
  KOKKOS_FUNCTION
  void deallocate( void * alloc_ptr, size_t alloc_size ) const
  {
    char * ap = static_cast<char *>( alloc_ptr );

    // Only deallocate memory controlled by this pool.
    if ( ap >= m_data && ap + alloc_size <= m_data + m_data_size ) {
      // Get the superblock for the address. This can be calculated by math on
      // the address since the superblocks are stored contiguously in one memory
      // chunk.
      uint32_t sb_id = ( ap - m_data ) >> m_lg_sb_size;

      // Get the starting position for this superblock's bits in the bitset.
      uint32_t pos_base = sb_id << m_lg_max_sb_blocks;

      // Get the relative position for this memory location's bit in the bitset.
      // The block size is read from the superblock header, not derived from
      // alloc_size, so the size argument is only used for the range check above.
      uint32_t offset = ( ap - m_data ) - ( size_t(sb_id) << m_lg_sb_size );
      uint32_t lg_block_size = m_sb_header(sb_id).m_lg_block_size;
      uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
      uint32_t pos_rel = offset >> lg_block_size;

      bool success = false;
      unsigned prev_val = 0;

      // Order all prior writes to the block's memory before the bit marking it
      // free becomes observable to other threads.
      memory_fence();

      Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel );

      // If the memory location was previously deallocated, do nothing.
      if ( success ) {
        // prev_val is the page's bitset word before the reset, so its popcount
        // is the page's fill level prior to this deallocation.
        uint32_t page_fill_level = Kokkos::Impl::bit_count( prev_val );

        if ( page_fill_level == 1 ) {
          // This page is now empty. Increment the number of empty pages for the
          // superblock.
          uint32_t empty_pages = atomic_fetch_add( &m_sb_header(sb_id).m_empty_pages, 1 );

          // Only retire a superblock that is not the active one for its block
          // size; the active superblock stays available for allocations.
          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
               empty_pages == m_blocksize_info[block_size_id].m_pages_per_sb - 1 )
          {
            // This deallocation caused the superblock to be empty. Change the
            // superblock category from partially full to empty.
            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;

            if ( m_partfull_sb.reset( pos ) ) {
              // Reset the empty pages and block size for the superblock.
              volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) );
              volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) );

              store_fence();

              m_empty_sb.set( sb_id );
            }
          }
        }
        else if ( page_fill_level == m_blocksize_info[block_size_id].m_page_full_level ) {
          // This page is no longer full. Decrement the number of full pages for
          // the superblock.
          uint32_t full_pages = atomic_fetch_sub( &m_sb_header(sb_id).m_full_pages, 1 );

          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
               full_pages == m_blocksize_info[block_size_id].m_sb_full_level )
          {
            // This deallocation caused the number of full pages to decrease below
            // the full threshold. Change the superblock category from full to
            // partially full.
            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
            m_partfull_sb.set( pos );
          }
        }
      }
    }
#ifdef KOKKOS_MEMPOOL_PRINTERR
    else {
      printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n",
              reinterpret_cast<uint64_t>( alloc_ptr ) );
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      fflush( stdout );
#endif
    }
#endif
  }
1081 
1083  KOKKOS_INLINE_FUNCTION
1084  bool is_empty() const
1085  {
1086  // The allocator is empty if all superblocks are full. A superblock is
1087  // full if it has >= 80% of its pages allocated.
1088 
1089  // Look at all the superblocks. If one is not full, then the allocator
1090  // isn't empty.
1091  for ( size_t i = 0; i < m_num_sb; ++i ) {
1092  uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
1093 
1094  // A superblock only has a block size of 0 when it is empty.
1095  if ( lg_block_size == 0 ) return false;
1096 
1097  uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
1098  uint32_t full_pages = volatile_load( &m_sb_header(i).m_full_pages );
1099 
1100  if ( full_pages < m_blocksize_info[block_size_id].m_sb_full_level ) return false;
1101  }
1102 
1103  // All the superblocks were full. The allocator is empty.
1104  return true;
1105  }
1106 
1107  // The following functions are used for debugging.
  // Print a human-readable status summary of the pool to stdout. Host-side
  // debugging helper; the level of detail is controlled by the
  // KOKKOS_MEMPOOL_PRINT_* macros.
  void print_status() const
  {
    printf( "\n" );

#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
    // Pull the superblock headers to the host for printing.
    typename SBHeaderView::HostMirror host_sb_header = create_mirror_view( m_sb_header );
    deep_copy( host_sb_header, m_sb_header );

    UInt32View num_allocated_blocks( "Allocated Blocks", m_num_sb );

    // Count the number of allocated blocks per superblock.
    {
      MempoolImpl::count_allocated_blocks< UInt32View, SBHeaderView, MempoolBitset >
        mch( m_num_sb, num_allocated_blocks, m_sb_header,
             m_sb_blocks, m_sb_size, m_lg_max_sb_blocks );
    }

    typename UInt32View::HostMirror host_num_allocated_blocks =
      create_mirror_view( num_allocated_blocks );
    deep_copy( host_num_allocated_blocks, num_allocated_blocks );

    // Print header info of all superblocks.
    printf( "SB_ID SIZE ACTIVE EMPTY_PAGES FULL_PAGES USED_BLOCKS\n" );
    for ( size_t i = 0; i < m_num_sb; ++i ) {
      printf( "%5zu %4u %6d %11u %10u %10u\n", i,
              host_sb_header(i).m_lg_block_size, host_sb_header(i).m_is_active,
              host_sb_header(i).m_empty_pages, host_sb_header(i).m_full_pages,
              host_num_allocated_blocks(i) );
    }

    printf( "\n" );
#endif

    // Histogram slot i counts pages with i allocated blocks (0..32).
    UInt32View page_histogram( "Page Histogram", 33 );

    // Get a View version of the blocksize info.
    typedef View< BlockSizeHeader *, device_type > BSHeaderView;
    BSHeaderView blocksize_info( "BlockSize Headers", MAX_BLOCK_SIZES );

    Kokkos::Impl::DeepCopy< backend_memory_space, Kokkos::HostSpace >
      dc( blocksize_info.ptr_on_device(), m_blocksize_info,
          sizeof(BlockSizeHeader) * m_num_block_size );

    // NOTE(review): 'result' used below is declared on a source line not
    // visible in this excerpt; from its uses ('result.second' is the total
    // block count, 'result.first' feeds the average-fullness ratio) it is
    // presumably a Kokkos::pair accumulated by create_histogram -- confirm
    // against the full file.

    // Create the page histogram.
    {
      MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
        mch( 0, m_num_sb, page_histogram, blocksize_info, m_sb_header, m_sb_blocks,
             m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
    }

    typename UInt32View::HostMirror host_page_histogram = create_mirror_view( page_histogram );
    deep_copy( host_page_histogram, page_histogram );

    // Find the used and total pages and blocks. Slot 0 holds fully-free
    // pages, so the used sums start at i = 1.
    uint32_t used_pages = 0;
    uint32_t used_blocks = 0;
    for ( uint32_t i = 1; i < 33; ++i ) {
      used_pages += host_page_histogram(i);
      used_blocks += i * host_page_histogram(i);
    }
    uint32_t total_pages = used_pages + host_page_histogram(0);

    unsigned num_empty_sb = m_empty_sb.count();
    unsigned num_non_empty_sb = m_num_sb - num_empty_sb;
    unsigned num_partfull_sb = m_partfull_sb.count();

    uint32_t total_blocks = result.second;
    double ave_sb_full = num_non_empty_sb == 0 ? 0.0 : result.first / num_non_empty_sb;
    double percent_used_sb = double( m_num_sb - num_empty_sb ) / m_num_sb;
    double percent_used_pages = total_pages == 0 ? 0.0 : double(used_pages) / total_pages;
    double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks;

    // Count active superblocks.
    typename UInt32View::HostMirror host_active = create_mirror_view( m_active );
    deep_copy( host_active, m_active );

    unsigned num_active_sb = 0;
    for ( size_t i = 0; i < m_num_block_size; ++i ) {
      num_active_sb += host_active(i) != INVALID_SUPERBLOCK;
    }

#ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
    // Print active superblocks. 'I' marks an invalid (unset) entry and 'L' a
    // locked entry mid-switch.
    printf( "BS_ID SB_ID\n" );
    for ( size_t i = 0; i < m_num_block_size; ++i ) {
      uint32_t sb_id = host_active(i);

      if ( sb_id == INVALID_SUPERBLOCK ) {
        printf( "%5zu I\n", i );
      }
      else if ( sb_id == SUPERBLOCK_LOCK ) {
        printf( "%5zu L\n", i );
      }
      else {
        printf( "%5zu %7u\n", i, sb_id );
      }
    }
    printf( "\n" );
    fflush( stdout );
#endif

#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
    // Print the summary page histogram.
    printf( "USED_BLOCKS PAGE_COUNT\n" );
    for ( uint32_t i = 0; i < 33; ++i ) {
      printf( "%10u %10u\n", i, host_page_histogram[i] );
    }
    printf( "\n" );
#endif

#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
    // Print the page histogram for a few individual superblocks.
//  const uint32_t num_sb_id = 2;
//  uint32_t sb_id[num_sb_id] = { 0, 10 };
    const uint32_t num_sb_id = 1;
    uint32_t sb_id[num_sb_id] = { 0 };

    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
      deep_copy( page_histogram, 0 );

      {
        MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
          mch( sb_id[i], sb_id[i] + 1, page_histogram, blocksize_info, m_sb_header,
               m_sb_blocks, m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
      }

      deep_copy( host_page_histogram, page_histogram );

      printf( "SB_ID USED_BLOCKS PAGE_COUNT\n" );
      for ( uint32_t j = 0; j < 33; ++j ) {
        printf( "%5u %10u %10u\n", sb_id[i], j, host_page_histogram[j] );
      }
      printf( "\n" );
    }

/*
    // Print the blocks used for each page of a few individual superblocks.
    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
      uint32_t lg_block_size = host_sb_header(sb_id[i]).m_lg_block_size;

      if ( lg_block_size != 0 ) {
        printf( "SB_ID BLOCK ID USED_BLOCKS\n" );

        uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
        uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;

        for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
          unsigned start_pos = ( sb_id[i] << m_lg_max_sb_blocks ) + j * BLOCKS_PER_PAGE;
          unsigned end_pos = start_pos + BLOCKS_PER_PAGE;
          uint32_t num_allocated_blocks = 0;

          for ( unsigned k = start_pos; k < end_pos; ++k ) {
            num_allocated_blocks += m_sb_blocks.test( k );
          }

          printf( "%5u %8u %11u\n", sb_id[i], j, num_allocated_blocks );
        }

        printf( "\n" );
      }
    }
*/
#endif

    printf( " Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks,
            percent_used_blocks );
    printf( " Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages,
            percent_used_pages );
    printf( " Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb,
            percent_used_sb );
    printf( " Active SB: %10u\n", num_active_sb );
    printf( " Empty SB: %10u\n", num_empty_sb );
    printf( " Partfull SB: %10u\n", num_partfull_sb );
    printf( " Full SB: %10lu\n",
            m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb );
    printf( "Ave. SB Full %%: %10.6lf\n", ave_sb_full );
    printf( "\n" );
    fflush( stdout );

#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
    fflush( stdout );
#endif
  }
1293 
  // Smallest allocation size (in bytes) the pool supports; requests below
  // this are rounded up to it.
  KOKKOS_INLINE_FUNCTION
  size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
1296 
  // Total size (in bytes) of the memory region managed by this pool.
  size_t get_mem_size() const { return m_data_size; }
1298 
1299 private:
1304  KOKKOS_FORCEINLINE_FUNCTION
1305  int get_block_size_index( const size_t size ) const
1306  {
1307  // We know the size fits in a 32 bit unsigned because the size of a
1308  // superblock is limited to 2^31, so casting to an unsigned is safe.
1309 
1310  // Find the most significant nonzero bit.
1311  uint32_t first_nonzero_bit =
1312  Kokkos::Impl::bit_scan_reverse( static_cast<unsigned>( size ) );
1313 
1314  // If size is an integral power of 2, ceil( log2(size) ) is equal to the
1315  // most significant nonzero bit. Otherwise, you need to add 1. Since the
1316  // minimum block size is MIN_BLOCK_SIZE, make sure ceil( log2(size) ) is at
1317  // least LG_MIN_BLOCK_SIZE.
1318  uint32_t lg2_size = first_nonzero_bit + !Kokkos::Impl::is_integral_power_of_two( size );
1319  lg2_size = lg2_size > LG_MIN_BLOCK_SIZE ? lg2_size : LG_MIN_BLOCK_SIZE;
1320 
1321  // Return ceil( log2(size) ) shifted so that the value for MIN_BLOCK_SIZE
1322  // is 0.
1323  return lg2_size - LG_MIN_BLOCK_SIZE;
1324  }
1325 
  // Find a replacement active superblock for the given block size.
  //
  //   block_size_id: index of the block size whose active superblock is full
  //   old_sb:        the active superblock id this thread last observed
  //                  (possibly INVALID_SUPERBLOCK on the first allocation)
  //
  // Returns the new active superblock id. If the returned id equals old_sb,
  // no replacement was available (the pool is exhausted for this block size).
  //
  // Exactly one thread wins the CAS lock below and performs the search; all
  // other callers spin until the winner publishes the new active superblock.
  KOKKOS_FUNCTION
  uint32_t find_superblock( int block_size_id, uint32_t old_sb ) const
  {
    // Try to grab the lock on the head.
    uint32_t lock_sb =
      Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK );

    load_fence();

    // Initialize the new superblock to be the previous one so the previous
    // superblock is returned if a new superblock can't be found.
    uint32_t new_sb = lock_sb;

    if ( lock_sb == old_sb ) {
      // This thread has the lock.

      // 1. Look for a partially filled superblock that is of the right block
      //    size.

      // m_partfull_sb is searched one bitset word at a time; max_tries is the
      // number of words covering this block size's region.
      size_t max_tries = m_ceil_num_sb >> LG_BLOCKS_PER_PAGE;
      size_t tries = 0;
      bool search_done = false;

      // Set the starting search position to the beginning of this block
      // size's bitset.
      unsigned pos = block_size_id * m_ceil_num_sb;

      while ( !search_done ) {
        bool success = false;
        unsigned prev_val = 0;

        Kokkos::tie( success, prev_val ) = m_partfull_sb.reset_any_in_word( pos );

        if ( !success ) {
          if ( ++tries >= max_tries ) {
            // Exceeded number of words for this block size's bitset.
            search_done = true;
          }
          else {
            pos += BLOCKS_PER_PAGE;
          }
        }
        else {
          // Found a superblock.

          // It is possible that the newly found superblock is the same as the
          // old superblock. In this case putting the old value back in yields
          // correct behavior. This could happen as follows. This thread
          // grabs the lock and transitions the superblock to the full state.
          // Before it searches for a new superblock, other threads perform
          // enough deallocations to transition the superblock to the partially
          // full state. This thread then searches for a partially full
          // superblock and finds the one it removed. There's potential for
          // this to cause a performance issue if the same superblock keeps
          // being removed and added due to the right mix and ordering of
          // allocations and deallocations.
          search_done = true;
          new_sb = pos - block_size_id * m_ceil_num_sb;

          // Set the head status for the superblock.
          volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );

          // If there was a previous active superblock, mark it as not active.
          // It is now in the full category and as such isn't tracked.
          if ( lock_sb != INVALID_SUPERBLOCK ) {
            volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
          }

          store_fence();
        }
      }

      // 2. Look for an empty superblock. Only reached when no partially full
      //    superblock of this block size was found above.
      if ( new_sb == lock_sb ) {
        tries = 0;
        search_done = false;

        // Set the starting search position to the beginning of this block
        // size's bitset.
        pos = 0;

        while ( !search_done ) {
          bool success = false;
          unsigned prev_val = 0;

          Kokkos::tie( success, prev_val ) = m_empty_sb.reset_any_in_word( pos );

          if ( !success ) {
            if ( ++tries >= max_tries ) {
              // Exceeded number of words for this block size's bitset.
              search_done = true;
            }
            else {
              pos += BLOCKS_PER_PAGE;
            }
          }
          else {
            // Found a superblock.

            // It is possible that the newly found superblock is the same as
            // the old superblock. In this case putting the old value back in
            // yields correct behavior. This could happen as follows. This
            // thread grabs the lock and transitions the superblock to the full
            // state. Before it searches for a new superblock, other threads
            // perform enough deallocations to transition the superblock to the
            // partially full state and then the empty state. This thread then
            // searches for a partially full superblock and none exist. This
            // thread then searches for an empty superblock and finds the one
            // it removed. The likelihood of this happening is so remote that
            // the potential for this to cause a performance issue is
            // infinitesimal.
            search_done = true;
            new_sb = pos;

            // Set the empty pages, block size, and head status for the
            // superblock.
            volatile_store( &m_sb_header(new_sb).m_empty_pages,
                            m_blocksize_info[block_size_id].m_pages_per_sb );
            volatile_store( &m_sb_header(new_sb).m_lg_block_size,
                            block_size_id + LG_MIN_BLOCK_SIZE );
            volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );

            // If there was a previous active superblock, mark it as not active.
            // It is now in the full category and as such isn't tracked.
            if ( lock_sb != INVALID_SUPERBLOCK ) {
              volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
            }

            store_fence();
          }
        }
      }

      // Write the new active superblock to release the lock.
      atomic_exchange( &m_active(block_size_id), new_sb );
    }
    else {
      // Either another thread has the lock and is switching the active
      // superblock for this block size or another thread has already changed
      // the active superblock since this thread read its value. Keep
      // atomically reading the active superblock until it isn't locked to get
      // the new active superblock.
      do {
        new_sb = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) );
      } while ( new_sb == SUPERBLOCK_LOCK );

      load_fence();

      // Assertions:
      //   1. An invalid superblock should never be found here.
      //   2. If the new superblock is the same as the previous superblock, the
      //      allocator is empty.
#ifdef KOKKOS_MEMPOOL_PRINTERR
      if ( new_sb == INVALID_SUPERBLOCK ) {
        printf( "\n** MemoryPool::find_superblock() FOUND_INACTIVE_SUPERBLOCK **\n" );
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
        fflush( stdout );
#endif
        Kokkos::abort( "" );
      }
#endif
    }

    return new_sb;
  }
1500 
  // Read a fast hardware tick counter. The value is used only as a cheap
  // hash seed for spreading threads across pages in allocate(), so neither
  // the unit nor consistency across platforms matters.
  KOKKOS_FORCEINLINE_FUNCTION
  uint64_t get_clock_register(void) const
  {
#if defined( __CUDA_ARCH__ )
    // Return value of 64-bit hi-res clock register.
    return clock64();
#elif defined( __i386__ ) || defined( __x86_64 )
    // Return value of 64-bit hi-res clock register.
    unsigned a = 0, d = 0;

    // rdtsc returns the low half of the timestamp counter in EAX ("=a") and
    // the high half in EDX ("=d").
    __asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );

    return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
#elif defined( __powerpc ) || defined( __powerpc__ ) || defined( __powerpc64__ ) || \
      defined( __POWERPC__ ) || defined( __ppc__ ) || defined( __ppc64__ )
    unsigned int cycles = 0;

    // mftb reads the (lower 32 bits of the) PowerPC time base register.
    asm volatile( "mftb %0" : "=r" (cycles) );

    return (uint64_t) cycles;
#else
    // Portable fallback: ticks of the C++11 high-resolution clock.
    const uint64_t ticks =
      std::chrono::high_resolution_clock::now().time_since_epoch().count();

    return ticks;
#endif
  }
1529 };
1530 
1531 } // namespace Experimental
1532 } // namespace Kokkos
1533 
1534 #ifdef KOKKOS_MEMPOOL_PRINTERR
1535 #undef KOKKOS_MEMPOOL_PRINTERR
1536 #endif
1537 
1538 #ifdef KOKKOS_MEMPOOL_PRINT_INFO
1539 #undef KOKKOS_MEMPOOL_PRINT_INFO
1540 #endif
1541 
1542 #ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
1543 #undef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
1544 #endif
1545 
1546 #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
1547 #undef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
1548 #endif
1549 
1550 #ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
1551 #undef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
1552 #endif
1553 
1554 #ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
1555 #undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
1556 #endif
1557 
1558 #endif // KOKKOS_MEMORYPOOL_HPP
std::enable_if< std::is_same< typename Kokkos::View< T, P... >::array_layout, Kokkos::LayoutLeft >::value||std::is_same< typename Kokkos::View< T, P... >::array_layout, Kokkos::LayoutRight >::value >::type resize(Kokkos::View< T, P... > &v, const size_t n0=0, const size_t n1=0, const size_t n2=0, const size_t n3=0, const size_t n4=0, const size_t n5=0, const size_t n6=0, const size_t n7=0)
Resize a view with copying old data to new data at the corresponding indices.
Bitset based memory manager for pools of same-sized chunks of memory.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Replacement for std::pair that works on CUDA devices.
Definition: Kokkos_Pair.hpp:64
KOKKOS_FUNCTION void deallocate(void *alloc_ptr, size_t alloc_size) const
Release allocated memory back to the pool.
first_type first
The first element of the pair.
Definition: Kokkos_Pair.hpp:72
Memory space for main process and CPU execution spaces.
MemoryPool memory_space
Tag this class as a kokkos memory space.
Declaration of parallel operators.
KOKKOS_INLINE_FUNCTION size_t allocate_block_size(const size_t alloc_size) const
The actual block size allocated given alloc_size.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
KOKKOS_FUNCTION void * allocate(size_t alloc_size) const
Allocate a chunk of memory.
Atomic functions.
void deep_copy(const View< DT, DP... > &dst, typename ViewTraits< DT, DP... >::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP... >::specialize, void >::value >::type *=0)
Deep copy a value from Host memory into a view.
KOKKOS_INLINE_FUNCTION bool is_empty() const
Tests if the memory pool has no more memory available to allocate.
KOKKOS_FORCEINLINE_FUNCTION pair< T1 &, T2 & > tie(T1 &x, T2 &y)
Return a pair of references to the input arguments.
View< typename traits::non_const_data_type, typename traits::array_layout, typename traits::host_mirror_space > HostMirror
Compatible HostMirror view.
second_type second
The second element of the pair.
Definition: Kokkos_Pair.hpp:74
MemoryPool(const backend_memory_space &memspace, size_t total_size, size_t log2_superblock_size=20)
Initializes the memory pool.