Sierra Toolkit  Version of the Day
RadixSort.hpp
1 /*------------------------------------------------------------------------*/
2 /* Copyright 2010 Sandia Corporation. */
3 /* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */
4 /* license for use of this work by or on behalf of the U.S. Government. */
5 /* Export of this program may require a license from the */
6 /* United States Government. */
7 /*------------------------------------------------------------------------*/
8 
9 // Portions of this source code file are:
10 // Copyright (c) 2010, Victor J. Duvanenko.
11 // All rights reserved.
12 // Used with permission.
13 //
14 // Redistribution and use in source and binary forms, with or without
15 // modification, are permitted provided that the following conditions
16 // are met:
17 // * Redistributions of source code must retain the above copyright
18 // notice, this list of conditions and the following disclaimer.
19 // * Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 // * The name of Victor J. Duvanenko may not be used to endorse or
23 // promote products derived from this software without specific prior
24 // written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE
30 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
33 // BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
34 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
35 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
36 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 
38 #ifndef STK_UTIL_UTIL_RADIXSORT_HPP
39 #define STK_UTIL_UTIL_RADIXSORT_HPP
40 
41 //
42 // Internal implementation
43 //
44 namespace { // anonymous namespace
45 
46 // Swap that does not check for self-assignment.
47 template< class T >
48 inline void swap_impl( T& a, T& b )
49 {
50  T tmp = a;
51  a = b;
52  b = tmp;
53 }
54 
55 // insertion sort similar to STL with no self-assignment
56 template <class T>
57 inline void insertion_sort_impl( T* a, size_t a_size )
58 {
59  for ( size_t i = 1; i < a_size; ++i )
60  {
61  if ( a[ i ] < a[ i - 1 ] ) // no need to do (j > 0) compare for the first iteration
62  {
63  T currentElement = a[ i ];
64  a[ i ] = a[ i - 1 ];
65  size_t j;
66  for ( j = i - 1; j > 0 && currentElement < a[ j - 1 ]; --j )
67  {
68  a[ j ] = a[ j - 1 ];
69  }
70  a[ j ] = currentElement; // always necessary work/write
71  }
72  // Perform no work at all if the first comparison fails - i.e. never assign an element to itself!
73  }
74 }
75 
76 // Recursive implementation of radix sort for unsigned integer types only.
77 //
78 // PowerOfTwoRadix - must be a power of 2, 256 is suggested for current hardware as of 03/20/2011.
79 // Log2ofPowerOfTwoRadix - for example log( 256 ) = 8
80 // Threshold - length below which array sections are sorted with an insertion sort
81 // a - pointer to start of an array of integer type
82 // last - one less than the length of the a array
83 // bitMask - controls how many and which bits we process at a time
84 // shiftRightAmount - sizeof(T) * 8 - Log2ofPowerOfTwoRadix
85 template< class T, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
86 inline void radix_sort_unsigned_impl( T* a, long last, T bitMask, unsigned long shiftRightAmount )
87 {
88  unsigned long count[ PowerOfTwoRadix ];
89  for( unsigned long i = 0; i < PowerOfTwoRadix; ++i ) count[ i ] = 0;
90  for ( long _current = 0; _current <= last; ++_current ) count[ (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ) ]++; // Scan the array and count the number of times each value appears
91 
92  long startOfBin[ PowerOfTwoRadix + 1 ], endOfBin[ PowerOfTwoRadix ], nextBin = 1;
93  startOfBin[ 0 ] = endOfBin[ 0 ] = 0; startOfBin[ PowerOfTwoRadix ] = -1; // sentinel
94  for( unsigned long i = 1; i < PowerOfTwoRadix; ++i )
95  startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
96 
97  for ( long _current = 0; _current <= last; )
98  {
99  unsigned long digit;
100  T _current_element = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
101  while( endOfBin[ digit = (unsigned long)(( _current_element & bitMask ) >> shiftRightAmount )] != _current ) swap_impl( _current_element, a[ endOfBin[ digit ]++ ] );
102  a[ _current ] = _current_element;
103 
104  endOfBin[ digit ]++;
105  while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] ) ++nextBin; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin
106  _current = endOfBin[ nextBin - 1 ];
107  }
108  bitMask >>= Log2ofPowerOfTwoRadix;
109  if ( bitMask != 0 ) // end recursion when all the bits have been processes
110  {
111  if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
112  else shiftRightAmount = 0;
113 
114  for( unsigned long i = 0; i < PowerOfTwoRadix; ++i )
115  {
116  long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
117  if ( numberOfElements >= Threshold ) // endOfBin actually points to one beyond the bin
118  radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
119  else if ( numberOfElements >= 2 )
120  insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements );
121  }
122  }
123 }
124 
125 } // end anonymous namespace
126 
127 //
128 // Public API
129 //
130 namespace stk_classic {
131 namespace util {
132 
153 template< class T >
154 void radix_sort_unsigned( T* a, size_t a_size )
155 {
156  const long Threshold = 25; // smaller array sections sorted by insertion sort
157  const unsigned long PowerOfTwoRadix = 256; // Radix - must be a power of 2
158  const unsigned long Log2ofPowerOfTwoRadix = 8; // log( 256 ) = 8
159 
160  if ( a_size >= (size_t)Threshold ) {
161  if (a_size > (size_t)std::numeric_limits<long>::max()) {
162  std::ostringstream msg ;
163  msg << "stk_classic::utility::radix_sort() exceeded allowable array size (";
164  msg << a_size << " < " << std::numeric_limits<long>::max() << ")";
165  throw std::runtime_error( msg.str() );
166  }
167  unsigned long shiftRightAmount = sizeof(T) * 8 - Log2ofPowerOfTwoRadix;
168  T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount ); // bitMask controls/selects how many and which bits we process at a time
169  radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount );
170  }
171  else
172  insertion_sort_impl( a, a_size );
173 }
174 
175 } // namespace util
176 } // namespace stk_classic
177 
178 
179 //--------------------------------------
180 // PARALLEL RADIX SORT using TBB library
181 //--------------------------------------
182 
183 /* TODO: this code works with TBB, it just needs minor modifications and some renaming
184 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
185 class RadixInPlaceOperation_3
186 {
187  T* my_a; // a local copy to the input array to provide a pointer to each parallel task
188  long* my_startOfBin;
189  long* my_endOfBin;
190  T my_bitMask; // a local copy of the bitMask
191  unsigned long my_shiftRightAmount;
192  static const unsigned long Threshold_P = 10000; // threshold when to switch between parallel and non-parallel implementations
193 public:
194  static const unsigned long numberOfBins = PowerOfTwoRadix;
195  unsigned long my_count[ numberOfBins ]; // the count for this task
196 
197  RadixInPlaceOperation_3( T a[], long* startOfBin, long* endOfBin, T bitMask, unsigned long shiftRightAmount ) :
198  my_a( a ), my_startOfBin( startOfBin ), my_endOfBin( endOfBin ),
199  my_bitMask( bitMask ), my_shiftRightAmount( shiftRightAmount )
200  {}
201  void operator()( const blocked_range< long >& r ) const
202  {
203  for( long i = r.begin(); i != r.end(); ++i )
204  {
205  long numOfElements = my_endOfBin[ i ] - my_startOfBin[ i ];
206  if ( numOfElements >= Threshold_P )
207  _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
208  else if ( numOfElements >= Threshold )
209  _RadixSort_Unsigned_PowerOf2Radix_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
210  else if ( numOfElements >= 2 )
211  insertion_sort_impl( &my_a[ my_startOfBin[ i ]], numOfElements );
212  }
213  }
214 };
215 
216 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
217 inline void _RadixSort_Unsigned_PowerOf2Radix_3( T* a, long last, T bitMask, unsigned long shiftRightAmount )
218 {
219  const unsigned long numberOfBins = PowerOfTwoRadix;
220 
221  CountingType_1< PowerOfTwoRadix, T > count( a, bitMask, shiftRightAmount ); // contains the count array, which is initialized to all zeros
222  // Scan the array and count the number of times each value appears
223  parallel_reduce( blocked_range< unsigned long >( 0, last + 1, 1000 ), count );
224 
225  long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
226  startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
227  for( unsigned long i = 1; i < numberOfBins; i++ )
228  startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count.my_count[ i - 1 ];
229 
230  for ( long _current = 0; _current <= last; )
231  {
232  unsigned long digit;
233  T tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
234  while ( true ) {
235  digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
236  if ( endOfBin[ digit ] == _current )
237  break;
238  swap_impl( tmp, a[ endOfBin[ digit ] ] );
239  endOfBin[ digit ]++;
240  }
241  a[ _current ] = tmp;
242 
243  endOfBin[ digit ]++; // leave the element at its location and grow the bin
244  _current++; // advance the current pointer to the next element
245  while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
246  nextBin++;
247  while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
248  nextBin++;
249  if ( _current < endOfBin[ nextBin - 1 ] )
250  _current = endOfBin[ nextBin - 1 ];
251  }
252  bitMask >>= Log2ofPowerOfTwoRadix;
253  if ( bitMask != 0 ) // end recursion when all the bits have been processes
254  {
255  if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
256  else shiftRightAmount = 0;
257 
258  RadixInPlaceOperation_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, T > radixOp( a, startOfBin, endOfBin, bitMask, shiftRightAmount );
259  parallel_for( blocked_range< long >( 0, numberOfBins ), radixOp );
260  }
261 }
262 
263 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
264 inline void _RadixSort_Unsigned_PowerOf2Radix_1( T* a, long last, T bitMask, unsigned long shiftRightAmount )
265 {
266  const unsigned long numberOfBins = PowerOfTwoRadix;
267  unsigned long count[ numberOfBins ];
268 // Counting occurrence of digits within the array (related to Counting Sort)
269  for( unsigned long i = 0; i < numberOfBins; i++ )
270  count[ i ] = 0;
271 
272  for ( long _current = 0; _current <= last; _current++ ) // Scan the array and count the number of times each value appears
273  {
274  unsigned long digit = (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
275  count[ digit ]++;
276  }
277 // Moving array elements into their bins
278  long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
279  startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
280  for( unsigned long i = 1; i < numberOfBins; i++ )
281  startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
282 
283  for ( long _current = 0; _current <= last; )
284  {
285  unsigned long digit;
286  T tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
287  while ( true ) {
288  digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
289  if ( endOfBin[ digit ] == _current )
290  break;
291  swap_impl( tmp, a[ endOfBin[ digit ] ] );
292  endOfBin[ digit ]++;
293  }
294  a[ _current ] = tmp;
295 
296  endOfBin[ digit ]++; // leave the element at its location and grow the bin
297  _current++; // advance the current pointer to the next element
298  while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
299  nextBin++;
300  while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
301  nextBin++;
302  if ( _current < endOfBin[ nextBin - 1 ] )
303  _current = endOfBin[ nextBin - 1 ];
304  }
305 // Recursion for each bin
306  bitMask >>= Log2ofPowerOfTwoRadix;
307  if ( bitMask != 0 ) // end recursion when all the bits have been processes
308  {
309  if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
310  else shiftRightAmount = 0;
311 
312  for( unsigned long i = 0; i < numberOfBins; i++ )
313  {
314  long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
315  if ( numberOfElements >= Threshold ) { // endOfBin actually points to one beyond the bin
316  _RadixSort_Unsigned_PowerOf2Radix_1< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
317  }
318  else if ( numberOfElements >= 2 ) {
319  insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements );
320  }
321  }
322  }
323 }
324 
325 // Top-level template function
326 template< class T >
327 inline void RadixSortParallel( T* a, unsigned long a_size )
328 {
329  if ( a_size < 2 ) return;
330 
331  const unsigned long Threshold = 32; // Threshold of when to switch to using Insertion Sort
332  const unsigned long PowerOfTwoRadix = 256; // Radix - must be a power of 2
333  const unsigned long Log2ofPowerOfTwoRadix = 8;
334  unsigned long shiftRightAmount = sizeof( T ) * 8 - Log2ofPowerOfTwoRadix; // Create bit-mask and shift right amount
335  T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount ); // bitMask controls/selects how many and which bits we process at a time
336 
337  if ( a_size >= Threshold )
338  _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount );
339  else
340  insertion_sort_impl( a, a_size );
341 }
342 
343 */
344 
345 
346 
347 #endif /* STK_UTIL_UTIL_RADIXSORT_HPP */
Sierra Toolkit.
eastl::iterator_traits< InputIterator >::difference_type count(InputIterator first, InputIterator last, const T &value)