Thursday, August 8, 2013

7/7: Appendix B – CUDAfy source for Bitonic sort


This article is part of a series on performance guidelines.
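
The listing below is a CUDAfy.NET port of NVIDIA's bitonic sort sample code: a host-side driver, DoBitonicSort, plus three [Cudafy] kernels. bitonicSortShared1 sorts each SHARED_SIZE_LIMIT-sized chunk of key/value pairs in shared memory, bitonicMergeGlobal performs the large-stride merge steps in global memory, and bitonicMergeShared finishes the small strides of each merge pass back in shared memory.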


/* 
 * This software is based upon the book CUDA By Example by Sanders and Kandrot.
 * This software contains source code provided by NVIDIA Corporation.
 */

/* Original Copyright notice:
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

using System;
using System.Runtime.InteropServices;
using Cudafy;
using Cudafy.Host;

// Note: the kernels below use fixed pointers, so compile with /unsafe.
public static class bitonic
{
  private static bool IsPow2(uint i)
  {
    return i > 0 && i == (i & ~(i - 1));
  }

  public static void DoBitonicSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc)
  {
    uint dir = SortAsc ? 1U : 0U;  // 1 = ascending, 0 = descending
    int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize;

    if (!IsPow2((uint)arrayLength))
      throw new ApplicationException("Invalid array length. Must be a power of 2.");
    if (!IsPow2(SHARED_SIZE_LIMIT))
      throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2.");
    if (arrayLength <= SHARED_SIZE_LIMIT)
      throw new ApplicationException("Invalid array length. Must greater than SHARED_SIZE_LIMIT");
    int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint));
    if (CorrectKVPLengthUint != KVPLengthUint)
      throw new ApplicationException(string.Format(
        "KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})", KVPLengthUint, CorrectKVPLengthUint));

    int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT;  // one block per SHARED_SIZE_LIMIT-sized chunk
    int mergeGlobalBlockCount = 2 * blockCount;
    int mergeGlobalThreadCount = threadCount / 2;            // together: one thread per comparator (arrayLength / 2)

    // Sort each SHARED_SIZE_LIMIT-sized chunk in shared memory; even and odd
    // blocks sort in opposite directions, so adjacent chunks form bitonic sequences.
    gpu.Launch(blockCount, threadCount, bitonicSortShared1, d_kvp);

    // Bitonic merge passes over progressively larger sub-arrays.
    for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
      for (uint stride = size / 2; stride > 0; stride >>= 1)
        if (stride >= SHARED_SIZE_LIMIT)
        {
          gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount, bitonicMergeGlobal, d_kvp, size, stride, dir);
        }
        else
        {
          // Once the stride fits in shared memory, a single bitonicMergeShared
          // launch handles all remaining strides of this pass, so the stride
          // loop can stop here.
          gpu.Launch(blockCount, threadCount, bitonicMergeShared, d_kvp, size, dir);
          break;
        }
  }

  public const uint SHARED_SIZE_LIMIT = 2U * threadCount;  // key/value pairs held in shared memory per block
  public const int threadCount = 512;
  public const uint KVPLengthUint = 2U;                    // size of a KVP measured in uint's

  [Cudafy]
  public struct KVP
  {
    //public uint key;
    public uint key, value;

    [CudafyIgnore]
    public override string ToString()
    {
      //return String.Format("{0}", key);
      return String.Format("{0},{1}", key, value);
    }

    [CudafyIgnore]
    public override bool Equals(object obj)
    {
      return ((KVP)obj).key == key;
    }

    [CudafyIgnore]
    public override int GetHashCode()
    {
      return (int)key;
    }
  }

  [Cudafy(eCudafyType.Device)]
  public static void Comparator(
      ref KVP A,
      ref KVP B,
      uint dir)
  {
    KVP tmp;
    // Swap when the pair violates the requested order (dir != 0: ascending, dir == 0: descending).
    if ((dir == 0 && A.key < B.key) || (dir != 0 && A.key > B.key))
    {
      tmp = A;
      A = B;
      B = tmp;
    }
  }

  [Cudafy]
  private unsafe static void bitonicSortShared1(GThread thread, KVP[] d_kvp)
  {
    //Shared memory storage for current subarray
    KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
    fixed (KVP* s_Ptr_kvp = s_kvp)
    fixed (KVP* d_Ptr_kvp = d_kvp)
    {
      int tid = thread.threadIdx.x;

      //Offset to the beginning of subarray and load data, in coalesced batches
      uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

      for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1)
      {
        uint ddd = (tid & (size / 2)) != 0 ? 1U : 0U;

        for (uint stride = size / 2; stride > 0; stride >>= 1)
        {
          thread.SyncThreads();
          uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
          Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
        }
      }

      {
        uint ddd = (uint)thread.blockIdx.x & 1;
        for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
        {
          thread.SyncThreads();
          uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
          Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
        }
      }

      thread.SyncThreads();

      //store data back, in coalesced batches
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
    }
  }

  [Cudafy]
  private unsafe static void bitonicMergeGlobal(GThread thread, KVP[] d_kvp, uint size, uint stride, uint dir)
  {
    uint global_comparatorI = (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x;
    uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1);

    //Bitonic merge
    uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0U);
    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    KVP A = d_kvp[pos + 0];
    KVP B = d_kvp[pos + stride];

    Comparator(ref A, ref B, ddd);

    d_kvp[pos + 0] = A;
    d_kvp[pos + stride] = B;
  }

  [Cudafy]
  private unsafe static void bitonicMergeShared(GThread thread, KVP[] d_kvp, uint size, uint dir)
  {
    //Shared memory storage for current subarray
    KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);

    fixed (KVP* s_Ptr_kvp = s_kvp)
    fixed (KVP* d_Ptr_kvp = d_kvp)
    {
      int tid = thread.threadIdx.x;

      //Offset to the beginning of subarray and load data, in coalesced batches
      uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

      //Bitonic merge
      uint comparatorI = 
         ((uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)tid) & (((uint)d_kvp.Length / 2U) - 1U);
      uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0U);

      for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
      {
        thread.SyncThreads();
        uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
        Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
      }

      thread.SyncThreads();

      //store data back, in coalesced batches
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];

    }
  }
}
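
The article itself does not show how to drive DoBitonicSort, so here is a minimal, hypothetical harness, a sketch rather than part of the series: the BitonicDemo class, the 65536-element size, the fixed random seed, and the choice of eGPUType.Cuda are all my own assumptions. It translates the [Cudafy] types, copies a random key/value array to the device, sorts ascending, and verifies the result on the host.

// Hypothetical harness for the listing above - a minimal sketch, not part of
// the original article. Assumes CUDAfy.NET is referenced and a CUDA device
// is present; the array size (65536) and random seed are arbitrary choices.
using System;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;

public static class BitonicDemo
{
  public static void Main()
  {
    // Translate the [Cudafy] types to CUDA and load the module on the device.
    CudafyModule km = CudafyTranslator.Cudafy(typeof(bitonic.KVP), typeof(bitonic));
    GPGPU gpu = CudafyHost.GetDevice(eGPUType.Cuda, 0);
    gpu.LoadModule(km);

    // Build a random key/value array; the length must be a power of 2
    // and greater than SHARED_SIZE_LIMIT (1024).
    const int n = 65536;
    var h_kvp = new bitonic.KVP[n];
    var rnd = new Random(0);
    for (int i = 0; i < n; i++)
    {
      h_kvp[i].key = (uint)rnd.Next();
      h_kvp[i].value = (uint)i;  // remember the original position
    }

    // Copy to the device, sort ascending, copy back.
    bitonic.KVP[] d_kvp = gpu.CopyToDevice(h_kvp);
    bitonic.DoBitonicSort(gpu, d_kvp, true);
    gpu.CopyFromDevice(d_kvp, h_kvp);
    gpu.FreeAll();

    // Verify the keys are non-decreasing.
    for (int i = 1; i < n; i++)
      if (h_kvp[i - 1].key > h_kvp[i].key)
        throw new ApplicationException("Not sorted at index " + i);
    Console.WriteLine("Sorted OK");
  }
}

Because each value field is initialised to its element's original index, the sorted output also records where every key started, which is handy when checking that the sort really permuted the pairs rather than just the keys.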
