Performance guidelines: 6/7: Apendix A – CUDAfy source for Odd-Even sort

This article is part of a series on performance guidelines.

/* 
 * This software is based upon the book CUDA By Example by Sanders and Kandrot.
 * This software contains source code provided by NVIDIA Corporation.
*/

/* Original Copyright notice:
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

public static class oddEven
{
  private static bool IsPow2(uint i)
  {
    return i > 0 && i == (i & ~(i - 1));
  }

  public static void DoOddEvenSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc)
  {
    uint dir = (SortAsc ? 1U : 0);
    int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize;

    if (!IsPow2((uint)arrayLength))
      throw new ApplicationException("Invalid array length. Must be a power of 2.");
    if (!IsPow2(SHARED_SIZE_LIMIT))
      throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2.");
    if (arrayLength <= SHARED_SIZE_LIMIT)
      throw new ApplicationException("Invalid array length. Must greater than SHARED_SIZE_LIMIT");
    int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint));
    if (CorrectKVPLengthUint != KVPLengthUint)
      throw new ApplicationException(
          string.Format("KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})",
          KVPLengthUint, CorrectKVPLengthUint));

    int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT;
    int mergeGlobalBlockCount = arrayLength / threadCount;
    int mergeGlobalThreadCount = threadCount / 2;

    gpu.Launch(blockCount, threadCount, oddEvenMergeSortShared, d_kvp, dir);

    for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
      for (uint stride = size / 2; stride > 0; stride >>= 1)
        gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount, oddEvenMergeGlobal, d_kvp, size, stride, dir);
  }

  public const uint SHARED_SIZE_LIMIT = 2U * threadCount;
  public const int threadCount = 512;
  public const uint KVPLengthUint = 2U;

  [Cudafy]
  public struct KVP
  {
    //public uint key;
    public uint key, value;

    [CudafyIgnore]
    public override string ToString()
    {
      //return String.Format("{0}", key);
      return String.Format("{0},{1}", key, value);
    }

    [CudafyIgnore]
    public override bool Equals(object obj)
    {
      return ((KVP)obj).key == key;
    }

    [CudafyIgnore]
    public override int GetHashCode()
    {
      return (int)key;
    }
  }

  [Cudafy(eCudafyType.Device)]
  public static void Comparator(
      ref KVP A,
      ref KVP B,
      uint dir)
  {
    KVP tmp;
    if (dir == 0 && (A.key < B.key) || dir != 0 && (A.key > B.key))
    {
      tmp = A;
      A = B;
      B = tmp;
    }
  }

  [Cudafy]
  private unsafe static void oddEvenMergeSortShared(GThread thread, KVP[] d_kvp, uint dir)
  {
    //Shared memory storage for current subarray
    KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
    fixed (KVP* s_Ptr_kvp = s_kvp)
    fixed (KVP* d_Ptr_kvp = d_kvp)
    {
      int tid = thread.threadIdx.x;

      //Offset to the beginning of subarray and load data, in coalesced batches
      uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

      for (uint size = 2; size <= SHARED_SIZE_LIMIT; size <<= 1)
      {
        uint stride = size / 2;
        uint offset = (uint)tid & (stride - 1);

        {
          thread.SyncThreads();
          uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
          Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), dir);
          stride >>= 1;
        }

        for (; stride > 0; stride >>= 1)
        {
          thread.SyncThreads();
          uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
          if (offset >= stride)
            Comparator(ref (s_kvp[pos - stride]), ref (s_kvp[pos + 0]), dir);
        }
      }

      thread.SyncThreads();

      //store data back, in coalesced batches
      for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
        d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
    }
  }

  [Cudafy]
  private unsafe static void oddEvenMergeGlobal(GThread thread, KVP[] d_kvp, uint size, uint stride, uint dir)
  {
    uint global_comparatorI = (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x;
    uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1);

    //OddEven merge
    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    if (stride < size / 2)
    {
      uint offset = global_comparatorI & ((size / 2) - 1);
      if (offset >= stride)
      {
        KVP A = d_kvp[pos - stride];
        KVP B = d_kvp[pos + 0];

        Comparator(ref A, ref B, dir);

        d_kvp[pos - stride] = A;
        d_kvp[pos + 0] = B;
      }
    }
    else
    {
      KVP A = d_kvp[pos + 0];
      KVP B = d_kvp[pos + stride];

      Comparator(ref A, ref B, dir);

      d_kvp[pos + 0] = A;
      d_kvp[pos + stride] = B;
    }
  }
}
Performance guidelines

About Me

Thursday, August 8, 2013

6/7: Apendix A – CUDAfy source for Odd-Even sort

This article is part of a series on performance guidelines.