This article is part of a series on performance guidelines.
/*
* This software is based upon the book CUDA By Example by Sanders and Kandrot.
* This software contains source code provided by NVIDIA Corporation.
*/
/* Original Copyright notice:
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
public static class oddEven
{
private static bool IsPow2(uint i)
{
return i > 0 && i == (i & ~(i - 1));
}
public static void DoOddEvenSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc)
{
uint dir = (SortAsc ? 1U : 0);
int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize;
if (!IsPow2((uint)arrayLength))
throw new ApplicationException("Invalid array length. Must be a power of 2.");
if (!IsPow2(SHARED_SIZE_LIMIT))
throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2.");
if (arrayLength <= SHARED_SIZE_LIMIT)
throw new ApplicationException("Invalid array length. Must greater than SHARED_SIZE_LIMIT");
int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint));
if (CorrectKVPLengthUint != KVPLengthUint)
throw new ApplicationException(
string.Format("KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})",
KVPLengthUint, CorrectKVPLengthUint));
int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT;
int mergeGlobalBlockCount = arrayLength / threadCount;
int mergeGlobalThreadCount = threadCount / 2;
gpu.Launch(blockCount, threadCount, oddEvenMergeSortShared, d_kvp, dir);
for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
for (uint stride = size / 2; stride > 0; stride >>= 1)
gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount, oddEvenMergeGlobal, d_kvp, size, stride, dir);
}
public const uint SHARED_SIZE_LIMIT = 2U * threadCount;
public const int threadCount = 512;
public const uint KVPLengthUint = 2U;
[Cudafy]
public struct KVP
{
//public uint key;
public uint key, value;
[CudafyIgnore]
public override string ToString()
{
//return String.Format("{0}", key);
return String.Format("{0},{1}", key, value);
}
[CudafyIgnore]
public override bool Equals(object obj)
{
return ((KVP)obj).key == key;
}
[CudafyIgnore]
public override int GetHashCode()
{
return (int)key;
}
}
[Cudafy(eCudafyType.Device)]
public static void Comparator(
ref KVP A,
ref KVP B,
uint dir)
{
KVP tmp;
if (dir == 0 && (A.key < B.key) || dir != 0 && (A.key > B.key))
{
tmp = A;
A = B;
B = tmp;
}
}
[Cudafy]
private unsafe static void oddEvenMergeSortShared(GThread thread, KVP[] d_kvp, uint dir)
{
//Shared memory storage for current subarray
KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
fixed (KVP* s_Ptr_kvp = s_kvp)
fixed (KVP* d_Ptr_kvp = d_kvp)
{
int tid = thread.threadIdx.x;
//Offset to the beginning of subarray and load data, in coalesced batches
uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];
for (uint size = 2; size <= SHARED_SIZE_LIMIT; size <<= 1)
{
uint stride = size / 2;
uint offset = (uint)tid & (stride - 1);
{
thread.SyncThreads();
uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), dir);
stride >>= 1;
}
for (; stride > 0; stride >>= 1)
{
thread.SyncThreads();
uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
if (offset >= stride)
Comparator(ref (s_kvp[pos - stride]), ref (s_kvp[pos + 0]), dir);
}
}
thread.SyncThreads();
//store data back, in coalesced batches
for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
}
}
[Cudafy]
private unsafe static void oddEvenMergeGlobal(GThread thread, KVP[] d_kvp, uint size, uint stride, uint dir)
{
uint global_comparatorI = (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x;
uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1);
//OddEven merge
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
if (stride < size / 2)
{
uint offset = global_comparatorI & ((size / 2) - 1);
if (offset >= stride)
{
KVP A = d_kvp[pos - stride];
KVP B = d_kvp[pos + 0];
Comparator(ref A, ref B, dir);
d_kvp[pos - stride] = A;
d_kvp[pos + 0] = B;
}
}
else
{
KVP A = d_kvp[pos + 0];
KVP B = d_kvp[pos + stride];
Comparator(ref A, ref B, dir);
d_kvp[pos + 0] = A;
d_kvp[pos + stride] = B;
}
}
}