This article is part of a series on performance guidelines.
/*
* This software is based upon the book CUDA By Example by Sanders and Kandrot.
* This software contains source code provided by NVIDIA Corporation.
*/
/* Original Copyright notice:
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
public static class bitonic
{
    /// <summary>Threads per GPU block used by every kernel in this class.</summary>
    public const int threadCount = 512;

    /// <summary>
    /// Number of <see cref="KVP"/> elements a single block sorts in shared memory
    /// (two elements per thread). Must be a power of two.
    /// </summary>
    public const uint SHARED_SIZE_LIMIT = 2U * threadCount;

    /// <summary>
    /// Expected size of <see cref="KVP"/> expressed in 32-bit uints. The host entry
    /// point validates this against <c>Marshal.SizeOf</c> so the coalesced uint-wise
    /// copies in the kernels cover the struct exactly.
    /// </summary>
    public const uint KVPLengthUint = 2U;

    /// <summary>Returns true when <paramref name="i"/> is a non-zero power of two.</summary>
    private static bool IsPow2(uint i)
    {
        // Clearing every bit below the highest set bit leaves the value unchanged
        // only when exactly one bit is set.
        return i > 0 && i == (i & ~(i - 1));
    }

    /// <summary>
    /// Sorts the device array <paramref name="d_kvp"/> by key using a bitonic sorting
    /// network: a shared-memory pre-sort per block followed by global/shared merge passes.
    /// </summary>
    /// <param name="gpu">CUDAfy GPU wrapper used to launch the kernels.</param>
    /// <param name="d_kvp">Device-resident key/value pairs; length must be a power of two
    /// and strictly greater than <see cref="SHARED_SIZE_LIMIT"/>.</param>
    /// <param name="SortAsc">True to sort keys ascending, false for descending.</param>
    /// <exception cref="ApplicationException">Thrown when a length or layout precondition fails.</exception>
    public static void DoBitonicSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc)
    {
        uint dir = (SortAsc ? 1U : 0);
        int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize;
        if (!IsPow2((uint)arrayLength))
            throw new ApplicationException("Invalid array length. Must be a power of 2.");
        if (!IsPow2(SHARED_SIZE_LIMIT))
            throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2.");
        if (arrayLength <= SHARED_SIZE_LIMIT)
            throw new ApplicationException("Invalid array length. Must be greater than SHARED_SIZE_LIMIT.");

        // Guard against KVP's managed layout drifting away from the hard-coded
        // uint-wise copies performed inside the kernels.
        int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint));
        if (CorrectKVPLengthUint != KVPLengthUint)
            throw new ApplicationException(string.Format(
                "KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})", KVPLengthUint, CorrectKVPLengthUint));

        int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT;
        // The global merge kernel processes one comparator (element pair) per thread,
        // so it runs with twice the blocks at half the threads.
        int mergeGlobalBlockCount = 2 * blockCount;
        int mergeGlobalThreadCount = threadCount / 2;

        // Phase 1: each block sorts its SHARED_SIZE_LIMIT-sized subarray in shared
        // memory, with alternating directions per block (see bitonicSortShared1).
        gpu.Launch(blockCount, threadCount, bitonicSortShared1, d_kvp);

        // Phase 2: merge progressively larger bitonic sequences. Strides too large
        // for one block's shared memory are merged in global memory; once the stride
        // fits, a single shared-memory kernel finishes all remaining strides for
        // this size (hence the break).
        for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
            for (uint stride = size / 2; stride > 0; stride >>= 1)
                if (stride >= SHARED_SIZE_LIMIT)
                {
                    gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount, bitonicMergeGlobal, d_kvp, size, stride, dir);
                }
                else
                {
                    gpu.Launch(blockCount, threadCount, bitonicMergeShared, d_kvp, size, dir);
                    break;
                }
    }

    /// <summary>
    /// GPU-visible key/value pair. Its layout must be exactly two uints (see
    /// <see cref="KVPLengthUint"/>); sorting and equality consider the key only.
    /// </summary>
    [Cudafy]
    public struct KVP
    {
        public uint key, value;

        [CudafyIgnore]
        public override string ToString()
        {
            return String.Format("{0},{1}", key, value);
        }

        /// <summary>Equality compares keys only, matching the sort order.</summary>
        [CudafyIgnore]
        public override bool Equals(object obj)
        {
            return ((KVP)obj).key == key;
        }

        [CudafyIgnore]
        public override int GetHashCode()
        {
            return (int)key;
        }
    }

    /// <summary>
    /// Device helper: swaps <paramref name="A"/> and <paramref name="B"/> when they
    /// are out of order for the requested direction
    /// (<paramref name="dir"/> != 0 =&gt; ascending, 0 =&gt; descending).
    /// </summary>
    [Cudafy(eCudafyType.Device)]
    public static void Comparator(
        ref KVP A,
        ref KVP B,
        uint dir)
    {
        KVP tmp;
        if (dir == 0 && (A.key < B.key) || dir != 0 && (A.key > B.key))
        {
            tmp = A;
            A = B;
            B = tmp;
        }
    }

    /// <summary>
    /// Kernel: each block loads its SHARED_SIZE_LIMIT-element subarray into shared
    /// memory, sorts it (even/odd blocks sort in opposite directions so adjacent
    /// block pairs form one larger bitonic sequence), then stores it back.
    /// Mirrors bitonicSortShared1 from the NVIDIA sortingNetworks sample.
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicSortShared1(GThread thread, KVP[] d_kvp)
    {
        // Shared memory storage for the current subarray.
        KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
        fixed (KVP* s_Ptr_kvp = s_kvp)
        fixed (KVP* d_Ptr_kvp = d_kvp)
        {
            int tid = thread.threadIdx.x;
            // Offset to the beginning of this block's subarray; load it uint-by-uint
            // in coalesced batches (KVPLengthUint * 2 batches cover 2 KVPs per thread).
            uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

            // Build bitonic sequences of growing size within shared memory; the
            // per-thread direction flips halfway through each sequence.
            for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1)
            {
                uint ddd = ((tid & (size / 2))) != 0 ? 1U : 0;
                for (uint stride = size / 2; stride > 0; stride >>= 1)
                {
                    thread.SyncThreads();
                    uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                    Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
                }
            }

            // Final pass: direction alternates by block index (blockIdx.x & 1) so
            // that consecutive blocks sort in opposite orders.
            {
                uint ddd = (uint)thread.blockIdx.x & 1;
                for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
                {
                    thread.SyncThreads();
                    uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                    Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
                }
            }
            thread.SyncThreads();

            // Store the sorted subarray back, in coalesced batches.
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
        }
    }

    /// <summary>
    /// Kernel: one merge step in global memory, used while the comparator stride is
    /// too large for a single block's shared memory. One comparator per thread.
    /// Assumes d_kvp.Length is a power of two (enforced by DoBitonicSort).
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicMergeGlobal(GThread thread, KVP[] d_kvp, uint size, uint stride, uint dir)
    {
        uint global_comparatorI = (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x;
        uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1);
        // Bitonic merge: direction flips for comparators in the upper half of each
        // size-sized sequence.
        uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0);
        uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
        KVP A = d_kvp[pos + 0];
        KVP B = d_kvp[pos + stride];
        Comparator(ref A, ref B, ddd);
        d_kvp[pos + 0] = A;
        d_kvp[pos + stride] = B;
    }

    /// <summary>
    /// Kernel: finishes a merge once the stride fits in shared memory — loads the
    /// block's subarray, runs all remaining strides (SHARED_SIZE_LIMIT/2 down to 1)
    /// with in-block synchronization, and stores the result back.
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicMergeShared(GThread thread, KVP[] d_kvp, uint size, uint dir)
    {
        // Shared memory storage for the current subarray.
        KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
        fixed (KVP* s_Ptr_kvp = s_kvp)
        fixed (KVP* d_Ptr_kvp = d_kvp)
        {
            int tid = thread.threadIdx.x;
            // Offset to the beginning of this block's subarray; load it uint-by-uint
            // in coalesced batches.
            uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

            // Bitonic merge: derive this block's direction from its position within
            // the size-sized sequence being merged.
            uint comparatorI =
                ((uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)tid) & (((uint)d_kvp.Length / 2U) - 1U);
            uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0);
            for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
            {
                thread.SyncThreads();
                uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
            }
            thread.SyncThreads();

            // Store the merged subarray back, in coalesced batches.
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
        }
    }
}
Nice article and nice license note. :)
ReplyDelete
This comment has been removed by a blog administrator.
ReplyDelete