This article is part of a series on performance guidelines.
/*
 * This software is based upon the book CUDA By Example by Sanders and Kandrot.
 * This software contains source code provided by NVIDIA Corporation.
 */
/* Original Copyright notice:
 * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/// <summary>
/// GPU bitonic sort of key/value pairs via Cudafy.NET, adapted from the NVIDIA
/// CUDA SDK "sortingNetworks" sample. The input array length must be a power of
/// two and strictly greater than <see cref="SHARED_SIZE_LIMIT"/>.
/// </summary>
public static class bitonic
{
    /// <summary>True when <paramref name="i"/> is a power of two (exactly one bit set).</summary>
    private static bool IsPow2(uint i)
    {
        return i > 0 && i == (i & ~(i - 1));
    }

    /// <summary>
    /// Sorts <paramref name="d_kvp"/> (already resident in device memory) by key.
    /// </summary>
    /// <param name="gpu">Cudafy GPU handle the kernels were loaded onto.</param>
    /// <param name="d_kvp">Device array of key/value pairs; its length must be a
    /// power of two greater than <see cref="SHARED_SIZE_LIMIT"/>.</param>
    /// <param name="SortAsc">true = ascending by key; false = descending.</param>
    /// <exception cref="ApplicationException">Thrown when the array length or the
    /// compile-time constants violate the preconditions above.</exception>
    public static void DoBitonicSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc)
    {
        uint dir = (SortAsc ? 1U : 0U);
        int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize;

        if (!IsPow2((uint)arrayLength))
            throw new ApplicationException("Invalid array length. Must be a power of 2.");
        if (!IsPow2(SHARED_SIZE_LIMIT))
            throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2.");
        if (arrayLength <= SHARED_SIZE_LIMIT)
            throw new ApplicationException("Invalid array length. Must be greater than SHARED_SIZE_LIMIT.");

        // The kernels reinterpret KVP[] as a flat uint[]; verify the assumed
        // sizeof(KVP)/sizeof(uint) ratio actually matches the struct layout.
        int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint));
        if (CorrectKVPLengthUint != KVPLengthUint)
            throw new ApplicationException(string.Format(
                "KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})",
                KVPLengthUint, CorrectKVPLengthUint));

        int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT;
        int mergeGlobalBlockCount = 2 * blockCount;
        int mergeGlobalThreadCount = threadCount / 2;

        // Phase 1: sort each SHARED_SIZE_LIMIT-sized tile inside shared memory.
        gpu.Launch(blockCount, threadCount, bitonicSortShared1, d_kvp);

        // Phase 2: bitonic merge. Strides too large for a tile run in global
        // memory; once the stride fits in a tile, one shared-memory launch
        // finishes all remaining strides for this size (hence the break).
        for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
            for (uint stride = size / 2; stride > 0; stride >>= 1)
                if (stride >= SHARED_SIZE_LIMIT)
                {
                    gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount,
                        bitonicMergeGlobal, d_kvp, size, stride, dir);
                }
                else
                {
                    gpu.Launch(blockCount, threadCount, bitonicMergeShared, d_kvp, size, dir);
                    break;
                }
    }

    // Number of KVPs each block sorts in shared memory (two per thread).
    public const uint SHARED_SIZE_LIMIT = 2U * threadCount;
    // Threads per block for the shared-memory kernels.
    public const int threadCount = 512;
    // sizeof(KVP) measured in uints; checked against reality in DoBitonicSort.
    public const uint KVPLengthUint = 2U;

    /// <summary>
    /// Key/value pair sorted by key. NOTE: Equals/GetHashCode use the key only,
    /// so two pairs with equal keys but different values compare equal.
    /// </summary>
    [Cudafy]
    public struct KVP
    {
        public uint key, value;

        [CudafyIgnore]
        public override string ToString()
        {
            return String.Format("{0},{1}", key, value);
        }

        [CudafyIgnore]
        public override bool Equals(object obj)
        {
            return ((KVP)obj).key == key;
        }

        [CudafyIgnore]
        public override int GetHashCode()
        {
            return (int)key;
        }
    }

    /// <summary>
    /// Device function: swaps A and B when they are out of order for the given
    /// direction (dir != 0 = ascending, dir == 0 = descending).
    /// </summary>
    [Cudafy(eCudafyType.Device)]
    public static void Comparator(ref KVP A, ref KVP B, uint dir)
    {
        KVP tmp;
        if (dir == 0 && (A.key < B.key) || dir != 0 && (A.key > B.key))
        {
            tmp = A;
            A = B;
            B = tmp;
        }
    }

    /// <summary>
    /// Kernel: bitonic-sorts each SHARED_SIZE_LIMIT tile entirely in shared
    /// memory. Adjacent blocks sort in alternating directions (blockIdx.x & 1)
    /// so neighbouring tiles form bitonic sequences for the merge kernels.
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicSortShared1(GThread thread, KVP[] d_kvp)
    {
        // Shared memory storage for current subarray
        KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
        fixed (KVP* s_Ptr_kvp = s_kvp)
        fixed (KVP* d_Ptr_kvp = d_kvp)
        {
            int tid = thread.threadIdx.x;

            // Offset to the beginning of subarray and load data, in coalesced
            // batches of uints (KVPLengthUint * 2 uints per thread).
            uint* d_Ptr_kvp_offset =
                (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

            // Build progressively larger bitonic sequences within the tile.
            for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1)
            {
                // Direction alternates between halves of each subsequence.
                uint ddd = ((tid & (size / 2))) != 0 ? 1U : 0U;
                for (uint stride = size / 2; stride > 0; stride >>= 1)
                {
                    thread.SyncThreads();
                    uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                    Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
                }
            }

            {
                // Final merge of the whole tile; direction chosen per block so
                // tile pairs form larger bitonic sequences for phase 2.
                uint ddd = (uint)thread.blockIdx.x & 1;
                for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
                {
                    thread.SyncThreads();
                    uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                    Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
                }
            }

            thread.SyncThreads();
            // store data back, in coalesced batches
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
        }
    }

    /// <summary>
    /// Kernel: one bitonic merge step in global memory, for strides too large
    /// to fit a tile. Each thread performs exactly one compare-and-swap.
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicMergeGlobal(GThread thread, KVP[] d_kvp,
        uint size, uint stride, uint dir)
    {
        uint global_comparatorI =
            (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x;
        uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1);

        // Bitonic merge: per-comparator direction derived from position in the
        // current subsequence, XORed with the requested overall direction.
        uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0U);
        uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

        KVP A = d_kvp[pos + 0];
        KVP B = d_kvp[pos + stride];
        Comparator(ref A, ref B, ddd);
        d_kvp[pos + 0] = A;
        d_kvp[pos + stride] = B;
    }

    /// <summary>
    /// Kernel: finishes a merge of the given size once stride fits in a tile —
    /// runs every remaining stride (SHARED_SIZE_LIMIT/2 down to 1) in shared memory.
    /// </summary>
    [Cudafy]
    private unsafe static void bitonicMergeShared(GThread thread, KVP[] d_kvp,
        uint size, uint dir)
    {
        // Shared memory storage for current subarray
        KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT);
        fixed (KVP* s_Ptr_kvp = s_kvp)
        fixed (KVP* d_Ptr_kvp = d_kvp)
        {
            int tid = thread.threadIdx.x;

            // Offset to the beginning of subarray and load data, in coalesced batches
            uint* d_Ptr_kvp_offset =
                (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint;
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset];

            // Bitonic merge: direction for this tile's comparators.
            uint comparatorI = ((uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)tid)
                & (((uint)d_kvp.Length / 2U) - 1U);
            uint ddd = dir ^ ((comparatorI & (size / 2)) != 0 ? 1U : 0U);
            for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
            {
                thread.SyncThreads();
                uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1));
                Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), ddd);
            }

            thread.SyncThreads();
            // store data back, in coalesced batches
            for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount)
                d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset];
        }
    }
}
Nice article and nice licence note. :)
ReplyDelete
This comment has been removed by a blog administrator.
ReplyDelete