This article is part of a series on performance guidelines.
/* * This software is based upon the book CUDA By Example by Sanders and Kandrot. * This software contains source code provided by NVIDIA Corporation. */ /* Original Copyright notice: * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ public static class oddEven { private static bool IsPow2(uint i) { return i > 0 && i == (i & ~(i - 1)); } public static void DoOddEvenSort(GPGPU gpu, KVP[] d_kvp, bool SortAsc) { uint dir = (SortAsc ? 1U : 0); int arrayLength = gpu.GetDeviceMemory(d_kvp).XSize; if (!IsPow2((uint)arrayLength)) throw new ApplicationException("Invalid array length. Must be a power of 2."); if (!IsPow2(SHARED_SIZE_LIMIT)) throw new ApplicationException("Invalid SHARED_SIZE_LIMIT length. Must be a power of 2."); if (arrayLength <= SHARED_SIZE_LIMIT) throw new ApplicationException("Invalid array length. Must greater than SHARED_SIZE_LIMIT"); int CorrectKVPLengthUint = Marshal.SizeOf(typeof(KVP)) / Marshal.SizeOf(typeof(uint)); if (CorrectKVPLengthUint != KVPLengthUint) throw new ApplicationException( string.Format("KVPLengthUint ({0}) must equal the size of KVP in uint's ({1})", KVPLengthUint, CorrectKVPLengthUint)); int blockCount = arrayLength / (int)SHARED_SIZE_LIMIT; int mergeGlobalBlockCount = arrayLength / threadCount; int mergeGlobalThreadCount = threadCount / 2; gpu.Launch(blockCount, threadCount, oddEvenMergeSortShared, d_kvp, dir); for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) for (uint stride = size / 2; stride > 0; stride >>= 1) gpu.Launch(mergeGlobalBlockCount, mergeGlobalThreadCount, oddEvenMergeGlobal, d_kvp, size, stride, dir); } public const uint SHARED_SIZE_LIMIT = 2U * threadCount; public const int threadCount = 512; public const uint KVPLengthUint = 2U; [Cudafy] public struct KVP { //public uint key; public uint key, value; [CudafyIgnore] public override string ToString() { //return String.Format("{0}", key); return String.Format("{0},{1}", key, value); } [CudafyIgnore] public override bool Equals(object obj) { return ((KVP)obj).key == key; } [CudafyIgnore] public override int GetHashCode() { return (int)key; } } [Cudafy(eCudafyType.Device)] public static void Comparator( ref KVP A, ref KVP B, uint dir) { KVP tmp; if (dir == 0 && (A.key < B.key) || dir != 0 && (A.key > B.key)) { tmp = A; A = B; B = tmp; } } [Cudafy] private unsafe static void oddEvenMergeSortShared(GThread thread, KVP[] d_kvp, uint dir) { //Shared memory storage for current subarray KVP[] s_kvp = thread.AllocateShared<KVP>("s_kvp", (int)SHARED_SIZE_LIMIT); fixed (KVP* s_Ptr_kvp = s_kvp) fixed (KVP* d_Ptr_kvp = d_kvp) { int tid = thread.threadIdx.x; //Offset to the beginning of subarray and load data, in coalesced batches uint* d_Ptr_kvp_offset = (uint*)(d_Ptr_kvp) + ((uint)thread.blockIdx.x * SHARED_SIZE_LIMIT) * KVPLengthUint; for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount) ((uint*)(s_Ptr_kvp))[tid + offset] = d_Ptr_kvp_offset[tid + offset]; for (uint size = 2; size <= SHARED_SIZE_LIMIT; size <<= 1) { uint stride = size / 2; uint offset = (uint)tid & (stride - 1); { thread.SyncThreads(); uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1)); Comparator(ref (s_kvp[pos + 0]), ref (s_kvp[pos + stride]), dir); stride >>= 1; } for (; stride > 0; stride >>= 1) { thread.SyncThreads(); uint pos = 2 * (uint)tid - ((uint)tid & (stride - 1)); if (offset >= stride) Comparator(ref (s_kvp[pos - stride]), ref (s_kvp[pos + 0]), dir); } } thread.SyncThreads(); //store data back, in coalesced batches for (int bbIx = 0, offset = 0; bbIx < KVPLengthUint * 2; bbIx++, offset += threadCount) d_Ptr_kvp_offset[tid + offset] = ((uint*)(s_Ptr_kvp))[tid + offset]; } } [Cudafy] private unsafe static void oddEvenMergeGlobal(GThread thread, KVP[] d_kvp, uint size, uint stride, uint dir) { uint global_comparatorI = (uint)thread.blockIdx.x * (uint)thread.blockDim.x + (uint)thread.threadIdx.x; uint comparatorI = global_comparatorI & ((uint)d_kvp.Length / 2 - 1); //OddEven merge uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); if (stride < size / 2) { uint offset = global_comparatorI & ((size / 2) - 1); if (offset >= stride) { KVP A = d_kvp[pos - stride]; KVP B = d_kvp[pos + 0]; Comparator(ref A, ref B, dir); d_kvp[pos - stride] = A; d_kvp[pos + 0] = B; } } else { KVP A = d_kvp[pos + 0]; KVP B = d_kvp[pos + stride]; Comparator(ref A, ref B, dir); d_kvp[pos + 0] = A; d_kvp[pos + stride] = B; } } }