Hermes
Loading...
Searching...
No Matches
cuda_utils.h
Go to the documentation of this file.
1
31
32#ifndef HERMES_COMMON_CUDA_UTILS_H
33#define HERMES_COMMON_CUDA_UTILS_H
34
35#include <hermes/common/size.h>
36#include <hermes/common/index.h>
37#include <hermes/common/debug.h>
38#include <iostream>
39
40#ifdef HERMES_DEVICE_ENABLED
41
42namespace hermes::cuda_utils {
43
45#define GPU_BLOCK_SIZE 1024
47#define GPU_BLOCK_SIZE_X 1024
49#define GPU_BLOCK_SIZE_Y 1024
51#define GPU_BLOCK_SIZE_Z 64
53#define GPU_WARP_SIZE 32
54
66struct LaunchInfo {
67 // *******************************************************************************************************************
68 // STATIC METHODS
69 // *******************************************************************************************************************
75 static void distribute(u32 max_b, u32 n, u32 &b, u32 &g) {
76 if (n <= max_b) {
77 b = n;
78 g = 1;
79 } else {
80 // round n to a multiple of warp size
81 auto m = (n % GPU_WARP_SIZE) ? ((n + GPU_WARP_SIZE) / GPU_WARP_SIZE) * GPU_WARP_SIZE : n;
82 auto b_candidate = max_b;
83 auto min_candidate = b_candidate;
84 auto min_r = m % b_candidate;
85 while (b_candidate > 128) {
86 auto r = m % b_candidate;
87 if (r < min_r) {
88 min_candidate = b_candidate;
89 min_r = r;
90 }
91 b_candidate >>= 1;
92 }
93 b = min_candidate;
94 g = (m % b) ? (m + b) / b : m / b;
95 }
96 }
102 static void redistribute(dim3 b, dim3 g, dim3 &new_b, dim3 &new_g) {
103 dim3 m(b.x * g.x, b.y * g.y, b.z * g.z);
104 dim3 b_candidate = b;
105 while (b_candidate.x * b_candidate.y * b_candidate.z > GPU_BLOCK_SIZE) {
106 // split max dimension
107 if (b_candidate.x > b_candidate.y && b_candidate.x > b_candidate.z)
108 b_candidate.x >>= 1;
109 else if (b_candidate.y >= b_candidate.x && b_candidate.y >= b_candidate.z)
110 b_candidate.y >>= 1;
111 else
112 b_candidate.z >>= 1;
113 }
114 new_b = b_candidate;
115 new_g = dim3((m.x % new_b.x) ? (m.x + new_b.x) / new_b.x : m.x / new_b.x,
116 (m.y % new_b.y) ? (m.y + new_b.y) / new_b.y : m.y / new_b.y,
117 (m.z % new_b.z) ? (m.z + new_b.z) / new_b.z : m.z / new_b.z);
118 }
119 // *******************************************************************************************************************
120 // CONSTRUCTORS
121 // *******************************************************************************************************************
126 LaunchInfo(u32 n, size_t shared_memory_size_in_bytes = 0, cudaStream_t stream = {}) :
127 shared_memory_size{shared_memory_size_in_bytes},
128 stream_id{stream} {
131 }
137 LaunchInfo(size2 b, size2 s = {0, 0},
138 size_t shared_memory_size_in_bytes = 0,
139 cudaStream_t stream = {}) :
140 shared_memory_size{shared_memory_size_in_bytes},
141 stream_id{stream} {
142 block_size = dim3(b.width, b.height, 1);
143 grid_size = dim3(s.width, s.height, 1);
144 if (s.total() == 0) {
148 }
150 }
156 LaunchInfo(size3 b, size3 s = {0, 0, 0},
157 size_t shared_memory_size_in_bytes = 0,
158 cudaStream_t stream = {}) :
159 shared_memory_size{shared_memory_size_in_bytes},
160 stream_id{stream} {
161 block_size = dim3(b.width, b.height, 1);
162 grid_size = dim3(s.width, s.height, 1);
163 if (s.total() == 0) {
168 }
170 }
171 // *******************************************************************************************************************
172 // METHODS
173 // *******************************************************************************************************************
176 [[nodiscard]] u32 threadCount() const {
177 return grid_size.x * grid_size.y * grid_size.z * block_size.x * block_size.y * block_size.z;
178 }
181 [[nodiscard]] u32 blockThreadCount() const {
182 return block_size.x * block_size.y * block_size.z;
183 }
184 // *******************************************************************************************************************
185 // PUBLIC FIELDS
186 // *******************************************************************************************************************
190 cudaStream_t stream_id{};
191};
192
193// *********************************************************************************************************************
194// SYNCHRONIZATION
195// *********************************************************************************************************************
197class Lock {
198public:
200 Lock();
201 ~Lock();
206private:
207 int *mutex{nullptr};
208};
209
210// *********************************************************************************************************************
211// MEMORY
212// *********************************************************************************************************************
217inline cudaMemcpyKind copyDirection(MemoryLocation src, MemoryLocation dst) {
218 if (src == MemoryLocation::DEVICE && dst == MemoryLocation::DEVICE)
219 return cudaMemcpyDeviceToDevice;
220 if (src == MemoryLocation::DEVICE && dst == MemoryLocation::HOST)
221 return cudaMemcpyDeviceToHost;
222 if (src == MemoryLocation::HOST && dst == MemoryLocation::HOST)
223 return cudaMemcpyHostToHost;
224 return cudaMemcpyHostToDevice;
225}
226
227// *********************************************************************************************************************
228// IO
229// *********************************************************************************************************************
234inline std::ostream &operator<<(std::ostream &o, const LaunchInfo &info) {
235 o << "[block size (" << info.block_size.x << " " << info.block_size.y << " " << info.block_size.z << ") ";
236 o << "grid size (" << info.grid_size.x << " " << info.grid_size.y << " " << info.grid_size.z << ")]";
237 return o;
238}
239
240} // namespace hermes::cuda_utils
241
245#define HERMES_CUDA_TIME(LAUNCH, ELAPSED_TIME_IN_MS) \
246{ cudaEvent_t cuda_event_start_t, cuda_event_stop_t; \
247 cudaEventCreate(&cuda_event_start_t); \
248 cudaEventCreate(&cuda_event_stop_t); \
249 cudaEventRecord(cuda_event_start_t, 0); \
250 LAUNCH \
251 cudaEventRecord(cuda_event_stop_t, 0); \
252 cudaEventSynchronize(cuda_event_stop_t); \
253 cudaEventElapsedTime(&ELAPSED_TIME_IN_MS, cuda_event_start_t, cuda_event_stop_t); }
254
256#define HERMES_CUDA_DEVICE_SYNCHRONIZE HERMES_CHECK_CUDA_CALL(cudaDeviceSynchronize());
257
262#define HERMES_CUDA_LAUNCH(LAUNCH_INFO, NAME, ...) \
263{ \
264 auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
265 NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
266 HERMES_CHECK_LAST_CUDA \
267}
268
273#define HERMES_CUDA_LAUNCH_AND_SYNC(LAUNCH_INFO, NAME, ...) \
274{ \
275 auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
276 NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
277 HERMES_CHECK_LAST_CUDA_CALL \
278 HERMES_CUDA_DEVICE_SYNCHRONIZE \
279}
280
282#define HERMES_CUDA_THREAD_INDEX_I \
283 u32 i = threadIdx.x + blockIdx.x * blockDim.x
284
286#define HERMES_CUDA_THREAD_INDEX_IJ \
287 hermes::index2 ij(threadIdx.x + blockIdx.x * blockDim.x, \
288 threadIdx.y + blockIdx.y * blockDim.y)
289
291#define HERMES_CUDA_THREAD_INDEX_IJK \
292 hermes::index3 ijk(threadIdx.x + blockIdx.x * blockDim.x, \
293 threadIdx.y + blockIdx.y * blockDim.y, \
294 threadIdx.z + blockIdx.z * blockDim.z);
295
297#define HERMES_CUDA_RETURN_IF_NOT_THREAD_0 \
298{ HERMES_CUDA_THREAD_INDEX_IJK \
299 if(ijk != hermes::index3(0,0,0)) \
300 return; \
301}
302
306#define HERMES_CUDA_THREAD_INDEX_LT(I, BOUNDS) \
307 u32 I = threadIdx.x + blockIdx.x * blockDim.x; \
308 if(I >= (BOUNDS)) return
309
313#define HERMES_CUDA_THREAD_INDEX2_LT(IJ, BOUNDS) \
314 hermes::index2 IJ(threadIdx.x + blockIdx.x * blockDim.x, \
315 threadIdx.y + blockIdx.y * blockDim.y); \
316 if(IJ >= (BOUNDS)) return
317
321#define HERMES_CUDA_THREAD_INDEX3_LT(IJK, BOUNDS) \
322 hermes::index3 IJK(threadIdx.x + blockIdx.x * blockDim.x, \
323 threadIdx.y + blockIdx.y * blockDim.y, \
324 threadIdx.z + blockIdx.z * blockDim.z); \
325 if(IJK >= (BOUNDS)) return
326
329#define HERMES_CUDA_THREAD_INDEX_I_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX_LT(i, BOUNDS)
332#define HERMES_CUDA_THREAD_INDEX_IJ_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX2_LT(ij, BOUNDS)
335#define HERMES_CUDA_THREAD_INDEX_IJK_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX3_LT(ijk, BOUNDS)
336
337// *********************************************************************************************************************
338// ERROR
339// *********************************************************************************************************************
341#define HERMES_CHECK_CUDA_CALL(err) \
342 { \
343 auto hermes_cuda_result = (err); \
344 if(hermes_cuda_result != cudaSuccess) { \
345 HERMES_LOG_CRITICAL(cudaGetErrorString(hermes_cuda_result)); \
346 cudaDeviceReset(); \
347 exit(99); \
348 } \
349 }
351#define HERMES_CHECK_LAST_CUDA_CALL HERMES_CHECK_CUDA_CALL(cudaGetLastError());
352
355 int nDevices;
356
357 cudaGetDeviceCount(&nDevices);
358 for (int i = 0; i < nDevices; i++) {
359 cudaDeviceProp prop{};
360 cudaGetDeviceProperties(&prop, i);
361 printf("Device Number: %d\n", i);
362 printf(" Device name: %s\n", prop.name);
363 printf(" Major compute capability: %d\n", prop.major);
364 printf(" Minor compute capability: %d\n", prop.minor);
365 printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
366 printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
367 printf(" Peak Memory Bandwidth (GB/s): %f\n",
368 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
369 printf(" Device can map host memory with "
370 "cudaHostAlloc/cudaHostGetDevicePointer: %d\n",
371 prop.canMapHostMemory);
372 printf(" Clock frequency in kilohertz: %d\n", prop.clockRate);
373 printf(" Compute mode (See cudaComputeMode): %d\n", prop.computeMode);
374 printf(" Device can concurrently copy memory and execute a kernel: %d\n",
375 prop.deviceOverlap);
376 printf(" Device is integrated as opposed to discrete: %d\n",
377 prop.integrated);
378 printf(" Specified whether there is a run time limit on kernels: %d\n",
379 prop.kernelExecTimeoutEnabled);
380 printf(" Maximum size of each dimension of a grid: %d %d %d\n",
381 prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
382 printf(" Maximum size of each dimension of a block: %d %d %d\n",
383 prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
384 printf(" Maximum number of threads per block: %d\n",
385 prop.maxThreadsPerBlock);
386 printf(" Maximum pitch in bytes allowed by memory copies: %zu\n",
387 prop.memPitch);
388 printf(" Number of multiprocessors on device: %d\n",
389 prop.multiProcessorCount);
390 printf(" 32-bit registers available per block: %d\n", prop.regsPerBlock);
391 printf(" Shared memory available per block in bytes: %zu\n",
392 prop.sharedMemPerBlock);
393 printf(" Alignment requirement for textures: %zu\n", prop.textureAlignment);
394 printf(" Constant memory available on device in bytes: %zu\n",
395 prop.totalConstMem);
396 printf(" Global memory available on device in bytes: %zu\n",
397 prop.totalGlobalMem);
398 printf(" Warp size in threads: %d\n", prop.warpSize);
399 }
400}
401
404 size_t free_byte;
405 size_t total_byte;
406 HERMES_CHECK_CUDA_CALL(cudaMemGetInfo(&free_byte, &total_byte));
407 auto free_db = (double) free_byte;
408 auto total_db = (double) total_byte;
409 double used_db = total_db - free_db;
410 printf("GPU memory usage: used = %f, free = %f MB, total = %f MB\n",
411 used_db / 1024.0 / 1024.0, free_db / 1024.0 / 1024.0,
412 total_db / 1024.0 / 1024.0);
413}
414
416#define CUDA_MEMORY_USAGE \
417 { \
418 std::cerr << "[INFO][" << __FILE__ << "][" << __LINE__ << "]"; \
419 hermes_print_cuda_memory_usage(); \
420 }
421#else
422
423#define HERMES_CHECK_CUDA_CALL(err)
424#define CUDA_MEMORY_USAGE
425
426#endif
427
428#endif // HERMES_COMMON_CUDA_UTILS_H
429
Holds 2-dimensional size.
Definition size.h:47
T height
1-th dimension size
Definition size.h:133
T width
0-th dimension size
Definition size.h:132
Holds 2-dimensional size.
Definition size.h:142
T width
0-th dimension size
Definition size.h:224
T height
1-th dimension size
Definition size.h:225
T depth
2-th dimension size
Definition size.h:226
Synchronization lock.
Definition cuda_utils.h:197
HERMES_DEVICE_FUNCTION void unlock()
unlocks mutex
Definition cuda_utils.cpp:49
Lock()
Default constructor.
Definition cuda_utils.cpp:35
HERMES_DEVICE_FUNCTION void lock()
locks mutex
Definition cuda_utils.cpp:45
#define GPU_BLOCK_SIZE_X
Maximum thread block size in 1st dimension.
Definition cuda_utils.h:47
#define GPU_WARP_SIZE
Warp size.
Definition cuda_utils.h:53
#define GPU_BLOCK_SIZE_Y
Maximum thread block size in 2nd dimension.
Definition cuda_utils.h:49
std::ostream & operator<<(std::ostream &o, const LaunchInfo &info)
LaunchInfo support for std::ostream << operator.
Definition cuda_utils.h:234
#define GPU_BLOCK_SIZE
Maximum thread block size.
Definition cuda_utils.h:45
#define GPU_BLOCK_SIZE_Z
Maximum thread block size in 3rd dimension.
Definition cuda_utils.h:51
cudaMemcpyKind copyDirection(MemoryLocation src, MemoryLocation dst)
Computes cuda flag of memory block copy direction.
Definition cuda_utils.h:217
Debug, logging and assertion macros.
MemoryLocation
Specifies where memory is stored.
Definition defs.h:204
void hermes_print_cuda_memory_usage()
Outputs in stdout current GPU memory usage.
Definition cuda_utils.h:403
#define HERMES_DEVICE_FUNCTION
Specifies that the function can only be called from device side.
Definition defs.h:47
uint32_t u32
32 bit size unsigned integer type
Definition defs.h:88
#define HERMES_CHECK_CUDA_CALL(err)
Checks (and logs) a CUDA method return code for errors.
Definition cuda_utils.h:341
void hermes_print_cuda_devices()
Outputs in stdout information about all devices in the current machine.
Definition cuda_utils.h:354
#define HERMES_CHECK_EXP(expr)
Warns if expression is false.
Definition debug.h:95
Set of multi-dimensional integer iterators.
@ info
logs into info stream
Set of multi-dimensional size representations.
Holds CUDA launch parameters.
Definition cuda_utils.h:66
u32 blockThreadCount() const
Computes the total number of threads per block.
Definition cuda_utils.h:181
static void redistribute(dim3 b, dim3 g, dim3 &new_b, dim3 &new_g)
Redistributes threads to fit the gpu block size limits.
Definition cuda_utils.h:102
LaunchInfo(size3 b, size3 s={0, 0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
3-dimensional launch constructor
Definition cuda_utils.h:156
dim3 block_size
cuda block size (in number of threads)
Definition cuda_utils.h:188
LaunchInfo(size2 b, size2 s={0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
2-dimensional launch constructor
Definition cuda_utils.h:137
static void distribute(u32 max_b, u32 n, u32 &b, u32 &g)
Recomputes block and grid sizes to achieve good occupancy.
Definition cuda_utils.h:75
size_t shared_memory_size
size of shared memory in bytes
Definition cuda_utils.h:189
u32 threadCount() const
Computes the total number of threads.
Definition cuda_utils.h:176
dim3 grid_size
cuda grid size (in number of blocks)
Definition cuda_utils.h:187
cudaStream_t stream_id
launch stream identifier
Definition cuda_utils.h:190
LaunchInfo(u32 n, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
1-dimensional launch constructor
Definition cuda_utils.h:126