32 #ifndef HERMES_COMMON_CUDA_UTILS_H
33 #define HERMES_COMMON_CUDA_UTILS_H
40 #ifdef HERMES_DEVICE_ENABLED
42 namespace hermes::cuda_utils {
45 #define GPU_BLOCK_SIZE 1024
47 #define GPU_BLOCK_SIZE_X 1024
49 #define GPU_BLOCK_SIZE_Y 1024
51 #define GPU_BLOCK_SIZE_Z 64
53 #define GPU_WARP_SIZE 32
82 auto b_candidate = max_b;
83 auto min_candidate = b_candidate;
84 auto min_r = m % b_candidate;
85 while (b_candidate > 128) {
86 auto r = m % b_candidate;
88 min_candidate = b_candidate;
94 g = (m % b) ? (m + b) / b : m / b;
103 dim3 m(b.x * g.x, b.y * g.y, b.z * g.z);
104 dim3 b_candidate = b;
105 while (b_candidate.x * b_candidate.y * b_candidate.z >
GPU_BLOCK_SIZE) {
107 if (b_candidate.x > b_candidate.y && b_candidate.x > b_candidate.z)
109 else if (b_candidate.y >= b_candidate.x && b_candidate.y >= b_candidate.z)
115 new_g = dim3((m.x % new_b.x) ? (m.x + new_b.x) / new_b.x : m.x / new_b.x,
116 (m.y % new_b.y) ? (m.y + new_b.y) / new_b.y : m.y / new_b.y,
117 (m.z % new_b.z) ? (m.z + new_b.z) / new_b.z : m.z / new_b.z);
126 LaunchInfo(
u32 n,
size_t shared_memory_size_in_bytes = 0, cudaStream_t stream = {}) :
138 size_t shared_memory_size_in_bytes = 0,
139 cudaStream_t stream = {}) :
144 if (s.total() == 0) {
157 size_t shared_memory_size_in_bytes = 0,
158 cudaStream_t stream = {}) :
163 if (s.total() == 0) {
218 if (src == MemoryLocation::DEVICE && dst == MemoryLocation::DEVICE)
219 return cudaMemcpyDeviceToDevice;
220 if (src == MemoryLocation::DEVICE && dst == MemoryLocation::HOST)
221 return cudaMemcpyDeviceToHost;
222 if (src == MemoryLocation::HOST && dst == MemoryLocation::HOST)
223 return cudaMemcpyHostToHost;
224 return cudaMemcpyHostToDevice;
235 o <<
"[block size (" <<
info.block_size.x <<
" " <<
info.block_size.y <<
" " <<
info.block_size.z <<
") ";
236 o <<
"grid size (" <<
info.grid_size.x <<
" " <<
info.grid_size.y <<
" " <<
info.grid_size.z <<
")]";
245 #define HERMES_CUDA_TIME(LAUNCH, ELAPSED_TIME_IN_MS) \
246 { cudaEvent_t cuda_event_start_t, cuda_event_stop_t; \
247 cudaEventCreate(&cuda_event_start_t); \
248 cudaEventCreate(&cuda_event_stop_t); \
249 cudaEventRecord(cuda_event_start_t, 0); \
251 cudaEventRecord(cuda_event_stop_t, 0); \
252 cudaEventSynchronize(cuda_event_stop_t); \
253 cudaEventElapsedTime(&ELAPSED_TIME_IN_MS, cuda_event_start_t, cuda_event_stop_t); }
256 #define HERMES_CUDA_DEVICE_SYNCHRONIZE HERMES_CHECK_CUDA_CALL(cudaDeviceSynchronize());
262 #define HERMES_CUDA_LAUNCH(LAUNCH_INFO, NAME, ...) \
264 auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
265 NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
266 HERMES_CHECK_LAST_CUDA \
273 #define HERMES_CUDA_LAUNCH_AND_SYNC(LAUNCH_INFO, NAME, ...) \
275 auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
276 NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
277 HERMES_CHECK_LAST_CUDA_CALL \
278 HERMES_CUDA_DEVICE_SYNCHRONIZE \
282 #define HERMES_CUDA_THREAD_INDEX_I \
283 u32 i = threadIdx.x + blockIdx.x * blockDim.x
286 #define HERMES_CUDA_THREAD_INDEX_IJ \
287 hermes::index2 ij(threadIdx.x + blockIdx.x * blockDim.x, \
288 threadIdx.y + blockIdx.y * blockDim.y)
291 #define HERMES_CUDA_THREAD_INDEX_IJK \
292 hermes::index3 ijk(threadIdx.x + blockIdx.x * blockDim.x, \
293 threadIdx.y + blockIdx.y * blockDim.y, \
294 threadIdx.z + blockIdx.z * blockDim.z);
297 #define HERMES_CUDA_RETURN_IF_NOT_THREAD_0 \
298 { HERMES_CUDA_THREAD_INDEX_IJK \
299 if(ijk != hermes::index3(0,0,0)) \
306 #define HERMES_CUDA_THREAD_INDEX_LT(I, BOUNDS) \
307 u32 I = threadIdx.x + blockIdx.x * blockDim.x; \
308 if(I >= (BOUNDS)) return
313 #define HERMES_CUDA_THREAD_INDEX2_LT(IJ, BOUNDS) \
314 hermes::index2 IJ(threadIdx.x + blockIdx.x * blockDim.x, \
315 threadIdx.y + blockIdx.y * blockDim.y); \
316 if(IJ >= (BOUNDS)) return
321 #define HERMES_CUDA_THREAD_INDEX3_LT(IJK, BOUNDS) \
322 hermes::index3 IJK(threadIdx.x + blockIdx.x * blockDim.x, \
323 threadIdx.y + blockIdx.y * blockDim.y, \
324 threadIdx.z + blockIdx.z * blockDim.z); \
325 if(IJK >= (BOUNDS)) return
329 #define HERMES_CUDA_THREAD_INDEX_I_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX_LT(i, BOUNDS)
332 #define HERMES_CUDA_THREAD_INDEX_IJ_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX2_LT(ij, BOUNDS)
335 #define HERMES_CUDA_THREAD_INDEX_IJK_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX3_LT(ijk, BOUNDS)
341 #define HERMES_CHECK_CUDA_CALL(err) \
343 auto hermes_cuda_result = (err); \
344 if(hermes_cuda_result != cudaSuccess) { \
345 HERMES_LOG_CRITICAL(cudaGetErrorString(hermes_cuda_result)); \
351 #define HERMES_CHECK_LAST_CUDA_CALL HERMES_CHECK_CUDA_CALL(cudaGetLastError());
357 cudaGetDeviceCount(&nDevices);
358 for (
int i = 0; i < nDevices; i++) {
359 cudaDeviceProp prop{};
360 cudaGetDeviceProperties(&prop, i);
361 printf(
"Device Number: %d\n", i);
362 printf(
" Device name: %s\n", prop.name);
363 printf(
" Major compute capability: %d\n", prop.major);
364 printf(
" Minor compute capability: %d\n", prop.minor);
365 printf(
" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
366 printf(
" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
367 printf(
" Peak Memory Bandwidth (GB/s): %f\n",
368 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
369 printf(
" Device can map host memory with "
370 "cudaHostAlloc/cudaHostGetDevicePointer: %d\n",
371 prop.canMapHostMemory);
372 printf(
" Clock frequency in kilohertz: %d\n", prop.clockRate);
373 printf(
" Compute mode (See cudaComputeMode): %d\n", prop.computeMode);
374 printf(
" Device can concurrently copy memory and execute a kernel: %d\n",
376 printf(
" Device is integrated as opposed to discrete: %d\n",
378 printf(
" Specified whether there is a run time limit on kernels: %d\n",
379 prop.kernelExecTimeoutEnabled);
380 printf(
" Maximum size of each dimension of a grid: %d %d %d\n",
381 prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
382 printf(
" Maximum size of each dimension of a block: %d %d %d\n",
383 prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
384 printf(
" Maximum number of threads per block: %d\n",
385 prop.maxThreadsPerBlock);
386 printf(
" Maximum pitch in bytes allowed by memory copies: %zu\n",
388 printf(
" Number of multiprocessors on device: %d\n",
389 prop.multiProcessorCount);
390 printf(
" 32-bit registers available per block: %d\n", prop.regsPerBlock);
391 printf(
" Shared memory available per block in bytes: %zu\n",
392 prop.sharedMemPerBlock);
393 printf(
" Alignment requirement for textures: %zu\n", prop.textureAlignment);
394 printf(
" Constant memory available on device in bytes: %zu\n",
396 printf(
" Global memory available on device in bytes: %zu\n",
397 prop.totalGlobalMem);
398 printf(
" Warp size in threads: %d\n", prop.warpSize);
407 auto free_db = (double) free_byte;
408 auto total_db = (double) total_byte;
409 double used_db = total_db - free_db;
410 printf(
"GPU memory usage: used = %f, free = %f MB, total = %f MB\n",
411 used_db / 1024.0 / 1024.0, free_db / 1024.0 / 1024.0,
412 total_db / 1024.0 / 1024.0);
416 #define CUDA_MEMORY_USAGE \
418 std::cerr << "[INFO][" << __FILE__ << "][" << __LINE__ << "]"; \
419 hermes_print_cuda_memory_usage(); \
423 #define HERMES_CHECK_CUDA_CALL(err)
424 #define CUDA_MEMORY_USAGE
Holds 2-dimensional size.
Definition: size.h:47
T height
1-th dimension size
Definition: size.h:133
T width
0-th dimension size
Definition: size.h:132
Holds 2-dimensional size.
Definition: size.h:142
T width
0-th dimension size
Definition: size.h:224
T height
1-th dimension size
Definition: size.h:225
T depth
2-th dimension size
Definition: size.h:226
Synchronization lock.
Definition: cuda_utils.h:197
HERMES_DEVICE_FUNCTION void unlock()
unlocks mutex
Definition: cuda_utils.cpp:49
Lock()
Default constructor.
Definition: cuda_utils.cpp:35
HERMES_DEVICE_FUNCTION void lock()
locks mutex
Definition: cuda_utils.cpp:45
std::ostream & operator<<(std::ostream &o, const LaunchInfo &info)
LaunchInfo support for std::ostream << operator.
Definition: cuda_utils.h:234
#define GPU_BLOCK_SIZE_X
Maximum thread block size in 1st dimension.
Definition: cuda_utils.h:47
#define GPU_WARP_SIZE
Warp size.
Definition: cuda_utils.h:53
#define GPU_BLOCK_SIZE_Y
Maximum thread block size in 2nd dimension.
Definition: cuda_utils.h:49
#define GPU_BLOCK_SIZE
Maximum thread block size.
Definition: cuda_utils.h:45
#define GPU_BLOCK_SIZE_Z
Maximum thread block size in 3rd dimension.
Definition: cuda_utils.h:51
cudaMemcpyKind copyDirection(MemoryLocation src, MemoryLocation dst)
Computes cuda flag of memory block copy direction.
Definition: cuda_utils.h:217
Debug, logging and assertion macros.
MemoryLocation
Specifies where memory is stored.
Definition: defs.h:204
void hermes_print_cuda_memory_usage()
Outputs in stdout current GPU memory usage.
Definition: cuda_utils.h:403
#define HERMES_DEVICE_FUNCTION
Specifies that the function can only be called from device side.
Definition: defs.h:47
uint32_t u32
32 bit size unsigned integer type
Definition: defs.h:88
#define HERMES_CHECK_CUDA_CALL(err)
Checks (and logs) a CUDA method return code for errors.
Definition: cuda_utils.h:341
void hermes_print_cuda_devices()
Outputs in stdout information about all devices in the current machine.
Definition: cuda_utils.h:354
#define HERMES_CHECK_EXP(expr)
Warns if expression is false.
Definition: debug.h:95
Set of multi-dimensional integer iterators.
@ info
logs into info stream
Set of multi-dimensional size representations.
Holds CUDA launch parameters.
Definition: cuda_utils.h:66
u32 blockThreadCount() const
Computes the total number of threads per block.
Definition: cuda_utils.h:181
static void redistribute(dim3 b, dim3 g, dim3 &new_b, dim3 &new_g)
Redistributes threads to fit the gpu block size limits.
Definition: cuda_utils.h:102
LaunchInfo(size3 b, size3 s={0, 0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
3-dimensional launch constructor
Definition: cuda_utils.h:156
dim3 block_size
cuda block size (in number of threads)
Definition: cuda_utils.h:188
LaunchInfo(size2 b, size2 s={0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
2-dimensional launch constructor
Definition: cuda_utils.h:137
static void distribute(u32 max_b, u32 n, u32 &b, u32 &g)
Recomputes block and grid sizes to achieve good occupancy.
Definition: cuda_utils.h:75
size_t shared_memory_size
size of shared memory in bytes
Definition: cuda_utils.h:189
u32 threadCount() const
Computes the total number of threads.
Definition: cuda_utils.h:176
dim3 grid_size
cuda grid size (in number of blocks)
Definition: cuda_utils.h:187
cudaStream_t stream_id
launch stream identifier
Definition: cuda_utils.h:190
LaunchInfo(u32 n, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
1-dimensional launch constructor
Definition: cuda_utils.h:126