Hermes
cuda_utils.h
Go to the documentation of this file.
1 
32 #ifndef HERMES_COMMON_CUDA_UTILS_H
33 #define HERMES_COMMON_CUDA_UTILS_H
34 
35 #include <hermes/common/size.h>
36 #include <hermes/common/index.h>
37 #include <hermes/common/debug.h>
38 #include <iostream>
39 
40 #ifdef HERMES_DEVICE_ENABLED
41 
42 namespace hermes::cuda_utils {
43 
45 #define GPU_BLOCK_SIZE 1024
47 #define GPU_BLOCK_SIZE_X 1024
49 #define GPU_BLOCK_SIZE_Y 1024
51 #define GPU_BLOCK_SIZE_Z 64
53 #define GPU_WARP_SIZE 32
54 
66 struct LaunchInfo {
67  // *******************************************************************************************************************
68  // STATIC METHODS
69  // *******************************************************************************************************************
75  static void distribute(u32 max_b, u32 n, u32 &b, u32 &g) {
76  if (n <= max_b) {
77  b = n;
78  g = 1;
79  } else {
80  // round n to a multiple of warp size
81  auto m = (n % GPU_WARP_SIZE) ? ((n + GPU_WARP_SIZE) / GPU_WARP_SIZE) * GPU_WARP_SIZE : n;
82  auto b_candidate = max_b;
83  auto min_candidate = b_candidate;
84  auto min_r = m % b_candidate;
85  while (b_candidate > 128) {
86  auto r = m % b_candidate;
87  if (r < min_r) {
88  min_candidate = b_candidate;
89  min_r = r;
90  }
91  b_candidate >>= 1;
92  }
93  b = min_candidate;
94  g = (m % b) ? (m + b) / b : m / b;
95  }
96  }
102  static void redistribute(dim3 b, dim3 g, dim3 &new_b, dim3 &new_g) {
103  dim3 m(b.x * g.x, b.y * g.y, b.z * g.z);
104  dim3 b_candidate = b;
105  while (b_candidate.x * b_candidate.y * b_candidate.z > GPU_BLOCK_SIZE) {
106  // split max dimension
107  if (b_candidate.x > b_candidate.y && b_candidate.x > b_candidate.z)
108  b_candidate.x >>= 1;
109  else if (b_candidate.y >= b_candidate.x && b_candidate.y >= b_candidate.z)
110  b_candidate.y >>= 1;
111  else
112  b_candidate.z >>= 1;
113  }
114  new_b = b_candidate;
115  new_g = dim3((m.x % new_b.x) ? (m.x + new_b.x) / new_b.x : m.x / new_b.x,
116  (m.y % new_b.y) ? (m.y + new_b.y) / new_b.y : m.y / new_b.y,
117  (m.z % new_b.z) ? (m.z + new_b.z) / new_b.z : m.z / new_b.z);
118  }
119  // *******************************************************************************************************************
120  // CONSTRUCTORS
121  // *******************************************************************************************************************
126  LaunchInfo(u32 n, size_t shared_memory_size_in_bytes = 0, cudaStream_t stream = {}) :
127  shared_memory_size{shared_memory_size_in_bytes},
128  stream_id{stream} {
131  }
137  LaunchInfo(size2 b, size2 s = {0, 0},
138  size_t shared_memory_size_in_bytes = 0,
139  cudaStream_t stream = {}) :
140  shared_memory_size{shared_memory_size_in_bytes},
141  stream_id{stream} {
142  block_size = dim3(b.width, b.height, 1);
143  grid_size = dim3(s.width, s.height, 1);
144  if (s.total() == 0) {
148  }
150  }
156  LaunchInfo(size3 b, size3 s = {0, 0, 0},
157  size_t shared_memory_size_in_bytes = 0,
158  cudaStream_t stream = {}) :
159  shared_memory_size{shared_memory_size_in_bytes},
160  stream_id{stream} {
161  block_size = dim3(b.width, b.height, 1);
162  grid_size = dim3(s.width, s.height, 1);
163  if (s.total() == 0) {
168  }
170  }
171  // *******************************************************************************************************************
172  // METHODS
173  // *******************************************************************************************************************
176  [[nodiscard]] u32 threadCount() const {
177  return grid_size.x * grid_size.y * grid_size.z * block_size.x * block_size.y * block_size.z;
178  }
181  [[nodiscard]] u32 blockThreadCount() const {
182  return block_size.x * block_size.y * block_size.z;
183  }
184  // *******************************************************************************************************************
185  // PUBLIC FIELDS
186  // *******************************************************************************************************************
187  dim3 grid_size;
188  dim3 block_size;
189  size_t shared_memory_size{0};
190  cudaStream_t stream_id{};
191 };
192 
193 // *********************************************************************************************************************
194 // SYNCHRONIZATION
195 // *********************************************************************************************************************
197 class Lock {
198 public:
200  Lock();
201  ~Lock();
206 private:
207  int *mutex{nullptr};
208 };
209 
210 // *********************************************************************************************************************
211 // MEMORY
212 // *********************************************************************************************************************
217 inline cudaMemcpyKind copyDirection(MemoryLocation src, MemoryLocation dst) {
218  if (src == MemoryLocation::DEVICE && dst == MemoryLocation::DEVICE)
219  return cudaMemcpyDeviceToDevice;
220  if (src == MemoryLocation::DEVICE && dst == MemoryLocation::HOST)
221  return cudaMemcpyDeviceToHost;
222  if (src == MemoryLocation::HOST && dst == MemoryLocation::HOST)
223  return cudaMemcpyHostToHost;
224  return cudaMemcpyHostToDevice;
225 }
226 
227 // *********************************************************************************************************************
228 // IO
229 // *********************************************************************************************************************
234 inline std::ostream &operator<<(std::ostream &o, const LaunchInfo &info) {
235  o << "[block size (" << info.block_size.x << " " << info.block_size.y << " " << info.block_size.z << ") ";
236  o << "grid size (" << info.grid_size.x << " " << info.grid_size.y << " " << info.grid_size.z << ")]";
237  return o;
238 }
239 
240 } // namespace hermes::cuda_utils
241 
245 #define HERMES_CUDA_TIME(LAUNCH, ELAPSED_TIME_IN_MS) \
246 { cudaEvent_t cuda_event_start_t, cuda_event_stop_t; \
247  cudaEventCreate(&cuda_event_start_t); \
248  cudaEventCreate(&cuda_event_stop_t); \
249  cudaEventRecord(cuda_event_start_t, 0); \
250  LAUNCH \
251  cudaEventRecord(cuda_event_stop_t, 0); \
252  cudaEventSynchronize(cuda_event_stop_t); \
253  cudaEventElapsedTime(&ELAPSED_TIME_IN_MS, cuda_event_start_t, cuda_event_stop_t); }
254 
256 #define HERMES_CUDA_DEVICE_SYNCHRONIZE HERMES_CHECK_CUDA_CALL(cudaDeviceSynchronize());
257 
262 #define HERMES_CUDA_LAUNCH(LAUNCH_INFO, NAME, ...) \
263 { \
264  auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
265  NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
266  HERMES_CHECK_LAST_CUDA \
267 }
268 
273 #define HERMES_CUDA_LAUNCH_AND_SYNC(LAUNCH_INFO, NAME, ...) \
274 { \
275  auto _hli_ = hermes::cuda_utils::LaunchInfo LAUNCH_INFO; \
276  NAME<<< _hli_.grid_size, _hli_.block_size, _hli_.shared_memory_size, _hli_.stream_id >>> (__VA_ARGS__); \
277  HERMES_CHECK_LAST_CUDA_CALL \
278  HERMES_CUDA_DEVICE_SYNCHRONIZE \
279 }
280 
282 #define HERMES_CUDA_THREAD_INDEX_I \
283  u32 i = threadIdx.x + blockIdx.x * blockDim.x
284 
286 #define HERMES_CUDA_THREAD_INDEX_IJ \
287  hermes::index2 ij(threadIdx.x + blockIdx.x * blockDim.x, \
288  threadIdx.y + blockIdx.y * blockDim.y)
289 
291 #define HERMES_CUDA_THREAD_INDEX_IJK \
292  hermes::index3 ijk(threadIdx.x + blockIdx.x * blockDim.x, \
293  threadIdx.y + blockIdx.y * blockDim.y, \
294  threadIdx.z + blockIdx.z * blockDim.z);
295 
297 #define HERMES_CUDA_RETURN_IF_NOT_THREAD_0 \
298 { HERMES_CUDA_THREAD_INDEX_IJK \
299  if(ijk != hermes::index3(0,0,0)) \
300  return; \
301 }
302 
306 #define HERMES_CUDA_THREAD_INDEX_LT(I, BOUNDS) \
307  u32 I = threadIdx.x + blockIdx.x * blockDim.x; \
308  if(I >= (BOUNDS)) return
309 
313 #define HERMES_CUDA_THREAD_INDEX2_LT(IJ, BOUNDS) \
314  hermes::index2 IJ(threadIdx.x + blockIdx.x * blockDim.x, \
315  threadIdx.y + blockIdx.y * blockDim.y); \
316  if(IJ >= (BOUNDS)) return
317 
321 #define HERMES_CUDA_THREAD_INDEX3_LT(IJK, BOUNDS) \
322  hermes::index3 IJK(threadIdx.x + blockIdx.x * blockDim.x, \
323  threadIdx.y + blockIdx.y * blockDim.y, \
324  threadIdx.z + blockIdx.z * blockDim.z); \
325  if(IJK >= (BOUNDS)) return
326 
329 #define HERMES_CUDA_THREAD_INDEX_I_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX_LT(i, BOUNDS)
332 #define HERMES_CUDA_THREAD_INDEX_IJ_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX2_LT(ij, BOUNDS)
335 #define HERMES_CUDA_THREAD_INDEX_IJK_LT(BOUNDS) HERMES_CUDA_THREAD_INDEX3_LT(ijk, BOUNDS)
336 
337 // *********************************************************************************************************************
338 // ERROR
339 // *********************************************************************************************************************
341 #define HERMES_CHECK_CUDA_CALL(err) \
342  { \
343  auto hermes_cuda_result = (err); \
344  if(hermes_cuda_result != cudaSuccess) { \
345  HERMES_LOG_CRITICAL(cudaGetErrorString(hermes_cuda_result)); \
346  cudaDeviceReset(); \
347  exit(99); \
348  } \
349  }
351 #define HERMES_CHECK_LAST_CUDA_CALL HERMES_CHECK_CUDA_CALL(cudaGetLastError());
352 
355  int nDevices;
356 
357  cudaGetDeviceCount(&nDevices);
358  for (int i = 0; i < nDevices; i++) {
359  cudaDeviceProp prop{};
360  cudaGetDeviceProperties(&prop, i);
361  printf("Device Number: %d\n", i);
362  printf(" Device name: %s\n", prop.name);
363  printf(" Major compute capability: %d\n", prop.major);
364  printf(" Minor compute capability: %d\n", prop.minor);
365  printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
366  printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
367  printf(" Peak Memory Bandwidth (GB/s): %f\n",
368  2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
369  printf(" Device can map host memory with "
370  "cudaHostAlloc/cudaHostGetDevicePointer: %d\n",
371  prop.canMapHostMemory);
372  printf(" Clock frequency in kilohertz: %d\n", prop.clockRate);
373  printf(" Compute mode (See cudaComputeMode): %d\n", prop.computeMode);
374  printf(" Device can concurrently copy memory and execute a kernel: %d\n",
375  prop.deviceOverlap);
376  printf(" Device is integrated as opposed to discrete: %d\n",
377  prop.integrated);
378  printf(" Specified whether there is a run time limit on kernels: %d\n",
379  prop.kernelExecTimeoutEnabled);
380  printf(" Maximum size of each dimension of a grid: %d %d %d\n",
381  prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
382  printf(" Maximum size of each dimension of a block: %d %d %d\n",
383  prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
384  printf(" Maximum number of threads per block: %d\n",
385  prop.maxThreadsPerBlock);
386  printf(" Maximum pitch in bytes allowed by memory copies: %zu\n",
387  prop.memPitch);
388  printf(" Number of multiprocessors on device: %d\n",
389  prop.multiProcessorCount);
390  printf(" 32-bit registers available per block: %d\n", prop.regsPerBlock);
391  printf(" Shared memory available per block in bytes: %zu\n",
392  prop.sharedMemPerBlock);
393  printf(" Alignment requirement for textures: %zu\n", prop.textureAlignment);
394  printf(" Constant memory available on device in bytes: %zu\n",
395  prop.totalConstMem);
396  printf(" Global memory available on device in bytes: %zu\n",
397  prop.totalGlobalMem);
398  printf(" Warp size in threads: %d\n", prop.warpSize);
399  }
400 }
401 
404  size_t free_byte;
405  size_t total_byte;
406  HERMES_CHECK_CUDA_CALL(cudaMemGetInfo(&free_byte, &total_byte));
407  auto free_db = (double) free_byte;
408  auto total_db = (double) total_byte;
409  double used_db = total_db - free_db;
410  printf("GPU memory usage: used = %f, free = %f MB, total = %f MB\n",
411  used_db / 1024.0 / 1024.0, free_db / 1024.0 / 1024.0,
412  total_db / 1024.0 / 1024.0);
413 }
414 
416 #define CUDA_MEMORY_USAGE \
417  { \
418  std::cerr << "[INFO][" << __FILE__ << "][" << __LINE__ << "]"; \
419  hermes_print_cuda_memory_usage(); \
420  }
421 #else
422 
423 #define HERMES_CHECK_CUDA_CALL(err)
424 #define CUDA_MEMORY_USAGE
425 
426 #endif
427 
428 #endif // HERMES_COMMON_CUDA_UTILS_H
429 
Holds 2-dimensional size.
Definition: size.h:47
T height
1-th dimension size
Definition: size.h:133
T width
0-th dimension size
Definition: size.h:132
Holds 2-dimensional size.
Definition: size.h:142
T width
0-th dimension size
Definition: size.h:224
T height
1-th dimension size
Definition: size.h:225
T depth
2-th dimension size
Definition: size.h:226
Synchronization lock.
Definition: cuda_utils.h:197
HERMES_DEVICE_FUNCTION void unlock()
unlocks mutex
Definition: cuda_utils.cpp:49
Lock()
Default constructor.
Definition: cuda_utils.cpp:35
HERMES_DEVICE_FUNCTION void lock()
locks mutex
Definition: cuda_utils.cpp:45
std::ostream & operator<<(std::ostream &o, const LaunchInfo &info)
LaunchInfo support for std::ostream << operator.
Definition: cuda_utils.h:234
#define GPU_BLOCK_SIZE_X
Maximum thread block size in 1st dimension.
Definition: cuda_utils.h:47
#define GPU_WARP_SIZE
Warp size.
Definition: cuda_utils.h:53
#define GPU_BLOCK_SIZE_Y
Maximum thread block size in 2nd dimension.
Definition: cuda_utils.h:49
#define GPU_BLOCK_SIZE
Maximum thread block size.
Definition: cuda_utils.h:45
#define GPU_BLOCK_SIZE_Z
Maximum thread block size in 3rd dimension.
Definition: cuda_utils.h:51
cudaMemcpyKind copyDirection(MemoryLocation src, MemoryLocation dst)
Computes cuda flag of memory block copy direction.
Definition: cuda_utils.h:217
Debug, logging and assertion macros.
MemoryLocation
Specifies where memory is stored.
Definition: defs.h:204
void hermes_print_cuda_memory_usage()
Outputs in stdout current GPU memory usage.
Definition: cuda_utils.h:403
#define HERMES_DEVICE_FUNCTION
Specifies that the function can only be called from device side.
Definition: defs.h:47
uint32_t u32
32 bit size unsigned integer type
Definition: defs.h:88
#define HERMES_CHECK_CUDA_CALL(err)
Checks (and logs) a CUDA method return code for errors.
Definition: cuda_utils.h:341
void hermes_print_cuda_devices()
Outputs in stdout information about all devices in the current machine.
Definition: cuda_utils.h:354
#define HERMES_CHECK_EXP(expr)
Warns if expression is false.
Definition: debug.h:95
Set of multi-dimensional integer iterators.
@ info
logs into info stream
Set of multi-dimensional size representations.
Holds CUDA launch parameters.
Definition: cuda_utils.h:66
u32 blockThreadCount() const
Computes the total number of threads per block.
Definition: cuda_utils.h:181
static void redistribute(dim3 b, dim3 g, dim3 &new_b, dim3 &new_g)
Redistributes threads to fit the gpu block size limits.
Definition: cuda_utils.h:102
LaunchInfo(size3 b, size3 s={0, 0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
3-dimensional launch constructor
Definition: cuda_utils.h:156
dim3 block_size
cuda block size (in number of threads)
Definition: cuda_utils.h:188
LaunchInfo(size2 b, size2 s={0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
2-dimensional launch constructor
Definition: cuda_utils.h:137
static void distribute(u32 max_b, u32 n, u32 &b, u32 &g)
Recomputes block and grid sizes to achieve good occupancy.
Definition: cuda_utils.h:75
size_t shared_memory_size
size of shared memory in bytes
Definition: cuda_utils.h:189
u32 threadCount() const
Computes the total number of threads.
Definition: cuda_utils.h:176
dim3 grid_size
cuda grid size (in number of blocks)
Definition: cuda_utils.h:187
cudaStream_t stream_id
launch stream identifier
Definition: cuda_utils.h:190
LaunchInfo(u32 n, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
1-dimensional launch constructor
Definition: cuda_utils.h:126