Holds CUDA launch parameters.
More...
|
| LaunchInfo (u32 n, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={}) |
| 1-dimensional launch constructor More...
|
|
| LaunchInfo (size2 b, size2 s={0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={}) |
| 2-dimensional launch constructor More...
|
|
| LaunchInfo (size3 b, size3 s={0, 0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={}) |
| 3-dimensional launch constructor More...
|
|
u32 | threadCount () const |
| Computes the total number of threads. More...
|
|
u32 | blockThreadCount () const |
| Computes the total number of threads per block. More...
|
|
|
static void | distribute (u32 max_b, u32 n, u32 &b, u32 &g) |
| Recomputes block and grid sizes to achieve good occupancy. More...
|
|
static void | redistribute (dim3 b, dim3 g, dim3 &new_b, dim3 &new_g) |
| Redistributes threads to fit the gpu block size limits. More...
|
|
Holds CUDA launch parameters.
The number of threads in a CUDA launch is subject to the following limits:
- Each block cannot have more than 512/1024 threads in total (Compute Capability 1.x or 2.x and later respectively)
- The maximum dimensions of each block are limited to [512,512,64]/[1024,1024,64]
- Each block cannot consume more than 8k/16k/32k/64k/32k/64k/32k/64k/32k/64k registers total (Compute 1.0,1.1/1.2,1.3/2.x-/3.0/3.2/3.5-5.2/5.3/6-6.1/6.2/7.0)
- Each block cannot consume more than 16KB/48KB/96KB of shared memory (Compute 1.x/2.x-6.2/7.0)
◆ LaunchInfo() [1/3]
hermes::cuda_utils::LaunchInfo::LaunchInfo |
( |
u32 |
n, |
|
|
size_t |
shared_memory_size_in_bytes = 0 , |
|
|
cudaStream_t |
stream = {} |
|
) |
| |
|
inline |
1-dimensional launch constructor
- Parameters
-
n | thread count |
shared_memory_size_in_bytes | (per block) |
stream | stream id |
◆ LaunchInfo() [2/3]
hermes::cuda_utils::LaunchInfo::LaunchInfo |
( |
size2 |
b, |
|
|
size2 |
s = {0, 0} , |
|
|
size_t |
shared_memory_size_in_bytes = 0 , |
|
|
cudaStream_t |
stream = {} |
|
) |
| |
|
inline |
2-dimensional launch constructor
- Parameters
-
b | block size (threads per block) |
s | grid size (blocks) |
shared_memory_size_in_bytes | (per block) |
stream | stream id |
◆ LaunchInfo() [3/3]
hermes::cuda_utils::LaunchInfo::LaunchInfo |
( |
size3 |
b, |
|
|
size3 |
s = {0, 0, 0} , |
|
|
size_t |
shared_memory_size_in_bytes = 0 , |
|
|
cudaStream_t |
stream = {} |
|
) |
| |
|
inline |
3-dimensional launch constructor
- Parameters
-
b | block size (threads per block) |
s | grid size (blocks) |
shared_memory_size_in_bytes | (per block) |
stream | stream id |
◆ blockThreadCount()
u32 hermes::cuda_utils::LaunchInfo::blockThreadCount |
( |
| ) |
const |
|
inline |
Computes the total number of threads per block.
- Returns: the total number of threads per block
◆ distribute()
static void hermes::cuda_utils::LaunchInfo::distribute |
( |
u32 |
max_b, |
|
|
u32 |
n, |
|
|
u32 & |
b, |
|
|
u32 & |
g |
|
) |
| |
|
inline static |
Recomputes block and grid sizes to achieve good occupancy.
- Parameters
-
max_b | maximum number of threads per block |
n | total number of threads |
b | output block size |
g | output grid size |
◆ redistribute()
static void hermes::cuda_utils::LaunchInfo::redistribute |
( |
dim3 |
b, |
|
|
dim3 |
g, |
|
|
dim3 & |
new_b, |
|
|
dim3 & |
new_g |
|
) |
| |
|
inline static |
Redistributes threads to fit the gpu block size limits.
- Parameters
-
◆ threadCount()
u32 hermes::cuda_utils::LaunchInfo::threadCount |
( |
| ) |
const |
|
inline |
Computes the total number of threads.
- Returns: the total number of threads
The documentation for this struct was generated from the following file: