Holds CUDA launch parameters. More...

#include <cuda_utils.h>

Public Member Functions
	LaunchInfo (u32 n, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
	1-dimensional launch constructor

	LaunchInfo (size2 b, size2 s={0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
	2-dimensional launch constructor

	LaunchInfo (size3 b, size3 s={0, 0, 0}, size_t shared_memory_size_in_bytes=0, cudaStream_t stream={})
	3-dimensional launch constructor

u32	threadCount () const
	Computes the total number of threads.

u32	blockThreadCount () const
	Computes the total number of threads per block.

Static Public Member Functions
static void	distribute (u32 max_b, u32 n, u32 &b, u32 &g)
	Recomputes block and grid sizes to achieve good occupancy.

static void	redistribute (dim3 b, dim3 g, dim3 &new_b, dim3 &new_g)
	Redistributes threads to fit the GPU block size limits.

Public Attributes
dim3	grid_size
	CUDA grid size (in number of blocks)

dim3	block_size
	CUDA block size (in number of threads)

size_t	shared_memory_size {0}
	size of shared memory in bytes

cudaStream_t	stream_id {}
	launch stream identifier

Detailed Description

Holds CUDA launch parameters.

CUDA imposes the following limits on the thread configuration and per-block resources of a kernel launch:

Each block cannot have more than 512/1024 threads in total (Compute Capability 1.x or 2.x and later respectively)
The maximum dimensions of each block are limited to [512,512,64]/[1024,1024,64] (Compute 1.x/2.x or later)
Each block cannot consume more than 8k/16k/32k/64k/32k/64k/32k/64k/32k/64k registers total (Compute 1.0,1.1/1.2,1.3/2.x-/3.0/3.2/3.5-5.2/5.3/6-6.1/6.2/7.0)
Each block cannot consume more than 16 KB/48 KB/96 KB of shared memory (Compute 1.x/2.x-6.2/7.0)

Constructor & Destructor Documentation

◆ LaunchInfo() [1/3]

hermes::cuda_utils::LaunchInfo::LaunchInfo	(	u32	n,
		size_t	shared_memory_size_in_bytes = `0`,
		cudaStream_t	stream = `{}`
	)

inline

1-dimensional launch constructor

Parameters

n	thread count
shared_memory_size_in_bytes	shared memory size in bytes (per block)
stream	stream id

◆ LaunchInfo() [2/3]

hermes::cuda_utils::LaunchInfo::LaunchInfo	(	size2	b,
		size2	s = `{0, 0}`,
		size_t	shared_memory_size_in_bytes = `0`,
		cudaStream_t	stream = `{}`
	)

inline

2-dimensional launch constructor

Parameters

b	block size (threads per block)
s	grid size (blocks)
shared_memory_size_in_bytes	shared memory size in bytes (per block)
stream	stream id

◆ LaunchInfo() [3/3]

hermes::cuda_utils::LaunchInfo::LaunchInfo	(	size3	b,
		size3	s = `{0, 0, 0}`,
		size_t	shared_memory_size_in_bytes = `0`,
		cudaStream_t	stream = `{}`
	)

inline

3-dimensional launch constructor

Parameters

b	block size (threads per block)
s	grid size (blocks)
shared_memory_size_in_bytes	shared memory size in bytes (per block)
stream	stream id

Member Function Documentation

◆ blockThreadCount()

u32 hermes::cuda_utils::LaunchInfo::blockThreadCount ( ) const

inline

Computes the total number of threads per block.

Returns

the total number of threads per block

◆ distribute()

static void hermes::cuda_utils::LaunchInfo::distribute	(	u32	max_b,
		u32	n,
		u32 &	b,
		u32 &	g
	)

inline static

Recomputes block and grid sizes to achieve good occupancy.

Parameters

max_b	maximum number of threads per block
n	total number of threads
b	output block size
g	output grid size

◆ redistribute()

static void hermes::cuda_utils::LaunchInfo::redistribute	(	dim3	b,
		dim3	g,
		dim3 &	new_b,
		dim3 &	new_g
	)

inline static

Redistributes threads to fit the GPU block size limits.

Parameters

b	input block size
g	input grid size
new_b	output block size
new_g	output grid size

◆ threadCount()

u32 hermes::cuda_utils::LaunchInfo::threadCount ( ) const

inline

Computes the total number of threads.

Returns

the total number of threads

The documentation for this struct was generated from the following file:

hermes/common/cuda_utils.h

Public Member Functions

Static Public Member Functions

Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ LaunchInfo() [1/3]

◆ LaunchInfo() [2/3]

◆ LaunchInfo() [3/3]

Member Function Documentation

◆ blockThreadCount()

◆ distribute()

◆ redistribute()

◆ threadCount()