namespace cg = cooperative_group;
// thread block
cg::thread_block block = cg::this_thread_block();
// thread warp
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
warp.thread_rank(): index of the thread within the warp
warp.size(): number of threads in the warp (generally 32)

If you want to use cooperative groups to build reduce operations:
// Warp-level sum: each warp reduces one run of C floats starting at inp + idx * C.
// idx = (warps per block) * blockIdx.x + (this warp's index within the block),
// so every warp in the grid gets a distinct row. Without meta_group_rank()
// all warps of a block would reduce the same row.
// NOTE(review): no bounds check on idx here — confirm the launch covers exactly
// the number of rows, or add a guard against the row count.
int idx = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank();
float sum = 0.0f;
float *x = inp + idx * C;
// Each thread accumulates a partial sum over elements thread_rank,
// thread_rank + size, thread_rank + 2 * size, ...
for (int i = warp.thread_rank(); i < C; i += warp.size()) {
    sum += x[i];
}
// Combine the per-thread partials across the warp; cg::reduce returns the
// total to every thread in the group (declared in <cooperative_groups/reduce.h>).
sum = cg::reduce(warp, sum, cg::plus<float>{});