__host__, __global__, __device__
// Minimal empty kernel: __global__ marks a function that runs on the GPU and is launched from host code.
__global__ void cuda_hello(){ } |
compute <<<gs,bs>>>(<args>)
// Kernel launch: the <<<...>>> execution configuration gives the number of blocks in the grid and the number of threads per block.
cuda_hello <<<blocks_per_grid,threads_per_block>>> (); |
__shared__, __device__, __constant__, ...
// __device__ at file scope places STR in GPU global memory, so kernels can read it.
__device__ const char *STR = "HELLO WORLD!"; |
cudaMemcpy(h_obj, d_obj, bytes, cudaMemcpyDeviceToHost)
// Copy input to the device, then copy the result back; the last cudaMemcpy argument selects the transfer direction.
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice); cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); |
__syncthreads()
// Blocks the host thread until all previously issued device work has completed.
cudaDeviceSynchronize(); |
__global__ tells the compiler nvcc to make a function a kernel (and compile/run it for the GPU, instead of the CPU)
// Typical host-side CUDA workflow (numbered excerpt): declare host and device pointers, allocate with malloc (host) and cudaMalloc (device), copy inputs to the device, launch the kernel, copy results back. (d_b, d_c, blk_in_grid, thr_per_blk are set up analogously outside this excerpt.)
1 double *h_a; // Host input vectors 2 double *d_a; // Device input vectors 3 h_a = (double*)malloc(bytes); // Allocate memory for each vector on host 4 cudaMalloc(&d_a, bytes); // Allocate memory for each vector on GPU 5 cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice); // Copy data from host array h_a to device arrays d_a 6 add_vectors<<<blk_in_grid, thr_per_blk>>>(d_a, d_b, d_c); // Execute the kernel 7 cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); // Copy data from device array d_c to host array h_c |
|
|
#include <stdio.h>
#include <unistd.h>

// Device-resident string; every GPU thread reads one character of it.
__device__ const char *STR = "HELLO WORLD!";
const int STR_LENGTH = 12;

// Kernel: each thread prints a greeting, one character of STR, and its
// globally unique thread ID.
//   blockIdx.x  : block index within the grid (x-direction)
//   threadIdx.x : thread index within the block
//   blockDim.x  : number of threads in a block
__global__ void cuda_hello() {
    printf("Hello World from GPU! (%d ,%d) : %c ThreadID %d \n",
           blockIdx.x, threadIdx.x,
           STR[threadIdx.x % STR_LENGTH],
           (threadIdx.x + blockIdx.x * blockDim.x));
}

int main() {
    printf("Hello World from CPU!\n");
    sleep(2);

    int threads_per_block = 12;
    int blocks_per_grid = 2;

    cuda_hello<<<blocks_per_grid, threads_per_block>>>();

    // A kernel launch returns no error itself; launch-configuration errors
    // surface through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Halt host thread execution on CPU until the device has finished
    // processing all previously requested tasks; this also flushes the
    // device-side printf buffer and reports any execution error.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Note: Needs compute capability >= 2.0, so compile with:
//   nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld

#define N 720        // number of computations
#define GRID_D1 20   // grid size, x-direction
#define GRID_D2 3    // grid size, y-direction
#define BLOCK_D1 12  // block size, x-direction
#define BLOCK_D2 1   // block size, y-direction
#define BLOCK_D3 1   // block size, z-direction

// Kernel executed by every thread of the 2D-grid / 3D-block launch.
// Derives a unique flat thread index from the built-in coordinate variables
// {threadIdx, blockIdx, blockDim, gridDim} and prints it — but only for the
// first and last `chunk` threads, to keep the output readable.
__global__ void hello(void)
{
    int block_id          = blockIdx.x + blockIdx.y * gridDim.x;          // flat id of this block
    int threads_per_block = blockDim.x * blockDim.y * blockDim.z;         // threads in one block
    int local_id          = threadIdx.z * (blockDim.x * blockDim.y)       // flat id of this thread
                          + threadIdx.y * blockDim.x                      //   within its block
                          + threadIdx.x;
    int idx      = block_id * threads_per_block + local_id;               // overall thread index
    int nthreads = threads_per_block * gridDim.x * gridDim.y;             // total number of threads
    int chunk    = 20;  // vary this value to see the changes at the output

    // Threads in the middle of the index range stay silent.
    if (idx >= chunk && idx <= nthreads - chunk)
        return;

    if (idx < N) {
        printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d \n",
               blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
               threadIdx.x, threadIdx.y, threadIdx.z, idx);
    } else {
        printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d [### this thread would not be used for N=%d ###]\n",
               blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
               threadIdx.x, threadIdx.y, threadIdx.z, idx, N);
    }
}
// Host entry point: configures the launch geometry, fires the `hello` kernel,
// and checks both launch-time and execution-time errors.
int main(int argc, char **argv)
{
    // Objects containing the block and grid info (3D block, 2D grid).
    const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
    const dim3 gridSize(GRID_D1, GRID_D2, 1);

    int nthreads = BLOCK_D1 * BLOCK_D2 * BLOCK_D3 * GRID_D1 * GRID_D2;  // total number of threads
    if (nthreads < N) {
        printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n", N);
    } else {
        printf("Launching %d threads (N=%d)\n", nthreads, N);
    }

    // Launch the kernel on the specified grid of thread blocks.
    hello<<<gridSize, blockSize>>>();

    // A launch returns no error directly: bad-configuration errors surface via
    // cudaGetLastError(), execution errors at the next synchronizing call.
    cudaError_t cudaerr = cudaGetLastError();
    if (cudaerr == cudaSuccess) {
        // Need to flush prints, otherwise none of the prints from within the
        // kernel will show up as program exit does not flush the print buffer.
        cudaerr = cudaDeviceSynchronize();
    }
    if (cudaerr != cudaSuccess) {
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
    } else {
        printf("kernel launch success!\n");
    }
    printf("That's all!\n");
    return 0;
}