__host__, __global__, __device__
// Minimal empty kernel: __global__ marks a function that runs on the GPU and is launched from host code.
__global__ void cuda_hello(){ } |
compute <<<gs,bs>>>(<args>)
// Kernel launch: the <<<...>>> execution configuration gives the number of blocks in the grid and the number of threads per block.
cuda_hello <<<blocks_per_grid,threads_per_block>>> (); |
__shared__, __device__, __constant__, ...
// __device__ at file scope places STR in GPU global memory, so kernels can read it.
__device__ const char *STR = "HELLO WORLD!"; |
cudaMemcpy(h_obj, d_obj, bytes, cudaMemcpyDeviceToHost)
// Copy input to the device, then copy the result back; the last cudaMemcpy argument selects the transfer direction.
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice); cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); |
__syncthreads()
// Blocks the host thread until all previously issued device work has completed.
cudaDeviceSynchronize(); |
__global__ tells the compiler nvcc to make a function a kernel (and compile/run it for the GPU, instead of the CPU)
// Typical host-side CUDA workflow (numbered excerpt): declare host and device pointers, allocate with malloc (host) and cudaMalloc (device), copy inputs to the device, launch the kernel, copy results back. (d_b, d_c, blk_in_grid, thr_per_blk are set up analogously outside this excerpt.)
1 double *h_a; // Host input vectors 2 double *d_a; // Device input vectors 3 h_a = (double*)malloc(bytes); // Allocate memory for each vector on host 4 cudaMalloc(&d_a, bytes); // Allocate memory for each vector on GPU 5 cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice); // Copy data from host array h_a to device arrays d_a 6 add_vectors<<<blk_in_grid, thr_per_blk>>>(d_a, d_b, d_c); // Execute the kernel 7 cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); // Copy data from device array d_c to host array h_c |
|
|
#include <stdio.h>
#include <unistd.h>

// Device-resident string; every GPU thread reads one character of it.
__device__ const char *STR = "HELLO WORLD!";
const int STR_LENGTH = 12;

// Kernel: each thread prints a greeting, one character of STR, and its
// globally unique thread ID.
//   blockIdx.x  : block index within the grid (x-direction)
//   threadIdx.x : thread index within the block
//   blockDim.x  : number of threads in a block
__global__ void cuda_hello() {
    printf("Hello World from GPU! (%d ,%d) : %c ThreadID %d \n",
           blockIdx.x, threadIdx.x,
           STR[threadIdx.x % STR_LENGTH],
           (threadIdx.x + blockIdx.x * blockDim.x));
}

int main() {
    printf("Hello World from CPU!\n");
    sleep(2);

    int threads_per_block = 12;
    int blocks_per_grid = 2;

    cuda_hello<<<blocks_per_grid, threads_per_block>>>();

    // A kernel launch returns no error itself; launch-configuration errors
    // surface through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Halt host thread execution on CPU until the device has finished
    // processing all previously requested tasks; this also flushes the
    // device-side printf buffer and reports any execution error.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Note: Needs compute capability >= 2.0, so compile with:
//   nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld

#define N 720        // number of computations
#define GRID_D1 20   // grid size, x-direction
#define GRID_D2 3    // grid size, y-direction
#define BLOCK_D1 12  // block size, x-direction
#define BLOCK_D2 1   // block size, y-direction
#define BLOCK_D3 1   // block size, z-direction

// Kernel executed by every thread of the 2D-grid / 3D-block launch.
// Derives a unique flat thread index from the built-in coordinate variables
// {threadIdx, blockIdx, blockDim, gridDim} and prints it — but only for the
// first and last `chunk` threads, to keep the output readable.
__global__ void hello(void)
{
    int block_id          = blockIdx.x + blockIdx.y * gridDim.x;          // flat id of this block
    int threads_per_block = blockDim.x * blockDim.y * blockDim.z;         // threads in one block
    int local_id          = threadIdx.z * (blockDim.x * blockDim.y)       // flat id of this thread
                          + threadIdx.y * blockDim.x                      //   within its block
                          + threadIdx.x;
    int idx      = block_id * threads_per_block + local_id;               // overall thread index
    int nthreads = threads_per_block * gridDim.x * gridDim.y;             // total number of threads
    int chunk    = 20;  // vary this value to see the changes at the output

    // Threads in the middle of the index range stay silent.
    if (idx >= chunk && idx <= nthreads - chunk)
        return;

    if (idx < N) {
        printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d \n",
               blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
               threadIdx.x, threadIdx.y, threadIdx.z, idx);
    } else {
        printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d [### this thread would not be used for N=%d ###]\n",
               blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
               threadIdx.x, threadIdx.y, threadIdx.z, idx, N);
    }
}
// Host entry point: configures the launch geometry, fires the `hello` kernel,
// and checks both launch-time and execution-time errors.
int main(int argc, char **argv)
{
    // Objects containing the block and grid info (3D block, 2D grid).
    const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
    const dim3 gridSize(GRID_D1, GRID_D2, 1);

    int nthreads = BLOCK_D1 * BLOCK_D2 * BLOCK_D3 * GRID_D1 * GRID_D2;  // total number of threads
    if (nthreads < N) {
        printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n", N);
    } else {
        printf("Launching %d threads (N=%d)\n", nthreads, N);
    }

    // Launch the kernel on the specified grid of thread blocks.
    hello<<<gridSize, blockSize>>>();

    // A launch returns no error directly: bad-configuration errors surface via
    // cudaGetLastError(), execution errors at the next synchronizing call.
    cudaError_t cudaerr = cudaGetLastError();
    if (cudaerr == cudaSuccess) {
        // Need to flush prints, otherwise none of the prints from within the
        // kernel will show up as program exit does not flush the print buffer.
        cudaerr = cudaDeviceSynchronize();
    }
    if (cudaerr != cudaSuccess) {
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
    } else {
        printf("kernel launch success!\n");
    }
    printf("That's all!\n");
    return 0;
}