Hands-on; GPU parallelization

$ lspci | grep -i nvidia
01:00.0 VGA compatible controller: NVIDIA Corporation GP108
[GeForce GT 1030] (rev a1)
01:00.1 Audio device: NVIDIA Corporation GP108 High Definition
Audio Controller (rev a1)

Installation Instructions:
$ wget https://developer.download.nvidia.com/compute/cuda/repos/
ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
$ sudo dpkg -i cuda-keyring_1.0-1_all.deb
$ sudo apt-get update
$ sudo apt-get -y install cuda
$ sudo reboot
$ export PATH=/usr/local/cuda-12.0/bin${PATH:+:${PATH}}
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.0/lib64${LD_LIBRARY_PATH:
+:${LD_LIBRARY_PATH}}

Hello world I: code24.cu Follow the steps below for executing Cuda code;

nvcc -o code24 code24.cu
./code24

    1 #include <stdio.h>
    2 #include <unistd.h>
    3 __device__ const char *STR = "HELLO WORLD!";
    4 const int STR_LENGTH = 12;
    5 __global__ void cuda_hello(){
    6   // blockIdx.x: Block index within the grid  in the x direction
    7   // threadIdx.x: Thread index within the block
    8   // blockDim.x,y,z # of threads in a block
    9   printf("Hello World from GPU! (%d ,%d) : %c ThreadID %d \n", blockIdx.x, threadIdx.x, STR[threadIdx.x % STR_LENGTH], (threadIdx.x +blockIdx.x*blockDim.x));
   10 }
   11 /*
   12   ./deviceQuery Starting...
   13 
   14   CUDA Device Query (Runtime API) version (CUDART static linking)
   15 
   16   Detected 3 CUDA Capable device(s)
   17 
   18   Device 2: "NVIDIA GeForce GT 1030"
   19   CUDA Driver Version / Runtime Version          12.0 / 11.8
   20   CUDA Capability Major/Minor version number:    6.1
   21   Total amount of global memory:                 1998 MBytes (2095185920 bytes)
   22   (003) Multiprocessors, (128) CUDA Cores/MP:    384 CUDA Cores
   23   GPU Max Clock rate:                            1468 MHz (1.47 GHz)
   24   Memory Clock rate:                             3004 Mhz
   25   Memory Bus Width:                              64-bit
   26   L2 Cache Size:                                 524288 bytes
   27   Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
   28   Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
   29   Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
   30   Total amount of constant memory:               65536 bytes
   31   Total amount of shared memory per block:       49152 bytes
   32   Total shared memory per multiprocessor:        98304 bytes
   33   Total number of registers available per block: 65536
   34   Warp size:                                     32
   35   Maximum number of threads per multiprocessor:  2048
   36   Maximum number of threads per block:           1024
   37   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
   38   Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
   39   Maximum memory pitch:                          2147483647 bytes
   40   Texture alignment:                             512 bytes
   41   Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
   42   Run time limit on kernels:                     Yes
   43   Integrated GPU sharing Host Memory:            No
   44   Support host page-locked memory mapping:       Yes
   45   Alignment requirement for Surfaces:            Yes
   46   Device has ECC support:                        Disabled
   47   Device supports Unified Addressing (UVA):      Yes
   48   Device supports Managed Memory:                Yes
   49   Device supports Compute Preemption:            Yes
   50   Supports Cooperative Kernel Launch:            Yes
   51   Supports MultiDevice Co-op Kernel Launch:      Yes
   52   Device PCI Domain ID / Bus ID / location ID:   0 / 4 / 0
   53   Compute Mode:
   54   < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
   55   deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.0, CUDA Runtime Version = 11.8, NumDevs = 3
   56   Result = PASS
   57 */
   58 int main() {
   59   printf("Hello World from CPU!\n");
   60   sleep(2);
   61   int threads_per_block=12;
   62   int blocks_per_grid=2;
   63   cuda_hello <<< blocks_per_grid, threads_per_block >>> ();
   64   cudaDeviceSynchronize(); /* Halt host thread execution on CPU until the device has finished processing all previously requested tasks */
   65   return 0;
   66 }

Hello world II: code25.cu Follow the steps below for executing Cuda code;

nvcc -o code25 code25.cu
./code25

    1 #include <stdio.h>
    2 #include <cuda.h>
    3 #include <cuda_runtime.h>
    4 
    5 #define N 720 // number of computations
    6 #define GRID_D1 20 // constants for grid and block sizes 
    7 #define GRID_D2 3  // constants for grid and block sizes 
    8 #define BLOCK_D1 12 // constants for grid and block sizes 
    9 #define BLOCK_D2 1 // constants for grid and block sizes 
   10 #define BLOCK_D3 1 // constants for grid and block sizes 
   11 
   12 __global__ void hello(void) // this is the kernel function called for each thread
   13 {    
   14   // we use the CUDA variables {threadIdx, blockIdx, blockDim, gridDim} to determine a unique ID for each thread
   15   int myblock = blockIdx.x + blockIdx.y * gridDim.x; // id of the block
   16   int blocksize = blockDim.x * blockDim.y * blockDim.z;     // size of each block (within grid of blocks)
   17   int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; // id of thread in a given block
   18   int idx = myblock * blocksize + subthread; // assign overall id/index of the thread
   19   int nthreads=blocksize*gridDim.x*gridDim.y; // Total # of threads
   20   int chunk=20; // Vary this value to see the changes at the output
   21   if(idx < chunk || idx > nthreads-chunk) { // print buffer from within the kernel is limited so only print for first and last chunks of threads
   22     if (idx < N){      
   23       printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d \n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, threadIdx.x, threadIdx.y, threadIdx.z, idx);
   24     }
   25     else
   26       {
   27 	printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d [### this thread would not be used for N=%d ###]\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, threadIdx.x, threadIdx.y, threadIdx.z, idx, N);
   28       }
   29   }
   30 }
   31 
   32 int main(int argc,char **argv)
   33 {
   34   // objects containing the block and grid info
   35   const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
   36   const dim3 gridSize(GRID_D1, GRID_D2, 1);
   37   int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*GRID_D1*GRID_D2; // Total # of threads
   38   if (nthreads < N){
   39     printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N);
   40   } 
   41   else 
   42     {
   43       printf("Launching %d threads (N=%d)\n",nthreads,N);
   44     }
   45   hello<<<gridSize, blockSize>>>(); // launch the kernel on the specified grid of thread blocks
   46   // Need to flush prints, otherwise none of the prints from within the kernel will show up
   47   // as program exit does not flush the print buffer.
   48   cudaError_t cudaerr = cudaDeviceSynchronize();
   49   if (cudaerr){
   50     printf("kernel launch failed with error \"%s\".\n",
   51 	   cudaGetErrorString(cudaerr));
   52   } 
   53   else 
   54     {
   55       printf("kernel launch success!\n");
   56     }
   57   printf("That's all!\n");
   58   return 0;
   59 }

Vector Addition: code26.cu Follow the steps below for executing Cuda code;

nvcc -o code26 code26.cu
./code26

    1 // https://www.olcf.ornl.gov/tutorials/cuda-vector-addition/
    2 // https://github.com/olcf-tutorials/vector_addition_cuda
    3 #include <stdio.h>
    4 #include <stdlib.h>
    5 #include <math.h>
    6  
    7 #define n 100000 // Size of array
    8 
    9 /* CUDA KERNEL Compute the sum of two vectors
   10  * Each thread takes care of one element of C 
   11  * C[i] = A[i] + B[i] 
   12  */
   13 __global__ void add_vectors(double *a, double *b, double *c)
   14 {
   15   int id = blockIdx.x*blockDim.x+threadIdx.x; // Get our global thread ID
   16   if (id < n) // Make sure we do not go out of bounds
   17     c[id] = a[id] + b[id]; /* Compute the element of C */
   18 }
   19  
   20 int main( int argc, char* argv[] )
   21 {
   22   double *h_a; // Host input vectors 
   23   double *h_b; 
   24   double *h_c; 
   25  
   26   double *d_a; // Device input vectors
   27   double *d_b; 
   28   double *d_c; 
   29  
   30   size_t bytes = n*sizeof(double); // Size, in bytes, of each vector
   31  
   32   h_a = (double*)malloc(bytes); // Allocate memory for each vector on host
   33   h_b = (double*)malloc(bytes); 
   34   h_c = (double*)malloc(bytes); 
   35  
   36   cudaMalloc(&d_a, bytes); // Allocate memory for each vector on GPU
   37   cudaMalloc(&d_b, bytes);  
   38   cudaMalloc(&d_c, bytes);
   39  
   40   int i;
   41   for( i = 0; i < n; i++ ) { // Initialize vectors on host 
   42     h_a[i] = sin(i)*sin(i);
   43     h_b[i] = cos(i)*cos(i);
   44   }
   45  
   46   // Copy data from host arrays h_a and h_b to device arrays d_a and d_b
   47   cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
   48   cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
   49  
   50   int thr_per_blk = 1024; // blockSize. Number of threads in each thread block
   51   int blk_in_grid = ceil( (float) n/thr_per_blk ); // gridSize. Number of thread blocks in grid
   52  
   53   add_vectors<<<blk_in_grid, thr_per_blk>>>(d_a, d_b, d_c); // Execute the kernel
   54  
   55   cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); // Copy data from device array d_c to host array h_c
   56  
   57   // Sum up vector c and print result divided by n, this should equal 1 within error
   58   double sum = 0;
   59   for(i=0; i<n; i++)
   60     sum += h_c[i];
   61   printf("final result: %f\n", sum/n);
   62  
   63   cudaFree(d_a); // Free device memory
   64   cudaFree(d_b);
   65   cudaFree(d_c);
   66  
   67   free(h_a); // Free host memory
   68   free(h_b);
   69   free(h_c);
   70     
   71   printf("---------------------------\n");
   72   printf("__ENDED__\n");
   73   printf("---------------------------\n");
   74   printf("N                 = %d\n", n);
   75   printf("Threads Per Block = %d\n", thr_per_blk);
   76   printf("Blocks In Grid    = %d\n", blk_in_grid);
   77   printf("---------------------------\n\n");
   78     
   79   return 0;
   80 }