$ lspci | grep -i nvidia
01:00.0 VGA compatible controller: NVIDIA Corporation GP108 [GeForce GT 1030] (rev a1)
01:00.1 Audio device: NVIDIA Corporation GP108 High Definition Audio Controller (rev a1)
Installation Instructions:
$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
$ sudo dpkg -i cuda-keyring_1.0-1_all.deb
$ sudo apt-get update
$ sudo apt-get -y install cuda
$ sudo reboot
$ export PATH=/usr/local/cuda-12.0/bin${PATH:+:${PATH}}
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.0/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
nvcc -o code24 code24.cu
./code24
#include <stdio.h>
#include <unistd.h>

// Message whose characters are handed out to the threads, and its length.
__device__ const char *STR = "HELLO WORLD!";
const int STR_LENGTH = 12;

/*
 * Kernel: every thread prints one line containing its block index, its
 * thread index within the block, one character of STR (selected by
 * threadIdx.x % STR_LENGTH) and its global thread ID.
 *
 *   blockIdx.x   : block index within the grid in the x direction
 *   threadIdx.x  : thread index within the block
 *   blockDim.x,y,z : number of threads in a block
 *
 * Launched from main() as a 1D grid of 1D blocks; takes no arguments.
 */
__global__ void cuda_hello(){
    printf("Hello World from GPU! (%d ,%d) : %c ThreadID %d \n",
           blockIdx.x, threadIdx.x,
           STR[threadIdx.x % STR_LENGTH],
           (threadIdx.x + blockIdx.x * blockDim.x));
}

/* Reference deviceQuery output:

./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 3 CUDA Capable device(s)

Device 2: "NVIDIA GeForce GT 1030"
  CUDA Driver Version / Runtime Version          12.0 / 11.8
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 1998 MBytes (2095185920 bytes)
  (003) Multiprocessors, (128) CUDA Cores/MP:    384 CUDA Cores
  GPU Max Clock rate:                            1468 MHz (1.47 GHz)
  Memory Clock rate:                             3004 Mhz
  Memory Bus Width:                              64-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total shared memory per multiprocessor:        98304 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 4 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.0, CUDA Runtime Version = 11.8, NumDevs = 3
Result = PASS
*/

/*
 * Host entry point: prints a greeting from the CPU, waits two seconds,
 * launches cuda_hello on 2 blocks of 12 threads and waits for the device.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main() {
    printf("Hello World from CPU!\n");
    sleep(2);

    int threads_per_block = 12;   // equals STR_LENGTH: one character per thread
    int blocks_per_grid = 2;
    cuda_hello<<<blocks_per_grid, threads_per_block>>>();

    // A kernel launch returns nothing; configuration errors must be queried.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        /* Halt host thread execution on CPU until the device has finished
           processing all previously requested tasks; also reports errors
           that occurred while the kernel was running. */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "cuda_hello failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
nvcc -o code25 code25.cu
./code25
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 720       // number of computations
#define GRID_D1 20  // constants for grid and block sizes
#define GRID_D2 3   // constants for grid and block sizes
#define BLOCK_D1 12 // constants for grid and block sizes
#define BLOCK_D2 1  // constants for grid and block sizes
#define BLOCK_D3 1  // constants for grid and block sizes

/*
 * Kernel function called for each thread.
 *
 * Uses the CUDA variables {threadIdx, blockIdx, blockDim, gridDim} to derive
 * a unique flat index for each thread of a 2D grid of 3D blocks, then prints
 * a greeting for the first `chunk` and the last `chunk` thread indices only.
 * Threads whose index is >= N are flagged in the message as unused.
 */
__global__ void hello(void)
{
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;      // id of the block
    int blocksize = blockDim.x * blockDim.y * blockDim.z;   // size of each block (within grid of blocks)
    int subthread = threadIdx.z * (blockDim.x * blockDim.y)
                  + threadIdx.y * blockDim.x
                  + threadIdx.x;                            // id of thread in a given block
    int idx = myblock * blocksize + subthread;              // overall id/index of the thread
    int nthreads = blocksize * gridDim.x * gridDim.y;       // total # of threads
    int chunk = 20;                                         // vary this value to see the changes at the output

    // The print buffer from within the kernel is limited, so only print for
    // the first and last chunks of threads. `>=` makes the trailing window
    // exactly `chunk` threads wide, matching the leading one.
    if (idx < chunk || idx >= nthreads - chunk) {
        if (idx < N) {
            printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d \n",
                   blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
                   threadIdx.x, threadIdx.y, threadIdx.z, idx);
        }
        else
        {
            printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => thread index=%d [### this thread would not be used for N=%d ###]\n",
                   blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
                   threadIdx.x, threadIdx.y, threadIdx.z, idx, N);
        }
    }
}

/*
 * Host entry point: builds the grid/block dimensions from the constants
 * above, reports whether they cover N, launches the kernel and waits for it.
 *
 * Returns 0 when the kernel ran successfully, 1 otherwise.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // objects containing the block and grid info
    const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
    const dim3 gridSize(GRID_D1, GRID_D2, 1);
    int nthreads = BLOCK_D1 * BLOCK_D2 * BLOCK_D3 * GRID_D1 * GRID_D2; // total # of threads

    if (nthreads < N) {
        printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n", N);
    }
    else
    {
        printf("Launching %d threads (N=%d)\n", nthreads, N);
    }

    hello<<<gridSize, blockSize>>>(); // launch the kernel on the specified grid of thread blocks

    // Launch-configuration errors are not returned by the launch itself.
    cudaError_t cudaerr = cudaGetLastError();
    if (cudaerr == cudaSuccess) {
        // Need to flush prints, otherwise none of the prints from within the
        // kernel will show up, as program exit does not flush the print buffer.
        cudaerr = cudaDeviceSynchronize();
    }

    if (cudaerr != cudaSuccess) {
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
    }
    else
    {
        printf("kernel launch success!\n");
    }
    printf("That's all!\n");
    return (cudaerr == cudaSuccess) ? 0 : 1;
}
nvcc -o code26 code26.cu
./code26
// https://www.olcf.ornl.gov/tutorials/cuda-vector-addition/
// https://github.com/olcf-tutorials/vector_addition_cuda
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define n 100000 // Size of array

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* CUDA KERNEL Compute the sum of two vectors
 * Each thread takes care of one element of C
 * C[i] = A[i] + B[i]
 *
 * a, b : input vectors (device pointers)
 * c    : output vector (device pointer)
 * Expects a 1D grid of 1D blocks; threads with a global id >= n do nothing.
 */
__global__ void add_vectors(double *a, double *b, double *c)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x; // Get our global thread ID
    if (id < n)                                     // Make sure we do not go out of bounds
        c[id] = a[id] + b[id];                      /* Compute the element of C */
}

/*
 * Host entry point: fills two host vectors with sin^2(i) and cos^2(i),
 * adds them on the GPU and prints the mean of the result (expected to be 1
 * within floating-point error), followed by the launch configuration.
 *
 * Exits with EXIT_FAILURE if a host allocation or any CUDA call fails.
 */
int main( int argc, char* argv[] )
{
    (void)argc;
    (void)argv;

    double *h_a; // Host input vectors
    double *h_b;
    double *h_c; // Host output vector

    double *d_a; // Device input vectors
    double *d_b;
    double *d_c; // Device output vector

    size_t bytes = n * sizeof(double); // Size, in bytes, of each vector

    h_a = (double*)malloc(bytes); // Allocate memory for each vector on host
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc(&d_a, bytes)); // Allocate memory for each vector on GPU
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    int i;
    for (i = 0; i < n; i++) { // Initialize vectors on host
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
    }

    // Copy data from host arrays h_a and h_b to device arrays d_a and d_b
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    int thr_per_blk = 1024; // blockSize. Number of threads in each thread block
    // gridSize. Number of thread blocks in grid: integer ceiling division,
    // which stays exact where a float division would start rounding.
    int blk_in_grid = (n + thr_per_blk - 1) / thr_per_blk;

    add_vectors<<<blk_in_grid, thr_per_blk>>>(d_a, d_b, d_c); // Execute the kernel
    CUDA_CHECK(cudaGetLastError());                           // Catch launch-configuration errors

    // Copy data from device array d_c to host array h_c
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for (i = 0; i < n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum / n);

    CUDA_CHECK(cudaFree(d_a)); // Free device memory
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(h_a); // Free host memory
    free(h_b);
    free(h_c);

    printf("---------------------------\n");
    printf("__ENDED__\n");
    printf("---------------------------\n");
    printf("N = %d\n", n);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}