CUDA
Voici un exemple de comment compiler et exécuter un programme CUDA (somme de deux vecteurs)
Mode interactif
On se connecte sur la file d'attente tesla.q ; une carte GPU est allouée par défaut.
$ qlogin -q tesla.q
ou explicitement :
$ qlogin -q tesla.q -l gpu=1
Une fois connecté, on charge le module cuda
pour pouvoir compiler et exécuter les programmes CUDA/OpenCL.
$ cd WORK
$ module load gpu/cuda
Exemple de programme
- VectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Check a CUDA runtime call and abort with a diagnostic on failure.
 * Kernel launches return no status directly, so launch errors are caught
 * with cudaGetLastError() and execution errors at the next sync point. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Kernel for vector addition: z[i] = x[i] + y[i].
 * Expected launch layout: 1D grid of 1D blocks, one thread per component.
 * blockDim.x = threads_per_block; the first block handles the first
 * threads_per_block components, the second block the next ones, etc. */
__global__ void Vec_add(float x[], float y[], float z[], int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block_count*threads_per_block may be >= n: guard the grid tail. */
    if (i < n)
        z[i] = x[i] + y[i];
} /* Vec_add */

/* Host code: parse the vector order from argv[1], build the input vectors,
 * add them on the device, and print operands and result. */
int main(int argc, char* argv[]) {
    int n, i;
    float *h_x, *h_y, *h_z;   /* host buffers */
    float *d_x, *d_y, *d_z;   /* device buffers */
    int threads_per_block;
    int block_count;
    size_t size;              /* buffer size in bytes */

    /* Get number of components in vector. */
    if (argc != 2) {
        fprintf(stderr, "usage: %s <vector order>\n", argv[0]);
        /* A usage error is a failure: exit nonzero (original exited 0). */
        exit(EXIT_FAILURE);
    }
    n = strtol(argv[1], NULL, 10);
    /* Reject n <= 0: it would make the loops no-ops at best, and
     * n*sizeof(float) would wrap around for negative n. */
    if (n <= 0) {
        fprintf(stderr, "usage: %s <vector order>\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    size = (size_t)n * sizeof(float);

    /* Allocate input/output vectors in host memory. */
    h_x = (float*) malloc(size);
    h_y = (float*) malloc(size);
    h_z = (float*) malloc(size);
    if (h_x == NULL || h_y == NULL || h_z == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        exit(EXIT_FAILURE);
    }

    /* Initialize input vectors: x = (1,...,n), y = (n,...,1),
     * so every component of the sum equals n+1 (easy visual check). */
    for (i = 0; i < n; i++) {
        h_x[i] = i + 1;
        h_y[i] = n - i;
    }
    printf("h_x = ");
    for (i = 0; i < n; i++) printf("%.1f ", h_x[i]);
    printf("\n");
    printf("h_y = ");
    for (i = 0; i < n; i++) printf("%.1f ", h_y[i]);
    printf("\n\n");

    /* Allocate vectors in device memory. */
    CUDA_CHECK(cudaMalloc(&d_x, size));
    CUDA_CHECK(cudaMalloc(&d_y, size));
    CUDA_CHECK(cudaMalloc(&d_z, size));

    /* Copy input vectors from host memory to device memory. */
    CUDA_CHECK(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice));

    /* Define block size (multiple of the 32-thread warp size). */
    threads_per_block = 256;

    /* Define grid size with a ceiling division: n/threads_per_block alone
     * could leave the last partial block of components without threads;
     * this formula guarantees at least one thread per vector component. */
    block_count = (n + threads_per_block - 1) / threads_per_block;

    /* Invoke the kernel: block_count blocks of threads_per_block threads. */
    Vec_add<<<block_count, threads_per_block>>>(d_x, d_y, d_z, n);
    CUDA_CHECK(cudaGetLastError());       /* catches bad launch configuration */

    /* Wait for the kernel to complete (also surfaces execution errors).
     * cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize(). */
    CUDA_CHECK(cudaDeviceSynchronize());

    /* Copy result from device memory to host memory (h_z holds the sum). */
    CUDA_CHECK(cudaMemcpy(h_z, d_z, size, cudaMemcpyDeviceToHost));

    printf("The sum is: \n");
    for (i = 0; i < n; i++) printf("%.1f ", h_z[i]);
    printf("\n");

    /* Free device memory. */
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_z));

    /* Free host memory. */
    free(h_x);
    free(h_y);
    free(h_z);

    return 0;
} /* main */
Compilation
$ nvcc VectorAdd.cu -o VectorAdd
On exécute le programme :
$ ./VectorAdd
Mode passif
Vous pouvez utiliser SGE pour compiler et/ou exécuter un code CUDA ou OpenCL :
- cuda.sge
#!/bin/bash #$ -o $JOB_NAME.o$JOB_ID #$ -e $JOB_NAME.e$JOB_ID #$ -N job_cuda #$ -q tesla.q #$ -V #$ -cwd #$ -l h_vmem=2G module load gpu/cuda nvcc monprogramme.cu -o monprogramme ./monprogramme