CUDA
Voici un exemple de comment compiler et exécuter un programme CUDA (somme de deux vecteurs)
Mode interactif
On se connecte sur la file d'attente tesla.q, une carte est allouée par défaut.
$ qlogin -q tesla.q
ou explicitement :
$ qlogin -q tesla.q -l gpu=1
Une fois connecté, on charge le module cuda pour pouvoir compiler et exécuter les programmes CUDA/OpenCL.
$ cd $WORK
$ module load gpu/cuda
Exemple de programme
- VectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

/* Abort with a diagnostic if a CUDA runtime call fails.  Kernel launches do
 * not return an error directly, so they are checked afterwards with
 * cudaGetLastError(). */
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/* Kernel for vector addition: z[i] = x[i] + y[i].
 *
 * Expects a 1-D grid of 1-D blocks; each thread handles one component.
 * The first block gets the first blockDim.x components, the second block
 * the next blockDim.x components, and so on.  Since
 * block_count * threads_per_block may exceed n, the bounds check guards
 * the threads of the tail block that have no component to process. */
__global__ void Vec_add(float x[], float y[], float z[], int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < n)
        z[i] = x[i] + y[i];
}  /* Vec_add */

/* Host code: parse the vector order from argv[1], build two input vectors,
 * add them on the GPU, and print operands and result.
 * Returns 0 on success, EXIT_FAILURE on bad usage or allocation failure. */
int main(int argc, char* argv[]) {
    int n, i;
    float *h_x, *h_y, *h_z;    /* host copies   */
    float *d_x, *d_y, *d_z;    /* device copies */
    int threads_per_block;
    int block_count;
    size_t size;

    /* Get number of components in vector; a usage error is a failure,
     * so exit with a nonzero status. */
    if (argc != 2) {
        fprintf(stderr, "usage: %s <vector order>\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    n = strtol(argv[1], NULL, 10);
    if (n <= 0) {
        fprintf(stderr, "%s: vector order must be a positive integer\n",
                argv[0]);
        exit(EXIT_FAILURE);
    }
    size = n * sizeof(float);

    /* Allocate input vectors in host memory. */
    h_x = (float*) malloc(size);
    h_y = (float*) malloc(size);
    h_z = (float*) malloc(size);
    if (h_x == NULL || h_y == NULL || h_z == NULL) {
        fprintf(stderr, "%s: host allocation of %zu bytes failed\n",
                argv[0], size);
        exit(EXIT_FAILURE);
    }

    /* Initialize input vectors: x = (1, 2, ..., n), y = (n, n-1, ..., 1),
     * so every component of the sum is n+1 -- easy to eyeball-check. */
    for (i = 0; i < n; i++) {
        h_x[i] = i + 1;
        h_y[i] = n - i;
    }

    printf("h_x = ");
    for (i = 0; i < n; i++)
        printf("%.1f ", h_x[i]);
    printf("\n");
    printf("h_y = ");
    for (i = 0; i < n; i++)
        printf("%.1f ", h_y[i]);
    printf("\n\n");

    /* Allocate vectors in device memory. */
    CUDA_CHECK(cudaMalloc(&d_x, size));
    CUDA_CHECK(cudaMalloc(&d_y, size));
    CUDA_CHECK(cudaMalloc(&d_z, size));

    /* Copy input vectors from host memory to device memory. */
    CUDA_CHECK(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice));

    /* Define block size (a multiple of the 32-thread warp size). */
    threads_per_block = 256;

    /* Define grid size with an integer ceiling division: n/threads_per_block
     * alone could give fewer threads than vector components, while
     * (n + threads_per_block - 1) / threads_per_block guarantees at least
     * one thread per component. */
    block_count = (n + threads_per_block - 1) / threads_per_block;

    /* Invoke the kernel using block_count blocks of threads_per_block
     * threads each, then check for launch-configuration errors. */
    Vec_add<<<block_count, threads_per_block>>>(d_x, d_y, d_z, n);
    CUDA_CHECK(cudaGetLastError());

    /* Wait for the kernel to complete.  cudaThreadSynchronize() is
     * deprecated; cudaDeviceSynchronize() is its replacement. */
    CUDA_CHECK(cudaDeviceSynchronize());

    /* Copy the result from device memory back into h_z in host memory. */
    CUDA_CHECK(cudaMemcpy(h_z, d_z, size, cudaMemcpyDeviceToHost));

    printf("The sum is: \n");
    for (i = 0; i < n; i++)
        printf("%.1f ", h_z[i]);
    printf("\n");

    /* Free device memory. */
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_z));

    /* Free host memory. */
    free(h_x);
    free(h_y);
    free(h_z);

    return 0;
}  /* main */
Compilation
$ nvcc VectorAdd.cu -o VectorAdd
On exécute le programme :
$ ./VectorAdd
Mode passif
Vous pouvez utiliser SGE pour compiler et/ou exécuter un code CUDA ou OpenCL :
- cuda.sge
#!/bin/bash
# SGE batch script: build and run a CUDA program on the tesla queue.

# -- SGE directives -----------------------------------------------------
#$ -o $JOB_NAME.o$JOB_ID
#$ -e $JOB_NAME.e$JOB_ID
#$ -N job_cuda
#$ -q tesla.q
#$ -V
#$ -cwd
#$ -l h_vmem=2G

# Load the CUDA toolchain, compile the source, then execute the binary.
module load gpu/cuda
nvcc monprogramme.cu -o monprogramme
./monprogramme