/*
 * NOTE(review): the header names of the three #include directives were
 * missing from the file as received. The set below is reconstructed from
 * what the code actually uses (printf/fprintf, exit, CUDA runtime API);
 * confirm against the original if it is available.
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* nvcc cuda_template.cu -o cuda_template */

/*
 * Evaluate a CUDA runtime call and abort with a readable message if it did
 * not return cudaSuccess. Without this, a failed allocation/copy/launch is
 * silently ignored and the program prints meaningless values.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise increment: r[i] = v[i] + 1 for every i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
 * contain more threads than elements; surplus threads do nothing.
 *
 *   v  device pointer to n input floats (read only)
 *   r  device pointer to n output floats
 *   n  number of elements in v and r
 *
 * Uses no shared memory.
 */
__global__ void inc(const float *v, float *r, unsigned int n){
    unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    /* Guard the grid tail so any launch configuration is memory-safe. */
    if (gtid < n) {
        r[gtid] = v[gtid] + 1.0f;
    }
}

/* Number of elements processed by the demo. */
#define N 32

/*
 * Demo driver: fills a host array with 1.0f, copies it to the device, runs
 * the inc kernel over it, copies the result back and prints every element
 * separated by a space.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
 */
int main(int argc, char **argv){
    (void)argc;  /* command-line arguments are not used */
    (void)argv;

    const size_t bytes = sizeof(float) * N;
    const unsigned int threadsPerBlock = 32;
    /* Ceil-division so the launch still covers N if N stops being a multiple of 32. */
    const unsigned int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    float v[N];
    float r[N];
    float *dv = 0;
    float *dr = 0;

    //generate input data
    for (int i = 0; i < N; ++i) {
        v[i] = 1.0f;
    }

    CUDA_CHECK(cudaMalloc((void **)&dv, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dr, bytes));

    CUDA_CHECK(cudaMemcpy(dv, v, bytes, cudaMemcpyHostToDevice));

    inc<<<blocks, threadsPerBlock>>>(dv, dr, N);
    /* A kernel launch returns nothing itself; launch errors surface here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: it waits for the kernel and reports any execution fault. */
    CUDA_CHECK(cudaMemcpy(r, dr, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(dv));
    CUDA_CHECK(cudaFree(dr));

    // show results
    for (int i = 0; i < N; ++i) {
        printf("%f ", r[i]);
    }

    return 0;
}