#include <stdio.h>
#include <stdlib.h>

/* Compile: nvcc inc.cu -o inc */

/*
 * Abort with a readable message if a CUDA runtime call fails.
 * Kernel launches do not return a status themselves; follow a launch with
 * CUDA_CHECK(cudaGetLastError()) to catch a bad launch configuration.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise increment: r[k] = i[k] + 1 for the element k owned by the
 * calling thread (k = blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Expects a 1D grid of 1D blocks. There is no bounds guard, so the launch
 * must cover exactly as many threads as both arrays have elements
 * (gridDim.x * blockDim.x == element count). Uses no shared memory.
 */
__global__ void inc(float *i, float *r) {
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    r[ix] = i[ix] + 1.0f;
}

#define BLOCK_SIZE 256
#define BLOCKS 1024
#define N (BLOCKS * BLOCK_SIZE)

/*
 * Fills a host array with pseudo-random values in [0, 1), increments every
 * element on the GPU, prints the first three results and the kernel time
 * measured with CUDA events.
 */
int main() {
    float *v, *r;    /* host input / output  */
    float *dv, *dr;  /* device input / output */
    cudaEvent_t start, stop;

    v = (float *)malloc(N * sizeof(float));
    r = (float *)malloc(N * sizeof(float));
    if (v == NULL || r == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(v);
        free(r);
        return EXIT_FAILURE;
    }

    /* generate input data */
    for (int i = 0; i < N; ++i) {
        v[i] = (float)(rand() % 1000) / 1000.0f;
    }

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaMalloc((void **)&dv, sizeof(float) * N));
    CUDA_CHECK(cudaMalloc((void **)&dr, sizeof(float) * N));

    CUDA_CHECK(cudaMemcpy(dv, v, sizeof(float) * N, cudaMemcpyHostToDevice));

    /* The events bracket only the kernel, so the reported time excludes
       the host<->device copies. */
    CUDA_CHECK(cudaEventRecord(start));
    /* BLOCKS * BLOCK_SIZE == N: exactly one thread per element, which is
       what the unguarded kernel requires. */
    inc<<<BLOCKS, BLOCK_SIZE>>>(dv, dr);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    CUDA_CHECK(cudaMemcpy(r, dr, sizeof(float) * N, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(dv));
    CUDA_CHECK(cudaFree(dr));

    for (int i = 0; i < 3; ++i) {
        printf("%f ", r[i]);
    }
    printf("\n");

    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time %f ms\n", elapsedTime);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    free(v);
    free(r);
    return 0;
}