// Parallel Scan in CUDA C

// MP Scan

// Given a list (lst) of length n

// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}


// Number of threads per block used when launching the scan kernel.
// The scan kernel declares 2*BLOCK_SIZE floats of shared memory and has each
// thread load two elements, so one block covers 2*BLOCK_SIZE list entries.
// NOTE(review): the stride loops in scan() double/halve between 1 and
// BLOCK_SIZE, so this presumably has to be a power of two -- confirm before
// changing the value.
#define BLOCK_SIZE 512 //@@ You can change this

// wbCheck(stmt): evaluate a CUDA runtime call and, if it does not return
// cudaSuccess, log the failing statement and the CUDA error string and then
// `return -1` from the ENCLOSING function. It may therefore only be used
// inside functions that return an int-compatible type.
//
// Every line of the macro body ends in a backslash so the whole do/while(0)
// is part of the definition; the do/while(0) wrapper makes the macro behave
// like a single statement (safe inside un-braced if/else).
#define wbCheck(stmt) do {                                                 \
        cudaError_t err = (stmt);                                          \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
            return -1;                                                     \
        }                                                                  \
    } while(0)

// Per-block inclusive prefix sum (work-efficient two-phase tree scan).
//
// Launch layout: 1-D grid, blockDim.x must equal BLOCK_SIZE. Block b owns the
// segment [2*b*BLOCK_SIZE, 2*(b+1)*BLOCK_SIZE) of the list; each thread loads
// and stores two elements of that segment.
// Shared memory: 2*BLOCK_SIZE floats, statically allocated.
//
// input  - device pointer to the list to scan (len floats)
// output - device pointer receiving the scanned list (len floats)
// len    - number of valid elements in input/output
//
// Each segment is scanned independently: output[k] holds the sum of the
// elements of input from the start of k's segment up to and including k.
// Combining segments into a full-list scan is the caller's responsibility.
__global__ void scan(float * input, float * output, int len) {
    __shared__ float XY[2 * BLOCK_SIZE];

    const unsigned int tx = threadIdx.x;
    const unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    // Compare indices in unsigned arithmetic; treat a non-positive len as empty.
    const unsigned int n = (len > 0) ? (unsigned int)len : 0u;

    // Load the segment; positions past the end of the list contribute 0 so
    // they do not disturb the sums.
    XY[tx] = (start + tx < n) ? input[start + tx] : 0.0f;
    XY[BLOCK_SIZE + tx] =
        (start + BLOCK_SIZE + tx < n) ? input[start + BLOCK_SIZE + tx] : 0.0f;

    // Reduction phase: build partial sums up the tree.
    for (unsigned int stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
        // Barrier first: makes the loads (first pass) and the previous
        // level's writes visible before any thread reads them.
        __syncthreads();
        unsigned int index = (tx + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            XY[index] += XY[index - stride];
    }

    // Post-reduction phase: push partial sums back down the tree.
    for (unsigned int stride = BLOCK_SIZE >> 1; stride >= 1; stride >>= 1) {
        __syncthreads();
        unsigned int index = (tx + 1) * stride * 2 - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            XY[index + stride] += XY[index];
    }

    // All tree updates must be complete before results are written out.
    __syncthreads();

    // Store both halves of the segment, skipping the padded tail.
    if (start + tx < n)
        output[start + tx] = XY[tx];
    if (start + BLOCK_SIZE + tx < n)
        output[start + BLOCK_SIZE + tx] = XY[BLOCK_SIZE + tx];
}


// Helper kernel: copy the last scanned value of every 2*BLOCK_SIZE-element
// segment of `scanned` into blockSums[segment]. For the final (possibly
// partial) segment the last valid element, scanned[len - 1], is used.
// Launch layout: 1-D, one thread per segment, bounds-checked on numBlocks.
__global__ void collectBlockSums(const float * scanned, float * blockSums,
                                 int len, int numBlocks) {
    int b = blockIdx.x * blockDim.x + threadIdx.x;
    if (b < numBlocks) {
        // 64-bit so that the end index of the last segment cannot overflow.
        long long last = (long long)(b + 1) * (2 * BLOCK_SIZE) - 1;
        if (last > (long long)len - 1)
            last = (long long)len - 1;
        blockSums[b] = scanned[last];
    }
}

// Helper kernel: add the total of all preceding segments to every element of
// a segment. scannedBlockSums[b] must hold the inclusive scan of the segment
// totals, so segment b (b > 0) is offset by scannedBlockSums[b - 1].
// Launch layout: one block of BLOCK_SIZE threads per segment; each thread
// updates two elements, mirroring the layout used by scan().
__global__ void addBlockOffsets(float * data, const float * scannedBlockSums,
                                int len) {
    if (blockIdx.x == 0)
        return; // the first segment has nothing in front of it

    const float offset = scannedBlockSums[blockIdx.x - 1];
    const unsigned int n = (len > 0) ? (unsigned int)len : 0u;
    const unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    const unsigned int tx = threadIdx.x;

    if (start + tx < n)
        data[start + tx] += offset;
    if (start + BLOCK_SIZE + tx < n)
        data[start + BLOCK_SIZE + tx] += offset;
}

// Host driver: inclusive scan of `len` floats from device buffer `input`
// into device buffer `output`, for any list length.
//
// Relies on scan<<<blocks, BLOCK_SIZE>>>(in, out, len) scanning each
// 2*BLOCK_SIZE-element segment independently. When more than one segment is
// needed, the per-segment totals are gathered, scanned by a recursive call,
// and added back onto the following segments.
//
// Returns 0 on success and -1 on any CUDA error (after logging it).
static int scanOnDevice(float * input, float * output, int len) {
    if (len <= 0)
        return 0; // nothing to scan

    const int numBlocks = (len - 1) / (2 * BLOCK_SIZE) + 1;

    scan<<<numBlocks, BLOCK_SIZE>>>(input, output, len);
    wbCheck(cudaGetLastError());

    if (numBlocks == 1)
        return 0; // a single segment is already the full scan

    // One allocation holds both the raw and the scanned segment totals.
    float * aux = NULL;
    wbCheck(cudaMalloc((void **)&aux, 2 * (size_t)numBlocks * sizeof(float)));
    float * blockSums = aux;
    float * scannedSums = aux + numBlocks;

    int status = 0;

    collectBlockSums<<<(numBlocks - 1) / BLOCK_SIZE + 1, BLOCK_SIZE>>>(
        output, blockSums, len, numBlocks);
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        status = -1;

    if (status == 0)
        status = scanOnDevice(blockSums, scannedSums, numBlocks);

    if (status == 0) {
        addBlockOffsets<<<numBlocks, BLOCK_SIZE>>>(output, scannedSums, len);
        launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess)
            status = -1;
    }

    // Make sure every kernel using `aux` has finished before releasing it,
    // and surface any asynchronous execution error.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (launchErr == cudaSuccess && syncErr != cudaSuccess) {
        launchErr = syncErr;
        status = -1;
    }
    if (launchErr != cudaSuccess)
        wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(launchErr));

    cudaFree(aux);
    return status;
}

// Program entry point: reads the input list named on the command line,
// computes its inclusive prefix sum on the GPU, and hands the result to
// wbSolution for checking. Returns 0 on success, -1 on failure.
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostInput;    // The input 1D list
    float * hostOutput;   // The output list
    float * deviceInput;
    float * deviceOutput;
    int numElements;      // number of elements in the list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float*) malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    if (hostOutput == NULL && numElements > 0) {
        wbLog(ERROR, "Failed to allocate host output buffer");
        return -1;
    }

    wbLog(TRACE, "The number of input elements in the input is ", numElements);

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
    wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float),
                       cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    wbTime_start(Compute, "Performing CUDA computation");
    // Grid/block dimensions are derived inside scanOnDevice: each block of
    // BLOCK_SIZE threads handles 2*BLOCK_SIZE elements.
    if (scanOnDevice(deviceInput, deviceOutput, numElements) != 0)
        return -1;
    // Wait for the GPU so the timer covers the whole computation and any
    // asynchronous kernel fault is reported here.
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float),
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
}

