Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

Optimize the reduction algorithm in kernel.cu so that the max value of the data

ID: 3853225 • Letter: O

Question

Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided)

kernel.cu

#define BLOCK_SIZE 512
#define SIMPLE

__global__ void reduction(float *out, float *in, unsigned size)
{
   /********************************************************************
   Load segment of input vector into shared memory
   Traverse reduction tree
   Write computed sum to output vector at correct index
   ********************************************************************/

   // INSERT KERNEL CODE HERE

#ifdef SIMPLE
   __shared__ float in_s[2 * BLOCK_SIZE];
   int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;

   in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f);
   in_s[threadIdx.x + BLOCK_SIZE] = ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);

   for (int stride = 1; stride < BLOCK_SIZE << 1; stride <<= 1) {
       __syncthreads();
       if (threadIdx.x % stride == 0)
           in_s[2 * threadIdx.x] += in_s[2 * threadIdx.x + stride];
   }

#else
   __shared__ float in_s[BLOCK_SIZE];
   int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;

   in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f) +
       ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);

   for (int stride = BLOCK_SIZE >> 1; stride > 0; stride >>= 1) {
       __syncthreads();
       if (threadIdx.x < stride)
           in_s[threadIdx.x] += in_s[threadIdx.x + stride];
   }
#endif

   if (threadIdx.x == 0)
       out[blockIdx.x] = in_s[0];
}

main.cu

#include

#include "support.h"
#include "kernel.cu"

int main(int argc, char* argv[])
{
   Timer timer;

   // Initialize host variables ----------------------------------------------

   printf(" Setting up the problem..."); fflush(stdout);
   startTime(&timer);

   float *in_h, *out_h;
   float *in_d, *out_d;
   unsigned in_elements, out_elements;
   cudaError_t cuda_ret;
   dim3 dim_grid, dim_block;
   int i;

   // Allocate and initialize host memory
   if (argc == 1) {
       in_elements = 1000000;
   }
   else if (argc == 2) {
       in_elements = atoi(argv[1]);
   }
   else {
       printf(" Invalid input parameters!"
           " Usage: ./reduction # Input of size 1,000,000 is used"
           " Usage: ./reduction # Input of size m is used"
           " ");
       exit(0);
   }
   initVector(&in_h, in_elements);

   out_elements = in_elements / (BLOCK_SIZE << 1);
   if (in_elements % (BLOCK_SIZE << 1)) out_elements++;

   out_h = (float*)malloc(out_elements * sizeof(float));
   if (out_h == NULL) FATAL("Unable to allocate host");

   stopTime(&timer); printf("%f s ", elapsedTime(timer));
   printf(" Input size = %u ", in_elements);

   // Allocate device variables ----------------------------------------------

   printf("Allocating device variables..."); fflush(stdout);
   startTime(&timer);

   cuda_ret = cudaMalloc((void**)&in_d, in_elements * sizeof(float));
   if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
   cuda_ret = cudaMalloc((void**)&out_d, out_elements * sizeof(float));
   if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");

   cudaDeviceSynchronize();
   stopTime(&timer); printf("%f s ", elapsedTime(timer));

   // Copy host variables to device ------------------------------------------

   printf("Copying data from host to device..."); fflush(stdout);
   startTime(&timer);

   cuda_ret = cudaMemcpy(in_d, in_h, in_elements * sizeof(float),
       cudaMemcpyHostToDevice);
   if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");

   cuda_ret = cudaMemset(out_d, 0, out_elements * sizeof(float));
   if (cuda_ret != cudaSuccess) FATAL("Unable to set device memory");

   cudaDeviceSynchronize();
   stopTime(&timer); printf("%f s ", elapsedTime(timer));

   // Launch kernel ----------------------------------------------------------
   printf("Launching kernel..."); fflush(stdout);
   startTime(&timer);

   dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
   dim_grid.x = out_elements; dim_grid.y = dim_grid.z = 1;
   reduction << > >(out_d, in_d, in_elements);
   cuda_ret = cudaDeviceSynchronize();
   if (cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");

   stopTime(&timer); printf("%f s ", elapsedTime(timer));

   // Copy device variables from host ----------------------------------------

   printf("Copying data from device to host..."); fflush(stdout);
   startTime(&timer);

   cuda_ret = cudaMemcpy(out_h, out_d, out_elements * sizeof(float),
       cudaMemcpyDeviceToHost);
   if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");

   cudaDeviceSynchronize();
   stopTime(&timer); printf("%f s ", elapsedTime(timer));

   // Verify correctness -----------------------------------------------------

   printf("Verifying results..."); fflush(stdout);

   /* Accumulate partial sums on host */
   for (i = 1; i        out_h[0] += out_h[i];
   }

   /* Verify the result */
   verify(in_h, in_elements, out_h[0]);

   // Free memory ------------------------------------------------------------

   cudaFree(in_d); cudaFree(out_d);
   free(in_h); free(out_h);

   return 0;
}

support.cu

#include
#include

#include "support.h"

void initVector(float **vec_h, unsigned size)
{
   *vec_h = (float*)malloc(size * sizeof(float));

   if (*vec_h == NULL) {
       FATAL("Unable to allocate host");
   }

   for (unsigned int i = 0; i < size; i++) {
       (*vec_h)[i] = (rand() % 100) / 100.00;
   }

}


void verify(float* input, unsigned num_elements, float result) {

   const float relativeTolerance = 2e-5;

   float sum = 0.0f;
   for (int i = 0; i < num_elements; ++i) {
       sum += input[i];
   }
   float relativeError = (sum - result) / sum;
   if (relativeError > relativeTolerance
       || relativeError < -relativeTolerance) {
       printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
       exit(0);
   }
   printf("TEST PASSED ");

}

void startTime(Timer* timer) {
   gettimeofday(&(timer->startTime), NULL);
}

void stopTime(Timer* timer) {
   gettimeofday(&(timer->endTime), NULL);
}

float elapsedTime(Timer timer) {
   return ((float)((timer.endTime.tv_sec - timer.startTime.tv_sec)
       + (timer.endTime.tv_usec - timer.startTime.tv_usec) / 1.0e6));
}

support.h

#ifndef __FILEH__
#define __FILEH__

#include

typedef struct {
   struct timeval startTime;
   struct timeval endTime;
} Timer;

#ifdef __cplusplus
extern "C" {
#endif
   void initVector(float **vec_h, unsigned size);
   void verify(float* input, unsigned num_elements, float result);
   void startTime(Timer* timer);
   void stopTime(Timer* timer);
   float elapsedTime(Timer timer);
#ifdef __cplusplus
}
#endif

#define FATAL(msg, ...)
do {
fprintf(stderr, "[%s:%d] "msg" ", __FILE__, __LINE__, ##__VA_ARGS__);
exit(-1);
} while(0)

#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif

#endif

Explanation / Answer

#define BLOCK_SIZE 512

__global__ void total(float * input, float * output, int len) {

            __shared__ float partialSum[2 * BLOCK_SIZE];

            unsigned int t = threadIdx.x;

            unsigned int start = 2 * blockIdx.x * blockDim.x;

            partialSum[t] = (t < len) ? input[start + t] : 0;

            partialSum[blockDim.x + t] = ((blockDim.x + t) < len) ? input[start + blockDim.x + t] : 0;

            for(unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {

                        __syncthreads();

                        if(t < stride)

                                    partialSum[t] += partialSum[t + stride];

            }

            if(t == 0) {

                        output[blockIdx.x + t] = partialSum[t];

            }

           

}

int main(int argc, char ** argv) {

   int ii;

    float * hostInput;

    float * hostOutput;

    float * deviceInput;

    float * deviceOutput;

    int numInputElements;

    int numOutputElements;

   numOutputElements = numInputElements / (BLOCK_SIZE<<1);

    if (numInputElements % (BLOCK_SIZE<<1)) {

        numOutputElements++;

    }

    hostOutput = (float*) malloc(numOutputElements * sizeof(float));

            cudaMalloc((void **) &deviceInput, numInputElements * sizeof(float));

            cudaMalloc((void **) &deviceOutput, numOutputElements * sizeof(float));

            cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float), cudaMemcpyHostToDevice);

            dim3 DimGrid((numInputElements - 1)/BLOCK_SIZE + 1, 1, 1);

            dim3 DimBlock(BLOCK_SIZE, 1, 1);

            total<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numInputElements);

    cudaDeviceSynchronize();

            cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float), cudaMemcpyDeviceToHost);

    for (ii = 1; ii < numOutputElements; ii++) {

        hostOutput[0] += hostOutput[ii];

    }

            cudaFree(deviceInput);

            cudaFree(deviceOutput);

    free(hostInput);

    free(hostOutput);   

    return 0;

}

Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote