Add an algorithm in CUDA C (no C++) to this kernel code that will find the max value

Q: Add an algorithm in CUDA C (no C++) to this kernel code that will find the max value

Here is the code which I tried from the assignment.

__global__ void add_vectors_kernel(float *A, float *B, float *C, int N) {
   // Determine which element the thread is computing
   int block_Id = blockIdx.x + gridDim.x * blockIdx.y;
   int thread_Id = blockDim.x * block_Id + threadIdx.x;

   // Compute a single element of the result vector (if the element is valid)
   if (thread_Id < N) C[thread_Id] = A[thread_Id] + B[thread_Id];
}

ID: 3862611 • Letter: A

Question

Add an algorithm, written in CUDA C (no C++), to the kernel code below so that it finds the maximum value of the vector the kernel reduces (float *in).

__global__ void reduction(float *out, float *in, unsigned size)
{

    // INSERT KERNEL CODE HERE

    /*
     * Per-block sum reduction.
     *
     * Each block reads up to two elements per thread from `in`, starting at
     * offset 2 * blockIdx.x * blockDim.x, adds them together in shared memory
     * and stores the block's total in out[blockIdx.x].
     *
     *   out  - receives one partial sum per block, indexed by blockIdx.x
     *   in   - values to be summed
     *   size - number of valid elements in `in`
     *
     * Build with SIMPLE defined to use the variant that keeps both loaded
     * elements in shared memory and combines neighbours at growing distance;
     * otherwise the two loads are added up front and the active half of the
     * block shrinks each step.
     *
     * NOTE(review): the indexing appears to assume blockDim.x == BLOCK_SIZE
     * and that BLOCK_SIZE is a power of two -- confirm against the launch code.
     */
    const unsigned tid = threadIdx.x;
    const int first = 2 * blockIdx.x * blockDim.x + tid;

    // Slots that fall past the end of the input contribute 0.0f to the sum.
    float lo = 0.0f;
    float hi = 0.0f;
    if (first < size)
        lo = in[first];
    if (first + BLOCK_SIZE < size)
        hi = in[first + BLOCK_SIZE];

#ifdef SIMPLE
    __shared__ float scratch[2 * BLOCK_SIZE];
    scratch[tid] = lo;
    scratch[tid + BLOCK_SIZE] = hi;

    // Distance between the two cells being combined doubles every pass.
    int gap = 1;
    while (gap < (BLOCK_SIZE << 1)) {
        // Wait until every write from the previous pass (or the loads) is visible.
        __syncthreads();
        if (tid % gap == 0) {
            const unsigned left = 2 * tid;
            scratch[left] += scratch[left + gap];
        }
        gap <<= 1;
    }

#else
    __shared__ float scratch[BLOCK_SIZE];
    scratch[tid] = lo + hi;

    // Number of threads doing an addition halves every pass.
    int gap = BLOCK_SIZE >> 1;
    while (gap > 0) {
        // Wait until every write from the previous pass (or the loads) is visible.
        __syncthreads();
        if (tid < gap)
            scratch[tid] += scratch[tid + gap];
        gap >>= 1;
    }
#endif

    // Thread 0 performed the last addition into cell 0, so it publishes the total.
    if (tid == 0)
        out[blockIdx.x] = scratch[0];
}

Explanation / Answer

Here is the code which I tried from the assignment.

__global__ void add_vectors_kernel(float *A, float *B, float *C, int N) {
   // Element-wise vector addition: C[i] = A[i] + B[i] for every i below N.
   // One thread handles one element.

   // Collapse the block's (x, y) grid coordinates into a single linear block number
   const int linear_block = gridDim.x * blockIdx.y + blockIdx.x;

   // Offset of this thread's element within the vectors
   const int element = linear_block * blockDim.x + threadIdx.x;

   // Threads mapped past the end of the vectors have nothing to do
   if (element >= N) return;

   C[element] = A[element] + B[element];
}

Navigate

Add an additional method to do binary search and find whether user-specified val

Add an average method to the MyLinkedListL3 class in the program showing below.

Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.

Add an algorithm in CUDA C (no C++) to this kernel code that will find the max value

Question

Explanation / Answer

Related Questions

Navigate