Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

You cannot use any libraries to perform any parallel computing in this project.

ID: 3832754 • Letter: Y

Question

You cannot use any libraries to perform any parallel computing in this project.

The objective of the project is to come up with an approach to utilize the memory/cache of your system effectively; therefore, no parallel computing is needed.

________________________________________________________________________________________________________________________________

//benchmark.c

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <float.h>

#include <math.h>

#include <time.h>

#include <sys/time.h>

#include <emmintrin.h>

/* Your function must have the following signature: */

void dgemm( int m, int n, float *A, float *C );

/* The reference code */

void dgemm_reference( int m, int n, float *A, float *C )

{

for( int i = 0; i < m; i++ )

for( int k = 0; k < n; k++ )

for( int j = 0; j < m; j++ )

C[i+j*m] += A[i+k*m] * A[j+k*m];

}

/* The benchmarking program */

int main( int argc, char **argv )

{

srand(time(NULL));

   for( int n = 32; n < 100; n = n+1+n/3 )

//   for( int n = 40; n < 41; n = n+1+n/3 )

   {

/* Try different m */

for( int m = 32; m < 100; m = m+1+m/3 )

//for( int m = 48; m < 49; m = m+1+m/3 )

{

/* Allocate and fill 2 random matrices A, C */

float *A = (float*) malloc( m * n * sizeof(float) );

float *C = (float*) malloc( m * m * sizeof(float) );

float *C_ref = (float*) malloc( m * m * sizeof(float) );

  

for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1;

for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1;

/* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */

double Gflop_s, seconds = -1.0;

for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )

{

/* warm-up */

dgemm( m, n, A, C );

  

/* measure time */

struct timeval start, end;

gettimeofday( &start, NULL );

for( int i = 0; i < n_iterations; i++ )

   dgemm( m,n,A, C );

gettimeofday( &end, NULL );

seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);

  

/* compute Gflop/s rate */

Gflop_s = 2e-9 * n_iterations * m * m * n / seconds;

}

  

printf( "%d by %d matrix with strip size %d %g Gflop/s ", n, n,m, Gflop_s );

  

/* Ensure that error does not exceed the theoretical error bound */

      

/* Set initial C to 0 and do matrix multiply of A*B */

memset( C, 0, sizeof( float ) * m * m );

dgemm( m,n, A, C );

/* Subtract A*B from C using standard dgemm and reference (note that this should be 0 to within machine roundoff) */

memset( C_ref, 0, sizeof( float ) * m * m );

dgemm_reference( m,n,A,C_ref );

/* Subtract the maximum allowed roundoff from each element of C */

for( int i = 0; i < m*m; i++ ) C[i] -= C_ref[i] ;

/* After this test if any element in C is still positive something went wrong in square_dgemm */

for( int i = 0; i < m * m; i++ )

if( C[i] > 0 ) {

   printf( "FAILURE: error in matrix multiply exceeds an acceptable margin " );

   printf( "Off by: %f, from the reference: %f, at n = %d, m = %d ",C[i], C_ref[i], n, m );

   return -1;

}

/* release memory */

free( C_ref );

free( C );

free( A );

}

   }

return 0;

}

________________________________________________________________________________________________________________________________

// dgemm-optimize.c

void dgemm( int m, int n, float *A, float *C )

{

/*FILL-IN, PROVIDE 3 DIFFERENT WAYS TO OPTIMIZE

PROVIDE STEP BY STEP COMMENTS PLEASE ON HOW YOU ARE OPTIMIZING

You cannot use any libraries to perform any parallel computing in this project.

The objective of the project is to come up with an approach to utilize the memory/cache of your system effectively; therefore, no parallel computing is needed.

*/

}

The matrix is column major. Naive implementation is given in dgemma-naive.c and you can run the bench-naive to see the output. void dgemm int m, int n, float *A, float *C) for (int i 0; i m; it for (int k 0; k n; k++) for (int j 0; j m; j++) C is where the result is stored and we are doing all the calculations from just one matrix, A. You are required to do all the calculations and no optimization is allowed on this front to make benchmarking easier. Zip contains the following files: Makefile: to make and benchmark benchmark.c: do not modify. It check results and produce performance numbers dgemm-naive.c: naive implementation as shown above dgemm-optimize c: your optimization Common optimizations you will see on the web are as follows: Reordering of instructions (compiler peep-hole optimization) Register blocking (reusing the same registers for multiple calculations) Cache optimizations o Blocking (trying to keep the data in the cache for large matrices) o Copying small matrix blocks into contiguous chunks of memory o Pre-compute transpose for spatial locality Loop optimizations o SSE instructions o Reordering o Unrolling Padding Matrices (odd sizes can hurt pipeline performance)

Explanation / Answer


dgemm-optimize.c


void dgemm(int m, int n, float *A, float *C)
{   // FILL-IN
  
    float c;
    int j, isOdd;
  
    if(m % 2 == 0)
        isOdd = m;
    else
        isOdd = m-1;
  
    for(int i = 0; i < m; i++){
        for(int k = 0; k < n; k++){
          
            c = A[i + k * m];
          
            for(j = 0; j < isOdd; j += 2){
              
                C[i + j * m] += c * A[j + k * m];
                C[i + (j + 1) * m] += c * A[(j + 1) + k * m];
            }
          
            if(m % 2 != 0)
                 C[i + j * m] += c * A[j + k * m];
        }
    }
}

dgemm-naive.c


void dgemm( int m, int n, float *A, float *C )
{
for( int i = 0; i < m; i++ )
    for( int k = 0; k < n; k++ )
      for( int j = 0; j < m; j++ )
       C[i+j*m] += A[i+k*m] * A[j+k*m];
}

benchmark.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <emmintrin.h>

/* Your function must have the following signature: */

void dgemm( int m, int n, float *A, float *C );

/* The reference code */
void dgemm_reference( int m, int n, float *A, float *C )
{
for( int i = 0; i < m; i++ )
    for( int k = 0; k < n; k++ )
      for( int j = 0; j < m; j++ )
        C[i+j*m] += A[i+k*m] * A[j+k*m];
}

/* The benchmarking program */

int main( int argc, char **argv )
{
srand(time(NULL));

   for( int n = 32; n < 100; n = n+1+n/3 )
//   for( int n = 40; n < 41; n = n+1+n/3 )
   {
/* Try different m */
for( int m = 32; m < 100; m = m+1+m/3 )
//for( int m = 48; m < 49; m = m+1+m/3 )
{
    /* Allocate and fill 2 random matrices A, C */
    float *A = (float*) malloc( m * n * sizeof(float) );
    float *C = (float*) malloc( m * m * sizeof(float) );
    float *C_ref = (float*) malloc( m * m * sizeof(float) );
  
    for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1;
    for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1;

    /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */
    double Gflop_s, seconds = -1.0;
    for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )
    {
      /* warm-up */
      dgemm( m, n, A, C );
    
      /* measure time */
      struct timeval start, end;
      gettimeofday( &start, NULL );
      for( int i = 0; i < n_iterations; i++ )
   dgemm( m,n,A, C );
      gettimeofday( &end, NULL );
      seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
    
      /* compute Gflop/s rate */
      Gflop_s = 2e-9 * n_iterations * m * m * n / seconds;
    }
  
    printf( "%d by %d matrix with strip size %d %g Gflop/s ", n, n,m, Gflop_s );
  
    /* Ensure that error does not exceed the theoretical error bound */
      
    /* Set initial C to 0 and do matrix multiply of A*B */
    memset( C, 0, sizeof( float ) * m * m );
    dgemm( m,n, A, C );

    /* Subtract A*B from C using standard dgemm and reference (note that this should be 0 to within machine roundoff) */
    memset( C_ref, 0, sizeof( float ) * m * m );
    dgemm_reference( m,n,A,C_ref );

    /* Subtract the maximum allowed roundoff from each element of C */
    for( int i = 0; i < m*m; i++ ) C[i] -= C_ref[i] ;

    /* After this test if any element in C is still positive something went wrong in square_dgemm */
    for( int i = 0; i < m * m; i++ )
      if( C[i] > 0 ) {
   printf( "FAILURE: error in matrix multiply exceeds an acceptable margin " );
   printf( "Off by: %f, from the reference: %f, at n = %d, m = %d ",C[i], C_ref[i], n, m );
   return -1;
      }

    /* release memory */
    free( C_ref );
    free( C );
    free( A );
}
   }
return 0;
}

Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote