How to combine the functions to make the runtime shorter without any other extra elements
ID: 3832320 • Letter: H
Question
/*
 * Question: how can the optimized functions below be combined to make the
 * runtime shorter without any other extra elements? (The three functions
 * need to be combined together.)
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <immintrin.h>
#include <x86intrin.h>

#define ALIGN __attribute__ ((aligned (32)))

/* Side length of the square matrices used by main(). */
#define MATRIX_DIM 1024

/*
 * Global matrices, each holding MATRIX_DIM * MATRIX_DIM doubles and aligned
 * to 32 bytes. Element (i, j) of an n x n matrix lives at index i*n + j.
 * a and b are the inputs, c receives the result of the optimized kernels,
 * and c1 receives the result of the reference dgemm().
 */
double ALIGN a[MATRIX_DIM * MATRIX_DIM];
double ALIGN b[MATRIX_DIM * MATRIX_DIM];
double ALIGN c[MATRIX_DIM * MATRIX_DIM];
double ALIGN c1[MATRIX_DIM * MATRIX_DIM];

/*
 * Reference implementation: for every 0 <= i, j < n, stores the sum over k
 * of a[i*n+k] * b[k*n+j] into c1[i*n+j]. Previous contents of c1 are
 * overwritten.
 */
void dgemm(int n)
{
    int i, j, k;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            double cij = 0;

            for (k = 0; k < n; k++) {
                cij = cij + a[i*n+k] * b[k*n+j];
            }
            c1[i*n+j] = cij;
        }
    }
}

/*
 * AVX version: handles four adjacent columns of one row of c per iteration.
 *
 * The accumulator is loaded from c, so this computes c += a * b; c must be
 * zero beforehand to obtain the plain product.
 *
 * Precondition: n must be a multiple of 4. The j loop advances by 4 with no
 * tail handling, and _mm256_load_pd / _mm256_store_pd need 32-byte aligned
 * addresses, which &c[i*n+j] and &b[k*n+j] only are when n is a multiple
 * of 4.
 */
void dgemm_intrin(int n)
{
    int i, j, k;

    for (i = 0; i < n; i += 1) {
        for (j = 0; j < n; j += 4) {
            __m256d c4 = _mm256_load_pd(&c[i*n+j]);

            for (k = 0; k < n; k++) {
                /* Replicate a[i][k] into all four lanes. */
                __m256d a4 = _mm256_broadcast_sd(&a[i*n+k]);
                __m256d b4 = _mm256_load_pd(&b[k*n+j]);

                c4 = _mm256_add_pd(c4, _mm256_mul_pd(a4, b4));
            }
            _mm256_store_pd(&c[i*n+j], c4);
        }
    }
}

/*
 * Scalar version with the k loop unrolled by four. Overwrites c[i*n+j] with
 * the dot product of row i of a and column j of b.
 *
 * The unrolled loop only runs while four full terms remain; a scalar tail
 * loop handles the last n % 4 terms, so no element past index n-1 of a row
 * or column is read when n is not a multiple of 4. For n divisible by 4 the
 * tail loop does not execute and the summation order is unchanged.
 */
void dgemm_unrolling(int n)
{
    int i, j, k;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            double cij = 0;

            for (k = 0; k + 3 < n; k += 4) {
                double s1 = a[i*n+k]     * b[k*n+j];
                double s2 = a[i*n+(k+1)] * b[(k+1)*n+j];
                double s3 = a[i*n+(k+2)] * b[(k+2)*n+j];
                double s4 = a[i*n+(k+3)] * b[(k+3)*n+j];

                cij += s1 + s2 + s3 + s4;
            }
            for (; k < n; k++) {
                cij += a[i*n+k] * b[k*n+j];
            }
            c[i*n+j] = cij;
        }
    }
}

#define BLOCK_SIZE 4

/* Returns the smaller of two ints; used to clamp tile bounds to n. */
static int min_int(int x, int y)
{
    return x < y ? x : y;
}

/*
 * Accumulates one tile product into c: for every (i, j) in the tile that
 * starts at (si, sj), adds the sum over k in [sk, sk + BLOCK_SIZE) of
 * a[i*n+k] * b[k*n+j] to c[i*n+j].
 *
 * All three ranges are clamped to n, so tiles that stick out past the
 * matrix edge (n not a multiple of BLOCK_SIZE) do not access out-of-range
 * elements.
 */
void do_block(int n, int si, int sj, int sk, double *a, double *b, double *c)
{
    const int i_end = min_int(si + BLOCK_SIZE, n);
    const int j_end = min_int(sj + BLOCK_SIZE, n);
    const int k_end = min_int(sk + BLOCK_SIZE, n);
    int i, j, k;

    for (i = si; i < i_end; i++) {
        for (j = sj; j < j_end; j++) {
            double cij = c[i*n+j];

            for (k = sk; k < k_end; k++) {
                cij += a[i*n+k] * b[k*n+j];
            }
            c[i*n+j] = cij;
        }
    }
}

/*
 * Cache-blocked version: walks c in BLOCK_SIZE x BLOCK_SIZE tiles and, for
 * each tile, sums the contributions of every k tile via do_block().
 *
 * The whole output tile is cleared before accumulation. Clearing only its
 * top-left element would leave the remaining entries accumulating onto
 * whatever c held before the call.
 */
void dgemm_blocking(int n)
{
    int i, j, k;

    for (i = 0; i < n; i += BLOCK_SIZE) {
        for (j = 0; j < n; j += BLOCK_SIZE) {
            const int i_end = min_int(i + BLOCK_SIZE, n);
            const int j_end = min_int(j + BLOCK_SIZE, n);
            int ti, tj;

            for (ti = i; ti < i_end; ti++) {
                for (tj = j; tj < j_end; tj++) {
                    c[ti*n+tj] = 0;
                }
            }
            for (k = 0; k < n; k += BLOCK_SIZE) {
                do_block(n, i, j, k, a, b, c);
            }
        }
    }
}

/*
 * Implement this function with multiple optimization techniques.
 * Currently it delegates to the blocked kernel only.
 */
void optimized_dgemm(int n)
{
    dgemm_blocking(n);
}

/*
 * Fills a and b with pseudo-random values in [0, 1), times one call to
 * optimized_dgemm(MATRIX_DIM) and prints the elapsed wall-clock time in
 * milliseconds. When the first command-line argument is "corr", the result
 * in c is also compared element by element against the reference dgemm()
 * result in c1 with an absolute tolerance of 1e-7.
 */
int main(int argc, char **argv)
{
    int i, j;
    time_t t;
    struct timeval start, end;
    double elapsed_time;
    int check_correctness = 0;
    int correct = 1;

    if (argc > 1 && strcmp(argv[1], "corr") == 0) {
        check_correctness = 1;
    }

    /* Initialize random number generator */
    srand((unsigned) time(&t));

    /* Populate the arrays with random values */
    for (i = 0; i < MATRIX_DIM; i++) {
        for (j = 0; j < MATRIX_DIM; j++) {
            a[i*MATRIX_DIM+j] = (double)rand() / (RAND_MAX + 1.0);
            b[i*MATRIX_DIM+j] = (double)rand() / (RAND_MAX + 1.0);
            c[i*MATRIX_DIM+j] = 0.0;
            c1[i*MATRIX_DIM+j] = 0.0;
        }
    }

    gettimeofday(&start, NULL);

    /* Call your optimized function optimized_dgemm */
    optimized_dgemm(MATRIX_DIM);

    gettimeofday(&end, NULL);

    /* For TA use only */
    if (check_correctness) {
        dgemm(MATRIX_DIM);

        /* Stop at the first mismatch; both loops test `correct`. */
        for (i = 0; (i < MATRIX_DIM) && correct; i++) {
            for (j = 0; (j < MATRIX_DIM) && correct; j++) {
                if (fabs(c[i*MATRIX_DIM+j] - c1[i*MATRIX_DIM+j]) >= 0.0000001) {
                    printf("%f != %f ", c[i*MATRIX_DIM+j], c1[i*MATRIX_DIM+j]);
                    correct = 0;
                }
            }
        }

        if (correct) {
            printf("Result is correct! ");
        } else {
            printf("Result is incorrect! ");
        }
    }

    /* Seconds and microseconds are both converted to milliseconds. */
    elapsed_time = (end.tv_sec - start.tv_sec) * 1000.0;
    elapsed_time += (end.tv_usec - start.tv_usec) / 1000.0;
    printf("dgemm finished in %f milliseconds. ", elapsed_time);

    return 0;
}
Explanation / Answer
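#include <stdio.h> #include <stdlib.h> #include <time.h> #include <sys/time.h> #include <string.h> #include <math.h> #include <immintrin.h> #include <x86intrin.h> #define ALIGN __attribute__ ((aligned (32))) double ALIGN a[1024 * 1024]; double ALIGN b[1024 * 1024]; double ALIGN c[1024 * 1024]; double ALIGN c1[1024 * 1024]; void dgemm(int n) { int i,j,k; for(i=0; i<n; … [the remainder of the answer is cut off in the source]
Related Questions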
Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.