Dear all,
I am new to the forum and, of course, to MKL (though I've used TBB before). I am using the MKL Link Helper to compile and link the first C example, dgemm_threading_effect_example.c, but I cannot figure out how to use TBB.
I know it is possible to use just TBB without OpenMP (which I don't have, being on a Mac), but it seems that I need to link the mkl_sequential library, in which case no threads can be used.
Below you can find the example with my few added lines of code, and here are my linker switches:
-L/usr/local/lib -ltbb -ltbbmalloc -L/opt/intel/compilers_and_libraries_2016/mac/mkl/lib -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential
Thanks for any help you can give me!
Franco
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"
#include <tbb/task_scheduler_init.h>
/* LOOP_COUNT is the number of timed dgemm calls per thread count; the */
/* reported time is the average over these calls. */
/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10
/*
 * Measures how the run time of cblas_dgemm (C = alpha*A*B + beta*C) changes
 * with the number of threads Intel(R) MKL is asked to use.
 *
 * The program allocates three 64-byte aligned matrices, fills them with
 * deterministic values, and then, for every thread count from 1 up to
 * mkl_get_max_threads(), performs one warm-up product followed by LOOP_COUNT
 * timed products and prints the average time per product.
 *
 * Returns 0 on success, 1 if a matrix could not be allocated.
 */
int main()
{
    double *A, *B, *C;
    int m, n, p, i, j, r, max_threads;
    double alpha, beta;
    double s_initial;
    /* Initialised so the final check is well defined even if the sweep
       below never executes. */
    double s_elapsed = 0.0;

    printf ("\n This example demonstrates threading impact on computing real matrix product \n"
            " C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000;
    p = 200;
    n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0;
    beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    /* Widen to size_t before multiplying so the byte count is not computed
       in int arithmetic. */
    A = (double *)mkl_malloc( (size_t)m * p * sizeof( double ), 64 );
    B = (double *)mkl_malloc( (size_t)p * n * sizeof( double ), 64 );
    C = (double *)mkl_malloc( (size_t)m * n * sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    /* Request a 4-thread TBB scheduler and a global MKL thread count of 4.
       NOTE(review): the original author observed a single thread here; that
       is presumably because the program was linked against the sequential
       MKL layer (mkl_sequential), in which case a threaded layer such as
       mkl_tbb_thread would be needed -- confirm against the MKL link-line
       documentation. */
    tbb::task_scheduler_init scheduler(4);
    mkl_set_num_threads(4);
    /* Deliberately no mkl_set_num_threads_local() call: a thread-local
       thread count takes precedence over the global one, so it would make
       the per-iteration mkl_set_num_threads(i) below ineffective and every
       run would use the same number of threads. */

    printf (" Finding max number of threads Intel(R) MKL can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf (" Running Intel(R) MKL from 1 to %i threads \n\n", max_threads);
    for (i = 1; i <= max_threads; i++) {
        /* Start every thread count from the same zeroed result matrix. */
        for (j = 0; j < (m*n); j++) {
            C[j] = 0.0;
        }

        printf (" Requesting Intel(R) MKL to use %i thread(s) \n\n", i);
        mkl_set_num_threads(i);

        printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface to get stable run time measurements \n\n");
        /* Untimed warm-up call. */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);

        printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface on %i thread(s) \n\n", i);
        s_initial = dsecnd();
        for (r = 0; r < LOOP_COUNT; r++) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, p, alpha, A, p, B, n, beta, C, n);
        }
        /* Average seconds per dgemm call. */
        s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

        printf (" == Matrix multiplication using Intel(R) MKL dgemm completed ==\n"
                " == at %.5f milliseconds using %d thread(s) ==\n\n", (s_elapsed * 1000), i);
    }

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    /* If the last measured configuration ran for well under a second in
       total, suggest a larger LOOP_COUNT. A non-positive elapsed time is
       skipped: it would divide by zero and yield a value that cannot be
       converted to int. */
    if (s_elapsed > 0.0 && s_elapsed < 0.9/LOOP_COUNT) {
        double scale = 1.0/LOOP_COUNT/s_elapsed;
        int suggested = (int)(scale*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", suggested);
    }

    printf (" Example completed. \n\n");
    return 0;
}