The open source library ArrayFire uses Intel MKL for its GEMM operations, and we recently updated the code to use the batched version of GEMM. We have noticed that using GNU OpenMP or Intel OpenMP as the threading layer gives the expected speedups, but TBB does not. We wanted to bring this to your attention. Below is the ArrayFire benchmark code used to time the GEMM operations.
#include <arrayfire.h>
#include <stdio.h>
#include <math.h>
#include <cstdlib>

using namespace af;

// small wrapper to benchmark
static array A; // populated before each timing

static void fn()
{
    array B = matmul(A, A); // matrix multiply, batched over the 3rd dimension
    B.eval();               // ensure the result is evaluated
}

int main(int argc, char** argv)
{
    double peak = 0;
    try {
        int device = argc > 1 ? atoi(argv[1]) : 0;
        setDevice(device);
        info();
        printf("Benchmark N-by-N matrix multiply\n");
        for (int n = 128; n <= 2048; n += 128) {
            //printf("%4d x %4d: ", n, n);
            A = constant(1, n, n, 3);  // 3 slices, so matmul maps to a batched GEMM
            double time = timeit(fn);  // time in seconds
            // FLOP count of a single n x n multiply
            double gflops = 2.0 * pow(n, 3) / (time * 1e9);
            if (gflops > peak)
                peak = gflops;
            printf("%4.2f\n", gflops);
            fflush(stdout);
        }
    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
        throw;
    }
    printf(" ### peak %g GFLOPS\n", peak);
    return 0;
}

The benchmark results are provided in the form of an interactive chart at this URL.
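For anyone reproducing the numbers, here is a minimal sketch of how the MKL threading layer can be switched between the runs. It assumes ArrayFire is linked against MKL's single dynamic library (mkl_rt); with direct linking the layer is instead fixed by which threading library is on the link line (libmkl_gnu_thread, libmkl_intel_thread, or libmkl_tbb_thread).

#include <mkl.h>
#include <cstdio>

int main()
{
    // Pick one of MKL_THREADING_GNU, MKL_THREADING_INTEL,
    // MKL_THREADING_TBB, MKL_THREADING_SEQUENTIAL.
    // This only takes effect with the single dynamic library (mkl_rt) and
    // has to be called before any other MKL routine; the environment
    // variable MKL_THREADING_LAYER=GNU|INTEL|TBB|SEQUENTIAL is equivalent.
    mkl_set_threading_layer(MKL_THREADING_TBB);

    printf("MKL max threads: %d\n", mkl_get_max_threads());
    return 0;
}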
The usage of the batched GEMM call inside ArrayFire can be found in the following source file.
https://github.com/9prady9/arrayfire/blob/57eb26d03a738c8a99b664dcbe374b...
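For context, below is a minimal standalone sketch of the kind of batched call involved: cblas_dgemm_batch with one group of 3 equally sized multiplies, mirroring the 3 slices in the benchmark. It is illustrative only, not the actual ArrayFire implementation, and the size n is a placeholder.

#include <mkl.h>
#include <vector>

int main()
{
    const MKL_INT n     = 512;  // matrix dimension (placeholder)
    const MKL_INT batch = 3;    // matches the 3 slices in the benchmark

    // One group of 'batch' multiplies, all with identical parameters.
    CBLAS_TRANSPOSE transA = CblasNoTrans, transB = CblasNoTrans;
    MKL_INT m = n, nn = n, k = n, lda = n, ldb = n, ldc = n;
    double alpha = 1.0, beta = 0.0;
    MKL_INT group_size = batch;

    std::vector<double> A(n * n * batch, 1.0), B(n * n * batch, 1.0),
                        C(n * n * batch, 0.0);
    std::vector<const double*> a_ptrs(batch), b_ptrs(batch);
    std::vector<double*>       c_ptrs(batch);
    for (MKL_INT i = 0; i < batch; ++i) {
        a_ptrs[i] = A.data() + i * n * n;
        b_ptrs[i] = B.data() + i * n * n;
        c_ptrs[i] = C.data() + i * n * n;
    }

    // Single call dispatches all 3 multiplies to MKL's batched GEMM.
    cblas_dgemm_batch(CblasColMajor, &transA, &transB,
                      &m, &nn, &k, &alpha,
                      a_ptrs.data(), &lda,
                      b_ptrs.data(), &ldb, &beta,
                      c_ptrs.data(), &ldc,
                      1 /*group_count*/, &group_size);
    return 0;
}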
Thank you,
Pradeep.