Hi,
I recently noticed that when using a threaded 1-dimensional DFT, a DFTI_COMPLEX domain DFT does not appear to respect the DFTI_THREAD_LIMIT and instead always uses the threading value set by mkl_set_num_threads(). Furthermore, it appears that while a REAL domain DFT does obey DFTI_THREAD_LIMIT, its behavior has changed between MKL v11.1 and 11.2.
In MKL v11.1, setting mkl_set_num_threads(4) followed by DftiSetValue(dft_handle, DFTI_THREAD_LIMIT, 2) caused a complex-valued DFT to spawn and use 4 threads while a real-valued DFT only spawned and used 2 threads. In MKL 11.2, the complex-valued DFT still spawned and used 4 cores, but now the real-valued DFT also spawned 4 cores but only utilized 2 of them (2 cores were utilized at 90%+ and 2 were utilized at ~10%). Below is the example code I've been using to replicate this behavior. I was wondering if I'm doing something wrong or possibly misunderstanding the expected behavior of DFTI_THREAD_LIMIT.
// mkl_thread_test.cpp - Computes large threaded DFTs // arg1 = 'C' for complex domain or 'R' for real (optional, default REAL) // arg2 = scale factor for DFT (optional, default 1) #include <iostream> #include <cstdlib> #include <vector> #include <string.h> #include <mkl.h> #include <omp.h> using namespace std; int main(int argc, char* argv[]) { int mklThreads = 4; int dftThreads = 2; int dftSize = 10000000; int loops = 100; DFTI_CONFIG_VALUE domain = DFTI_REAL; int sizeMultiplier = 1; if(argv[1][0] == 'C') { domain = DFTI_COMPLEX; sizeMultiplier = 2; } float scale = 1.0f; if(argc > 2) scale = atof(argv[2]); // print version number for reference char version[DFTI_VERSION_LENGTH]; DftiGetValue(0, DFTI_VERSION, version); cerr<<"MKL Version: "<<version<<endl; vector<float> vin((dftSize+loops)*sizeMultiplier); vector<float> vout(dftSize*sizeMultiplier); vector<float> vtmp(loops); // for saving output to avoid compiler optimizing out computation for(int i=0; i<vin.size(); ++i) { vin[i] = float(rand())/float(RAND_MAX)-.5; } MKL_LONG status; DFTI_DESCRIPTOR_HANDLE dft; mkl_set_num_threads(mklThreads); status = DftiCreateDescriptor(&dft, DFTI_SINGLE, domain, 1, dftSize); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiSetValue(dft, DFTI_PLACEMENT, DFTI_INPLACE); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiSetValue(dft, DFTI_PACKED_FORMAT, DFTI_PERM_FORMAT); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiSetValue(dft, DFTI_ORDERING, DFTI_ORDERED); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiSetValue(dft, DFTI_FORWARD_SCALE, scale); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiSetValue(dft, DFTI_THREAD_LIMIT, dftThreads); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; status = DftiCommitDescriptor(dft); if (status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; cerr<<"Computing 
"<<loops; if(domain == DFTI_COMPLEX) cerr<<" COMPLEX"; else cerr<<" REAL"; cerr<<" DFTs of size "<<dftSize<<" using "<<mklThreads<<" MKL threads but limiting DFT to "<<dftThreads<<" threads"<<endl; MKL_LONG threadLimit; status = DftiGetValue(dft, DFTI_THREAD_LIMIT, &threadLimit); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; cerr<<"Thread limit: "<<threadLimit<<endl; for(int i=0; i<loops; ++i) { memcpy(&vout[0], &vin[i], dftSize*sizeMultiplier*sizeof(float)); status = DftiComputeForward(dft, &vout[0]); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; vtmp[i] = vout[0]; // save a value to avoid optimizing out the DFT computation } cerr<<"Finished execution"<<endl; status = DftiFreeDescriptor(&dft); if(status != DFTI_NO_ERROR) cerr<<DftiErrorMessage(status)<<endl; return 0; }
For reference, I'm running on a quad-core processor running 64-bit Linux (Ubuntu 14.04) and compiling with icpc v14.0.4 (for MKL v11.1) and v15.0.2 (for MKL v11.2). My compile line looks like:
icpc -O3 -xHost -openmp -I${MKLROOT}/include -o mkl_thread_test mkl_thread_test.cpp -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_intel_thread -lpthread -lm -liomp5
When I call ./mkl_thread_test C, I notice 4 cores spinning at 95%. When I call ./mkl_thread_test R, I notice just 2 cores spinning at 95% for MKL v11.1, and 2 cores spinning at 95% plus 2 more cores spinning at 10% for MKL v11.2. My exact versions of MKL are 11.1.4 Product Build 20140806 and 11.2.2 Product Build 20150120. In both cases, the value returned by DftiGetValue(DFTI_THREAD_LIMIT) is 2, so my expectation is that the DFT should only be using 2 threads regardless of MKL version or real vs. complex DFT.
Am I doing something wrong or should MKL be respecting the value of DFTI_THREAD_LIMIT?
--Nick