Dear. Intel Members,
I have a question about distributing data among processes when using Intel MKL cluster FFT functions for 1D transforms. I am testing 1D FFT of an array of six complex numbers. The result obtained using one process is correct. However, the result obtained using two processes has wrong indices (the numbers are correct). The results and source codes are pasted below. I am confused about how data are distributed among processes, although I have read through the manual. Any advice is greatly appreciated.
Sirui
________________________________________
one process results:
local_nx= 6 for rank= 0
local_x_start= 1 for rank= 0
local_out_nx= 6 for rank= 0
local_out_x_start= 1 for rank= 0
local_size= 6 for rank= 0
input global array
(0.0000000E+00,1.000000) 1
(1.000000,1.000000) 2
(2.000000,1.000000) 3
(3.000000,1.000000) 4
(4.000000,1.000000) 5
(5.000000,1.000000) 6
output local array with global index
(15.00000,6.000000) 1 for rank= 0
(-3.000000,5.196153) 2 for rank= 0
(-3.000000,1.732051) 3 for rank= 0
(-3.000000,0.0000000E+00) 4 for rank= 0
(-3.000000,-1.732051) 5 for rank= 0
(-3.000000,-5.196152) 6 for rank= 0
successfully done
Two processes result:
local_nx= 4 for rank= 0
local_x_start= 1 for rank= 0
local_out_nx= 4 for rank= 0
local_out_x_start= 1 for rank= 0
local_size= 4 for rank= 0
input global array
(0.0000000E+00,1.000000) 1
(1.000000,1.000000) 2
(2.000000,1.000000) 3
(3.000000,1.000000) 4
(4.000000,1.000000) 5
(5.000000,1.000000) 6
local_nx= 2 for rank= 1
local_x_start= 5 for rank= 1
local_out_nx= 2 for rank= 1
local_out_x_start= 5 for rank= 1
local_size= 3 for rank= 1
(-3.000000,0.0000000E+00) 5 for rank= 1
(-3.000000,-1.732051) 6 for rank= 1
(-3.000000,-5.196152) 7 for rank= 1
output local array with global index
(15.00000,6.000000) 1 for rank= 0
(-3.000000,5.196153) 2 for rank= 0
(-3.000000,1.732051) 3 for rank= 0
(-3.000000,-1.732051) 4 for rank= 0
successfully done
My source code is the following.
program main
USE mmpivardef
USE MKL_CDFT
USE mpi
IMPLICIT NONE
complex(4), allocatable, dimension(:) :: in,work,in_local
INTEGER(4), parameter :: N=6
integer(4) :: i,j,localsize
INTEGER(4) :: status,local_nx,x_start,local_out_nx,out_xstart
TYPE(DFTI_DESCRIPTOR_DM), POINTER :: My_Desc1_Handle
CALL MPI_INIT(ierr)
CALL MPI_COMM_DUP(MPI_COMM_WORLD,MCW,ierr)
CALL MPI_COMM_RANK(MCW,rank,ierr)
CALL MPI_COMM_SIZE(MCW,msize,ierr)
allocate(in(N))
status = DftiCreateDescriptorDM(MCW,My_Desc1_Handle, &
DFTI_SINGLE,DFTI_COMPLEX,1,N)
status = DftiGetValueDM(My_Desc1_Handle,CDFT_LOCAL_SIZE,localsize)
status = DftiGetValueDM(My_Desc1_Handle,CDFT_LOCAL_NX,local_nx)
status = DftiGetValueDM(My_Desc1_Handle,CDFT_LOCAL_X_START, x_start)
status = DftiGetValueDM(My_Desc1_Handle,CDFT_LOCAL_OUT_NX, local_out_nx)
status = DftiGetValueDM(My_Desc1_Handle,CDFT_LOCAL_OUT_X_START, out_xstart)
write(*,*) 'local_nx=',local_nx,'for rank=',rank
write(*,*) 'local_x_start=',x_start,'for rank=',rank
write(*,*) 'local_out_nx=',local_out_nx,'for rank=',rank
write(*,*) 'local_out_x_start=',out_xstart,'for rank=',rank
write(*,*) 'local_size=',localsize,'for rank=',rank
write(*,*)
ALLOCATE(in_local(localsize))
ALLOCATE(work(localsize))
status = DftiSetValueDM(My_Desc1_Handle,CDFT_WORKSPACE,work)
do i=1,N
j=i-1
in(i)=cmplx(j,1)
enddo
IF (rank.eq.0) THEN
write(*,*) 'input global array'
Do i=1,N
write(*,*) in(i),i
ENDDO
ENDIF
write(*,*)
DO i=1,localsize
in_local(i) = in(i+x_start-1)
ENDDO
status = DftiCommitDescriptorDM(My_Desc1_Handle)
status = DftiComputeForwardDM(My_Desc1_Handle,in_local)
IF (rank.eq.0) write(*,*) 'output local array with global index'
DO i=1,localsize
write(*,*) in_local(i),i+x_start-1,'for rank=',rank
ENDDO
status = DftiFreeDescriptorDM(My_Desc1_Handle)
DEALLOCATE(in_local,work,in)
IF (rank.eq.0) write(*,*) 'successfully done'
CALL MPI_FINALIZE(ierr)
end program