Hi all,
I have implemented a kind of a gauss seidel algorithm for a sparse upper triangular coefficient matrix, where multiplication of a vector with a the sparse matrix (forward propagation) is a core feature. For some application I have to do this process for several independent vectors (memory location), where the spares coefficient matrix remains unchanged (read only access). Therefore I thought this process might be executable in parallel depending on the number of vectors. I expected an almost linear increase in speed as long as the number of vectors does not exceed the number of cores. However, I observed only a marginal increase in speed. Here is the code:
Module mod_tmp Type :: csrmatrixuppertri integer :: nrows, ncols integer, allocatable :: colpos(:), rowpos(:) real, allocatable :: value(:) end type csr contains Subroutine SubGlobal(a,b,c) implicit none Type(csr), intent(in) :: c Real, Intent(inout) :: a(:,:) Integer :: isnt, c1 !$ isnt=minval((/omp_get_max_threads(),int(size(a,2),kind=ikl)/)) !$OMP PARALLEL DO PRIVATE(c1) num_threads(isnt) Do c1=1,size(a,2) call sublocal(a=a(:,c1),c=c) End Do !$OMP END PARALLEL DO End Subroutine SubGlobal Subroutine SubLocal(av,c) Implicit none Type(csr), intent(in) :: c Real, Intent(inout) :: av(:) Integer :: c2, offdiagstart, offdiagend, diag Do c2=1,c%nrows !!get locations in csr diag=c%rowpos(c2) offdiagstart=c%rowpos(c2)+1 offdiagend=c%rowpos(c2+1)-1 !!calculate a new value av(c2)=av(c2)/c%value(diag) !!forward propagate av(c%colpos(offdiagstart:offdiagend))=& &av(c%colpos(offdiagstart:offdiagend))+& &c%value(offdiagstart:offdiagend)*(av(c2)) End do End Subroutine SubLocal End Module mod_tmp