OpenMP threading of a sparse matrix operation does not yield the expected speed-up. What is the reason?

Hi all,

I have implemented a kind of Gauss-Seidel algorithm for a sparse upper-triangular coefficient matrix, where multiplication of a vector with the sparse matrix (forward propagation) is a core feature. For some applications I have to run this process for several independent vectors (separate memory locations), while the sparse coefficient matrix remains unchanged (read-only access). Therefore I thought this process could be executed in parallel over the vectors. I expected an almost linear increase in speed as long as the number of vectors does not exceed the number of cores. However, I observed only a marginal increase in speed. Here is the code:

!! Column-wise forward propagation with a sparse matrix held in
!! compressed-row storage; the loop over columns can be threaded with OpenMP.
module mod_tmp
  implicit none

  !! Sparse matrix in compressed-row storage.
  !! The routines below read, for every row i, the entry at rowpos(i) as the
  !! diagonal and the entries rowpos(i)+1 .. rowpos(i+1)-1 as off-diagonals,
  !! so rowpos has to provide nrows+1 offsets.
  type :: csr
    integer :: nrows, ncols
    integer, allocatable :: colpos(:), rowpos(:)
    real, allocatable :: value(:)
  end type csr

contains

  !! Apply SubLocal to every column of a, one column per loop iteration.
  !!
  !! a : updated in place; column a(:,j) is handed to SubLocal as one vector.
  !! b : not used by this routine. The original source never gave it a type,
  !!     so it is accepted as an untyped, optional placeholder; this keeps
  !!     three-argument positional calls valid whatever is passed.
  !! c : sparse matrix, only read.
  !!
  !! When compiled with OpenMP the columns are distributed over at most
  !! min(omp_get_max_threads(), size(a,2)) threads, never fewer than one,
  !! because num_threads requires a positive value even for an empty a.
  subroutine SubGlobal(a, b, c)
    !$ use omp_lib, only: omp_get_max_threads
    implicit none
    real, intent(inout) :: a(:,:)
    type(*), dimension(..), intent(in), optional :: b
    type(csr), intent(in) :: c
    integer :: ivec, nvec, nthreads

    nvec = size(a, 2)
    ! Default for builds without OpenMP, where the next line is a comment.
    nthreads = 1
    !$ nthreads = max(1, min(omp_get_max_threads(), nvec))

    !$omp parallel do default(none) shared(a, c, nvec, nthreads) private(ivec) &
    !$omp & num_threads(nthreads) schedule(static)
    do ivec = 1, nvec
      call SubLocal(av=a(:, ivec), c=c)
    end do
    !$omp end parallel do
  end subroutine SubGlobal

  !! Sweep over the rows of c and update the vector av in place.
  !!
  !! For each row i: av(i) is divided by the row's diagonal entry, then the
  !! new av(i), scaled by each off-diagonal value of the row, is added to the
  !! elements of av addressed by the corresponding column indices.
  !!
  !! av : vector to update; must be addressable at every index in c%colpos
  !!      that the sweep visits.
  !! c  : sparse matrix, only read.
  !!
  !! The propagation is written as an explicit scalar loop rather than a
  !! vector-subscripted array assignment. The array form may be implemented
  !! by the compiler with gather/scatter temporaries for every row; the loop
  !! needs none. Results agree with the array form as long as a row does not
  !! list the same column twice (the array form is not valid Fortran then).
  subroutine SubLocal(av, c)
    implicit none
    real, intent(inout) :: av(:)
    type(csr), intent(in) :: c
    integer :: irow, k, jcol, first_off, last_off
    real :: pivot

    do irow = 1, c%nrows
      ! Locate the off-diagonal entries of this row.
      first_off = c%rowpos(irow) + 1
      last_off = c%rowpos(irow + 1) - 1

      ! New value of the current element; kept in a scalar so the inner
      ! loop does not re-read av(irow).
      pivot = av(irow) / c%value(c%rowpos(irow))
      av(irow) = pivot

      ! Forward propagate into the elements referenced by this row.
      do k = first_off, last_off
        jcol = c%colpos(k)
        av(jcol) = av(jcol) + c%value(k) * pivot
      end do
    end do
  end subroutine SubLocal

end module mod_tmp