      Subroutine mvcur4( trans, m, n, alpha, a, lda, x, incx,
     &                   beta, y, incy, wrk )
! ----------------------------------------------------------------------
! --- Column oriented implementation of matrix-vector multiplication
!     y = beta*y + alpha*A*x   or   y = beta*y + alpha*A'*x
!     with loop unrolling to a depth of 4.
!     Based on the report of Dongarra and Eisenstat.
!     The calling sequence is based on the Level 2 BLAS routine DGEMV
!
!     For TRANS = 'T' the routine performs MPI reductions over
!     MPI_Comm_World: every rank accumulates partial dot products over
!     its M local rows into WRK(1:N), indexing X with its own
!     OFFSET(ME), and the summed segment
!     WRK(OFFSET(k)+1 : OFFSET(k)+SIZES(k)) is delivered to Y on
!     rank k.  The previously scaled beta*y of the receiving rank is
!     folded into that segment so that it survives the reduction.
!
!  Parameters
!  ----------
!  TRANS   - Input  - Specifies the operation to be performed:
!                     TRANS = 'N' implies  y = alpha*A*x  + beta*y.
!                     TRANS = 'T' implies  y = alpha*A'*x + beta*y.
!                     Only upper case 'N' and 'T' are accepted.
!  M, N    - Input  - Dimension of the problem.
!  ALPHA   - Input  - Specifies scalar alpha.
!  A(M,N)  - Input  - Contains the matrix A.
!  LDA     - Input  - Leading Dimension of A.
!  X(*)    - Input  - Contains the vector x.
!  INCX    - Input  - Specifies the increment for the elements of X.
!                     In this implementation, INCX must be 1.  The
!                     parameter is only here for compatibility with
!                     Level 2 BLAS routine DGEMV.
!  BETA    - Input  - Scalar beta; if BETA = 0, then Y need not be set.
!  Y(*)    - In/Out - On entry the vector y (unless BETA = 0); on exit
!                     the result vector y.
!  INCY    - Input  - Specifies the increment for the elements of Y.
!                     In this implementation, INCY must be 1. The
!                     parameter is only here for compatibility with
!                     Level 2 BLAS routine DGEMV.
!  WRK(*)  - Workspace. NOT compatible with DGEMV!
!                     Elements 1..N are overwritten when TRANS = 'T';
!                     it is not referenced when TRANS = 'N'.
!
!  On invalid input EBERR is called with the position of the offending
!  argument in the argument list and the routine returns without
!  touching Y.
! ----------------------------------------------------------------------
      Use                    dist_module
      Use                    numerics
      Implicit None
      Character(Len=1), Intent(In)    :: trans
      Integer,          Intent(In)    :: m, n, lda, incx, incy
      Real(l_),         Intent(In)    :: alpha, beta
      Real(l_),         Intent(In)    :: a(lda,*), x(*)
      Real(l_),         Intent(InOut) :: y(*), wrk(*)

! --- Communication types
      Include                'mpif.h'
      Integer             :: com, type

! --- Local constants
      Real(l_), Parameter :: zero = 0.0_l_, one = 1.0_l_

! --- Local variables
      Integer             :: i, imin, info, j, jmin, k, mybase
      Real(l_)            :: temp, temp1, temp2, temp3
! ----------------------------------------------------------------------
! --- Test the input parameters.  INFO is the position of the first
!     offending argument in the DGEMV-style argument list.

      info = 0
      If ( trans /= 'N' .AND. trans /= 'T' ) Then
         info = 1
      Else If ( m < 0 ) Then
         info = 2
      Else If ( n < 0 ) Then
         info = 3
      Else If ( lda < MAX( 1, m ) ) Then
         info = 6
      Else If ( incx /= 1 ) Then
         info = 8
      Else If ( incy /= 1 ) Then
         info = 11
      End If
      If ( info /= 0 ) Then
         Call eberr ( 'mvcur4', info )
         Return
      End If
! ----------------------------------------------------------------------
! --- Start the operations; First form y := beta*y
!     BETA = 0 is handled by assignment so that an unset Y (possibly
!     holding NaN/Inf) never enters the arithmetic.

      If ( beta /= one ) Then
         If ( beta == zero ) Then
            y(1:m) = zero
         Else
            y(1:m) = beta*y(1:m)
         End If
      End If

! --- Nothing to add when alpha is zero.
!     NOTE(review): for TRANS = 'T' this also skips the MPI reductions
!     below, so ALPHA presumably has to agree on all ranks - confirm.
      If ( alpha == zero ) Return

      If ( trans == 'N' ) Then
! ----------------------------------------------------------------------
! --- Form  y := y + alpha*A*x
!     Clean-up first: one column if N is odd, then two more columns if
!     Mod(N,4) >= 2, so that the remaining columns are a multiple of 4.

        j = Mod ( n, 2 )
        If ( j >= 1 ) Then
           temp = alpha * x(j)
           Do i = 1, m
              y(i) = y(i) + a(i,j) * temp
           End Do
        End If
        j = Mod ( n, 4 )
        If ( j >= 2 ) Then
           temp1 = alpha * x(j-1)
           temp  = alpha * x(j)
           Do i = 1, m
              y(i) = y(i) + a(i,j-1) * temp1 + a(i,j) * temp
           End Do
        End If

! --- Main loop: four columns per pass; J is the last column of each
!     group of four.
        jmin = j + 4
        Do j = jmin, n, 4
           temp3 = alpha * x(j-3)
           temp2 = alpha * x(j-2)
           temp1 = alpha * x(j-1)
           temp  = alpha * x(j)
           Do i = 1, m
              y(i) = y(i) + a(i,j-3) * temp3 + a(i,j-2) * temp2
     &                    + a(i,j-1) * temp1 + a(i,j  ) * temp
           End Do
        End Do
      Else
! ----------------------------------------------------------------------
! --- Form  y := y + alpha*A'*x; generate partial results.
!     Each column J yields one dot product over the M local rows; X is
!     addressed through this rank's offset MYBASE.  The same 1/2/4
!     clean-up scheme as above is applied to the row index.

        mybase = offset(me)
        Do j = 1, n
           temp = zero
           i = Mod ( m, 2 )
           If ( i >= 1 ) Then
             temp = temp + a(i,j) * x(i+mybase)
           End If
           i = Mod ( m, 4 )
           If ( i >= 2 ) Then
              temp = temp + a(i-1,j) * x(i+mybase-1) +
     &                      a(i,j)   * x(i+mybase)
           End If
           imin = i + 4
           Do i = imin, m, 4
              temp = temp + a(i-3,j) * x(i+mybase-3) +
     &                      a(i-2,j) * x(i+mybase-2) +
     &                      a(i-1,j) * x(i+mybase-1) +
     &                      a(i,j)   * x(i+mybase)
           End Do
           wrk(j) = alpha * temp
        End Do
! ----------------------------------------------------------------------
! --- Preserve beta*y.
!     MPI_Reduce below overwrites Y on the receiving rank, which would
!     discard the scaled vector formed above.  Adding it to this
!     rank's own segment of WRK carries it through the sum, so that
!     y = beta*y + alpha*A'*x as documented.  Only the elements that
!     were both scaled (1..M) and are received (1..SIZES(ME)) are
!     touched; M is expected to equal SIZES(ME).

        Do i = 1, Min( m, sizes(me) )
           wrk(mybase+i) = wrk(mybase+i) + y(i)
        End Do
! ----------------------------------------------------------------------
! --- Combine partial results: one reduction per rank K, each summing
!     the segment of WRK that belongs to K into Y on rank K.
!     NOTE(review): MPI_Real8 assumes Real(l_) is an 8-byte real -
!     confirm against module numerics.  The MPI status returned in
!     INFO is not examined.

        type = MPI_Real8
        com  = MPI_Comm_World
        Do k = 0, nodes - 1
           Call MPI_Reduce( wrk(offset(k)+1), y, sizes(k), type,
     &                      MPI_Sum, k, com, info )
        End Do
      End If
! ----------------------------------------------------------------------
      End Subroutine mvcur4
