      Program mod2a
! **********************************************************************
! *** This program is a distributed-memory version of                ***
! *** EuroBen Benchmark program MOD2A (generalized matrix-vector     ***
! *** multiplication).                                               ***
! ***                                                                ***
! *** Copyright: European Benchmark Group p/o                        ***
! ***            Computational Physics Dept. Utrecht                 ***
! ***            P.O. Box 80125                                      ***
! ***            3508 TD Utrecht                                     ***
! ***            The Netherlands                                     ***
! ***                                                                ***
! *** Author of this program: Aad J. van der Steen                   ***
! *** Date                    04/29/1998                             ***
! *** Based on the shared-memory version of Ruud van der Pas,        ***
! ***                                           11/24/1988           ***
! *** and Peter de Rijk                       , 02/01/1993           ***
! *** Improved communication                  , 05/14/1996           ***
! *** Total rework                            , 04/29/1998           ***
! **********************************************************************
! --- Version 3.3
!
! --- Purpose of program Mod2a
!     ------------------------
!     This program measures the performance of the matrix-vector product
!     y = A x  and  y = A' x, for a full  n x n  matrix A  with a vector
!     of length  n.
!     Several variants are examined.
!
! --- Remarks
!     -------
!     It is possible to measure the performance of a special library
!     routine for matrix vector multiplication.
!     If such a routine is available, the recipe is:
!       - Replace the string DGEMV in character string LNAME by the
!         appropriate name.
!       - Set parameter ILIB to a non-zero integer value.
!       - Activate in subroutine LIBVER the call to DGEMV or replace
!         this statement with the appropriate call to the library
!         routine.
!     Warning: To prevent making mistakes, there is NO source version of
!              DGEMV included in this program.
!              The default situation is ILIB=0 i.e. no library version
!              present!
!
! --- A EuroBen encapsulation routine is used for setting the network 
!     (csetup). In addition, the EuroBen status routine 'state', and
!     the routine for accessing. Routine 'state' should be customized
!     for the machine at hand.
!
! --- Presently, the maximum number of nodes allowed is 2048. To
!     increase this bound, increase the value of parameter 'maxnod'
!     in all relevant routines.
! ----------------------------------------------------------------------
      Use                   dist_module
      Use                   numerics
      Integer, Parameter :: nmax = 2500 
      Integer, Parameter :: ncases = 4, lda = nmax, mxcase = 50,
     &                      ilib = 0

      Character          :: modnam*8, text(ncases+1)*20, lname*43
      Integer            :: icase, info, ndim(mxcase)
      Integer            :: i, j, k, n, m, mbase, nrep, ipoint,
     &                      lchk(nmax)
      Real(l_)           :: tiperf((ncases+1)*mxcase,4), extime, perfor
      Real(l_)           :: x(nmax), y(nmax), a(lda,nmax), wrk(nmax), 
     &                      ychk(nmax)

      External              mvrpln, mvrur4, mvcpln, mvcur4
      Data modnam /        'mod2a   ' /

!                           123456789 123456789 1
      Data text   /        'Rows,    plainly    ' ,
     &                     'Rows,    unrolled 4 ' ,
     &                     'Columns, plainly    ' ,
     &                     'Columns, unrolled 4 ' ,
     &                     'Library routine(s)  ' /
!
!                           123456789 123456789 123456789 123456789 1234
      Data lname  /        'Library routine(s) used: DGEMV           ' /
! ----------------------------------------------------------------------
! --- Set up communication network and print test status.

      Call csetup
      If ( me == 0 ) Call state (modnam)
! ----------------------------------------------------------------------
! --- Open file containing the repetition factor and the problem size
!     for each problem. Read and compute until EOF (maximum no. of
!     cases allowed is mxcase = 50). The maximum problem size
!     nmax = 2500.
!
!     The input file must already exist; it is opened read-only so
!     that a missing file is reported instead of being created empty.

      icase = 0
      Open( 1, File = 'mod2a.in', Status = 'old', Action = 'read' )
   10 Read( 1, *, End = 20 ) nrep, n

! --- Validate the new case BEFORE it is counted: icase must only
!     count cases that are actually computed, because the print
!     section at label 20 reports cases 1..icase and indexes ndim
!     and tiperf with it.

      If ( icase >= mxcase ) Then
         If ( me == 0 ) Print *, 'More than mxcase = ', mxcase, 
     &                    ': increase mxcase'
         Go To 20
      End If
      If ( n > nmax ) Then
         If ( me == 0 ) Print *, 'n > nmax = ', nmax,
     &                           ': increase nmax'
         Go To 20
      End If
      icase = icase + 1
      ndim(icase) = n
! ----------------------------------------------------------------------
! --- Distribute a as evenly as possible over the available processors.

! --- evdist and bsaddr are external EuroBen helpers; presumably they
!     fill the distribution tables (sizes and base addresses) for a
!     problem of order n -- confirm against their sources.
      Call evdist( n )
      Call bsaddr
! --- m is this process' entry of the sizes table; it is passed as the
!     first dimension argument to the timing routines below.
      m = sizes(me)

! --- Generate the matrix A and vector x.

      Call matgen( a, x, lda, n )
! ----------------------------------------------------------------------
! --- For checking the correctness we generate the solution vector by
!     independent means:
!     (ychk is the reference handed to every 'check' call below.)

      Call mkbnds( a, lda, ychk )
! ----------------------------------------------------------------------
!                                                ** Row oriented variant
      ipoint = icase

! --- 'N' run with kernel mvrpln: keep time and performance in
!     columns 1-2 of tiperf, then check y against ychk.

      Call timing( 'N', m, n, a, lda, x, y, mvrpln, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,1) = extime
      tiperf(ipoint,2) = perfor
      Call check( 'mvrpln', 'N', y, ychk, lchk )

! --- 'T' run with kernel mvrpln: keep time and performance in
!     columns 3-4 of tiperf, then check y against ychk.

      Call timing( 'T', m, n, a, lda, x, y, mvrpln, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,3) = extime
      tiperf(ipoint,4) = perfor
      Call check( 'mvrpln', 'T', y, ychk, lchk )
! ----------------------------------------------------------------------
!                            ** Row oriented variant with loop unrolling 
      ipoint = icase + mxcase

! --- 'N' run with kernel mvrur4: keep time and performance in
!     columns 1-2 of tiperf, then check y against ychk.

      Call timing( 'N', m, n, a, lda, x, y, mvrur4, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,1) = extime
      tiperf(ipoint,2) = perfor
      Call check( 'mvrur4', 'N', y, ychk, lchk )

! --- 'T' run with kernel mvrur4: keep time and performance in
!     columns 3-4 of tiperf, then check y against ychk.

      Call timing( 'T', m, n, a, lda, x, y, mvrur4, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,3) = extime
      tiperf(ipoint,4) = perfor
      Call check( 'mvrur4', 'T', y, ychk, lchk )
! ----------------------------------------------------------------------
!                                             ** Column oriented variant
      ipoint = icase + 2*mxcase

! --- 'N' run with kernel mvcpln: keep time and performance in
!     columns 1-2 of tiperf, then check y against ychk.

      Call timing( 'N', m, n, a, lda, x, y, mvcpln, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,1) = extime
      tiperf(ipoint,2) = perfor
      Call check( 'mvcpln', 'N', y, ychk, lchk )

! --- 'T' run with kernel mvcpln: keep time and performance in
!     columns 3-4 of tiperf, then check y against ychk.

      Call timing( 'T', m, n, a, lda, x, y, mvcpln, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,3) = extime
      tiperf(ipoint,4) = perfor
      Call check ( 'mvcpln', 'T', y, ychk, lchk )
! ----------------------------------------------------------------------
!                         ** Column oriented variant with loop unrolling
      ipoint = icase + 3*mxcase

! --- 'N' run with kernel mvcur4: keep time and performance in
!     columns 1-2 of tiperf, then check y against ychk.

      Call timing( 'N', m, n, a, lda, x, y, mvcur4, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,1) = extime
      tiperf(ipoint,2) = perfor
      Call check( 'mvcur4', 'N', y, ychk, lchk )

! --- 'T' run with kernel mvcur4: keep time and performance in
!     columns 3-4 of tiperf, then check y against ychk.

      Call timing( 'T', m, n, a, lda, x, y, mvcur4, nrep, extime,
     &             perfor, wrk )
      tiperf(ipoint,3) = extime
      tiperf(ipoint,4) = perfor
      Call check( 'mvcur4', 'T', y, ychk, lchk )
! ----------------------------------------------------------------------
!                                             ** Special library variant
      If ( ilib /= 0 ) Then
         ipoint = icase + 4*mxcase

! --- Special library version of matrix vector multiplication

         Call libver( 'N', m, n, a, lda, x, y, 
     &                nrep, extime, perfor, wrk )

! --- Check correctness:

         Call check( 'libver', 'N', y, yck, lchk )

         tiperf(ipoint,1) = extime
         tiperf(ipoint,2) = perfor

         Call libver( 'T', m, n, a, lda, x, y, 
     &                nrep, extime, perfor, wrk )

! --- Check correctness:

         Call check( 'libver', 'T', y, yck, lchk )

         tiperf(ipoint,3) = extime
         tiperf(ipoint,4) = perfor
      EndIf
! ----------------------------------------------------------------------
! >>> Get new case at label 10 <<<

      Go To 10
! ----------------------------------------------------------------------
!                                                  ** Print the results.
   20 If ( me == 0 ) Then
! --- Only process 0 reports. One table per variant is printed; the
!     library table (k = ncases+1) only when ilib is non-zero.
!     Row (k-1)*mxcase + i of tiperf holds case i of variant k.
!     The case loop runs over the cases that were read (never more
!     than mxcase), so nothing is printed when no case was read.
         Print 9010, nodes, nrep

         Do k = 1, ncases + 1
            If ( k > ncases .and. ilib == 0 ) Exit
            Do i = 1, Min( icase, mxcase )
               ipoint = (k-1)*mxcase + i
               If ( i == 1 ) Then
! ---             First line of a table carries the variant label.
                  Print 9020, text(k), ndim(i), ndim(i),
     &                        (tiperf(ipoint,j), j=1,4)
               Else
                  Print 9030, ' ', ndim(i), ndim(i),
     &                        (tiperf(ipoint,j), j=1,4)
               End If
            End Do
         End Do
!
         If ( ilib /= 0 ) Print 9040, lname
      End If
! ----------------------------------------------------------------------
!                                               ** Exit network orderly.
      Call MPI_Finalize( info )
! ----------------------------------------------------------------------
!                                                            ** Formats.
 9010 Format( //79('-')/
     &'Generalized matrix-vector multiplication'/
     &4X,'y := alpha*A*x + beta*y   or',
     &3X,'y := alpha*A''*x + beta*y',/
     &'Several variants are measured, No. of procs. = ', i3 /
     &79('-')//
     &'Initial repetition factor used in timings: ',I3//
     &'Variant, i.e. array',18X,'No Transpose',12X,'Transpose'/
     &'A is accessed by',4X,4X,'M',4X,'N',
     &    2(4X,'seconds',4X,'Mflop/s ')/
     &79('-') )
 9020 Format(/A20,2(1X,I4),1P,2(3X,E9.3,2X,E9.3) )
 9030 Format( A20,2(1X,I4),1P,2(3X,E9.3,2X,E9.3) )
 9040 Format( A43 )
! ----------------------------------------------------------------------
      End Program mod2a
