      Program mod1b
! ----------------------------------------------------------------------
! **********************************************************************
! *** This program is part of the EuroBen Benchmark                  ***
! *** Copyright: EuroBen Group p/o                                   ***
! ***            Utrecht University, Computational Physics Dept.     ***
! ***            P.O. Box 80.000                                     ***
! ***            3508 TA Utrecht                                     ***
! ***            The Netherlands                                     ***
! ***                                                                ***
! *** Authors of this program: Peter P.M. de Rijk and                ***
! ***                          Aad J. van der Steen                  ***
! *** Contributed:             Autumn 1990                           ***
! *** Last change:             Autumn 2003                           ***
! **********************************************************************
!  Version 4.2
!
! --- Purpose of this module
! --------------------------
!  This program times some kernels for some vector lengths and
!  calculates their Mflop rate.
!
!  For every kernel record in 'mod1b.krn' and every problem-size
!  record in 'mod1b.in', routine TIMER is called once and one table
!  row is printed: vector length, the time returned by TIMER, the
!  Mflop/s rate, the transfer rate, the flops per clock cycle and the
!  repeat count handed to TIMER.
!
!  Input files (all list-directed):
!    mod1b.in  : one record per problem size : n, nfixed
!    mod1b.krn : one record per kernel       : symbol, nflop, ntrans, kn
!    mod1b.cyc : clock frequency of the processor (in Hz.)
!
!  Derived quantities, with t = Max( time, timelb ):
!    Mflop/s        = nflop  * 1.0e-6 * n / t
!    transfer rate  = ntrans * 1.0e-9 * n / t
!    flop per cycle = Mflop/s * 1.0e6 / cycle
!
!  NOTE(review): the formulas treat the time returned by TIMER as the
!  time of one pass of the kernel over n elements, and the table
!  labels the transfer rate 'GB/s', which suggests ntrans is a byte
!  count per element -- confirm both against TIMER and mod1b.krn.
!
!  NOTE(review): earlier versions of this header stated that n(1/2)
!  and r(infi) are estimated by a linear least squares fit on
!  f(n) = alfa + beta*n.  No such fit is performed in this program
!  unit.
!
!-----------------------------------------------------------------------
      Use               numerics
      Implicit          None

! --- Universal Constants
      Real(l_), Parameter :: zero = 0.0_l_, half = 0.5_l_,
     &                       one = 1.0_l_, two = 2.0_l_,
     &                       oneneg =-1.0_l_

! --- Variables used for measurements
      Real(l_), Allocatable :: x1(:), x2(:), x3(:), x4(:), y(:,:)

! --- Lower bound of the time accepted as divisor in the rates; it
!     keeps the rate computations finite when TIMER returns 0.
      Real(l_), Parameter :: timelb=1.0e-12_l_

! --- Parameters used in the Repetition Factor for the timings
      Integer               :: nfixed, mrep  != 10 * nfixed

! --- Variables used to prevent overoptimisation in timings
      Integer,  Allocatable :: indj(:)
      Integer               :: ndummy

! --- Number of kernels to be examined
      Integer               :: nkern

! --- Character variables
      Character(Len=60)     :: symbol

! --- Stuff to be calculated
      Integer               :: nflop, ntrans
      Real(l_)              :: time, rmflop, rtrans, cycle, fpc

! --- Counters and other stuff
      Integer               :: i, k, kn, n, nrep, ios
      Real(l_)              :: tbegin, tend

! --- Introduce Common block to frustrate overoptimisation.
!     It is not referenced in this program unit itself.
      Real(l_)              :: s(2), ssw
      Common /cfake/           s, ssw

! --- External functions
      Real(l_)              :: wclock, dran1

! --- Initialization
! ----------------------------------------------------------------------
! --- Call module identification routine
      Call state ( 'mod1b   ' )
! ----------------------------------------------------------------------
      tbegin = wclock()
      Print 9010
! --- The input files must already exist: Status = 'old' makes a
!     missing file fail at the Open instead of being silently created
!     as an empty file.
      Open( 1, File = 'mod1b.in',  Status = 'old', Action = 'read' )
      Open( 2, File = 'mod1b.krn', Status = 'old', Action = 'read' )
      Open( 3, File = 'mod1b.cyc', Status = 'old', Action = 'read' )

! --- Get clock frequency of processor (in Hz.).
      Read( 3, * ) cycle
      Close( 3 )
      If( cycle <= zero ) Print 9070

! --- Get number of kernels: count the records up to end-of-file,
!     then reposition the file for the actual pass.
      nkern = 0
      Do
         Read( 2, *, IOStat = ios ) symbol, nflop
         If( ios < 0 ) Exit
         If( ios > 0 ) Stop 'mod1b: error reading mod1b.krn'
         nkern = nkern + 1
      End Do
      Rewind ( 2 )

      kernels: Do k = 1, nkern
         Read ( 2, * ) symbol, nflop, ntrans, kn
         Print 9030, kn, symbol, nflop
! --- A kernel with zero flops is rated as if it did one flop per
!     iteration (the header above still shows the value as read).
         If(  nflop == 0  ) nflop = 1

         sizes: Do
! --- Get problem size and repetition factor.
            Read( 1, *, IOStat = ios ) n, nfixed
            If( ios < 0 ) Exit sizes
            If( ios > 0 ) Stop 'mod1b: error reading mod1b.in'
! --- n = 0 would divide by zero below; non-positive values would
!     give empty work or index arrays.  Skip such records.
            If( n < 1 .OR. nfixed < 1 ) Then
               Print 9060, n, nfixed
               Cycle sizes
            End If
            mrep = 10*nfixed

! --- Initialize index vector to be used against overoptimization.
!     The seed argument is reset for every problem size.
            ndummy = -1999
            Allocate ( indj(mrep) )
            Do i = 1, mrep
! --- Redraw until the index lies in 1..mrep (just to be sure ...)
               Do
                  indj(i) = Int( dran1( ndummy ) * mrep ) + 1
                  If( indj(i) >= 1 .AND. indj(i) <= mrep ) Exit
               End Do
            End Do

! --- Repeat count: 100 * (nfixed/n) with integer division, capped
!     at mrep, but never less than 10.
            nrep = Max( 10, Min( (nfixed/n) * 100, mrep ) )

! --- Allocate and initialise the operand vectors (length 4*n).
            Allocate( x1(4*n), x2(4*n), x3(4*n), x4(4*n), y(4*n,2) )
            x1 = half
            x2 = oneneg
            x3 = two
            x4 = half
            y  = one

! --- Time vector operation
            Call timer( kn, n, x1, x2, x3, x4, y, 4*n, indj, mrep/2,
     &                  nrep, time )

! --- Average MFlop/s and transfer rate
            rmflop = Real(nflop, l_)*1.0e-6_l_*Real(n, l_) /
     &               Max( time, timelb )
            rtrans = ntrans*1.0e-9_l_*Real(n, l_)/Max( time, timelb )

! --- Flops per clock cycle: rmflop is in Mflop/s and cycle in Hz,
!     so the rate is scaled back to flop/s with 1.0e6 first.
            If( cycle > zero ) Then
               fpc = rmflop*1.0e6_l_/cycle
            Else
               fpc = zero
            End If

            Print 9040, n, time, rmflop, rtrans, fpc, nrep
            Deallocate( x1, x2, x3, x4, y, indj )
         End Do sizes

! --- Close the table and reposition the size file for next kernel.
         Print 9020
         Rewind( 1 )
      End Do kernels

      tend = wclock()
      Print 9050, tend - tbegin
      Close( 1 )
      Close( 2 )
! ----------------------------------------------------------------------
 9010 Format(
     &' ------------------------------------------------------------'/
     &' |      Performance measurement of some basic kernels       |'/
     &' | ======================================================== |'/
     &' | A measuring method is used that evades overoptimisation  |'/
     &' | of the repetition loop that is used for better timing    |'/
     &' | accuracy by calling the function JSWITCH.  The call      |'/
     &' | of this funtion depends on the iteration counter of      |'/
     &' | the repetition loop.                                     |'/
     &' ------------------------------------------------------------' )
 9020 Format( 1x, 72('-') )
 9030 Format( //,1x, 72('=')/1x, 'Kernel', i3, ': ', a/
     &12x,'No. of flops per Iteration =', i3/
     &1x, 72('-')/
     &1x, '  Loop  ','|   ', 'CPU Time', 2x,'|', 2x, 'Average', 4x, '|',
     &'  Average', 4x, '|', ' Flop per ', '|', 2x, 'Repeat |'/
     &1x, ' length ','|   ', '  sec   ', 2x,'|', 2x, 'Mflop/s', 4x, '|',
     &'    GB/s ', 4x, '|', '   cycle  ', '|',2x, ' count |'/
     &1x, 72('-') )
 9040 Format( 1x, i8, '|', 1pg13.5, '|', g13.5, '|', g13.5, '| ',
     &        f8.4, ' |', i8, ' |' )
 9050 Format( //1x, 'Total execution time:  ',g12.5, ' sec.' )
 9060 Format( 1x, 'Skipped invalid input record: n =', i10,
     &        ', nfixed =', i10 )
 9070 Format( 1x, 'Warning: clock frequency in mod1b.cyc is not',
     &        ' positive; flop per cycle is reported as 0.' )
! ----------------------------------------------------------------------
      End Program mod1b
