      Program mod2f
! **********************************************************************
! *** This program is part of the EuroBen Benchmark                  ***
! ***                                                                ***
! *** Copyright: EuroBen (European Benchmark Group)                  ***
! *** Distribution by:                                               ***
! ***            National Computer Facilities Foundation             ***
! ***            L.J. Costerstraat 5                                 ***
! ***            6827 AR Arnhem                                      ***
! ***            The Netherlands                                     ***
! ***                                                                ***
! *** Author of this program: Aad van der Steen                      ***
! *** Date Version 1.0        Summer 1996                            ***
! *** Date Version 2.0        Spring 1998                            ***
! *** Date Version 2.1        Autumn 1998                            ***
! *** Date Version 3.0        Autumn 2005                            ***
! *** Date Version 3.1        Summer 2009                            ***
! **********************************************************************
!  ===============
!  MPI Version 3.1
!  ===============
! ----------------------------------------------------------------------
! --- Program 'mod2f' measures the speed of a parallel 1-D
!     complex-to-complex FFT on 'nodes' processors.
!
!     The FFT is factorised into 'nc' 1-D FFTs of length 'nr'.
!     After the first sequence of small FFTs the Real and Imaginary
!     parts of the sequences that are transformed are multiplied
!     with the appropriate twiddle factors.
!     Next the sequences are transposed and 'nr' 1-D FFTs of length
!     'nc' are done to complete the transformation process.
!     For a description of this kind of implementation see:
!
!     R.C. Agarwal, F.G. Gustavson, M. Zubair, "A High Performance
!        Parallel Algorithm for 1-D FFT", Proc. Supercomputing '94,
!        IEEE Press, 1994, 34--40.
!
! --- From version 2.0 on the number of columns 'nc' and the number of
!     rows 'nr' need not be divided exactly by the number of processors
!     anymore.
!     NOTE(review): the driver below still stops when 'nodes' does not
!     divide 'nr' and 'nc' exactly -- confirm which statement is
!     current.
! ----------------------------------------------------------------------
!     Input (file 'mod2f.in', one problem per line, read by all ranks):
!
!     n, nrep -- Problem size handed to 'split' and the number of
!                repetitions of the FFT used for timing (must be >= 1).
!
!     Quantities derived per problem:
!
!     M1, M2  -- Exponents returned by 'split': NR = 2**M1, NC = 2**M2.
!     NR, NC  -- Leading dimensions of the two array layouts:
!                arr1r/arr1i are (NR,MC), arr2r/arr2i are (NC,MR).
!     MR, MC  -- NR/NODES and NC/NODES, the local second dimensions.
!     NL      -- NC*NR, printed as the FFT length N.
!     NODES   -- No. of processors used (from module mpi_module).
!
!     Output (rank 0 only): per problem the length, the maximum over
!     all ranks of the time per repetition, the Mflop/s rate, the time
!     accumulated in 'time2' by 'gtrans' (reported as communication
!     time), its percentage of the total and the value 'ok' returned
!     by 'errchk' (reported as number of errors).
! ----------------------------------------------------------------------
      Use                   mpi_module
      Use                   numerics
      Implicit              None

! --- carr/cari : reference data from 'gendat', copied into arr1r/arr1i
!                 at the start of every repetition.
! --- warr/wari, ur/ui : length-nx work arrays handed to 'cfft4'.
      Real(l_), Allocatable :: arr1r(:,:), arr1i(:,:),
     &                         arr2r(:,:), arr2i(:,:),
     &                         carr(:,:),  cari(:,:),
     &                         warr(:),    wari(:),
     &                         ur(:),      ui(:)
      Real(l_)              :: time1, time2, timx1, timx2
      Real(l_)              :: corr, err, frac, mflops
      Integer               :: i, irep, mc, mr, m1, m2, n, nc, nr,
     &                         nrep, nx
      Integer               :: ml, nl, ok, ios
      Integer(8)            :: mflint, mfltrn
! ----------------------------------------------------------------------
! --- Set up communication.

      Call csetup
      If ( me == 0 ) Then
         Call state( 'mod2f   ' )
         Print 1000, nodes
      End If
! ----------------------------------------------------------------------
! --- Open the input file. It must already exist: without Status='old'
!     a missing file would be created empty and the run would report
!     'Ran OK' without having measured anything.

      Open ( 1, File = 'mod2f.in', Status = 'old', Action = 'read',
     &       IOStat = ios )
      If ( ios /= 0 ) Then
         If ( me == 0 ) Then
            Print *, 'Stop: cannot open input file mod2f.in'
         End If
         Call MPI_Bye
         Stop
      End If
! ----------------------------------------------------------------------
! --- Loop over the problems in the input file until end of file.

      Do
! ----------------------------------------------------------------------
! --- Read problem sizes and compute local array sizes.

         Read ( 1, *, IOStat = ios ) n, nrep
         If ( ios < 0 ) Exit
         If ( ios > 0 ) Then
            If ( me == 0 ) Then
               Print *, 'Stop: error reading n, nrep from mod2f.in'
            End If
            Call MPI_Bye
            Stop
         End If
! ----------------------------------------------------------------------
! --- Without at least one repetition the result arrays would be
!     checked while undefined and the times divided by zero.

         If ( nrep < 1 ) Then
            If ( me == 0 ) Then
               Print *, 'Skip: No. of repetitions (', nrep,
     &                  ') must be >= 1.'
            End If
            Cycle
         End If
         Call split( n, m1, m2 )
         nc = 2**m2
         nr = 2**m1
         nx = Max( nc, nr )
         mr = nr/nodes
         mc = nc/nodes
! ----------------------------------------------------------------------
! --- Check that No. of processes matches the problem size.

         If ( mr == 0 ) Then
            If ( me == 0 ) Then
               Print *, 'Stop: No. of rows in 2-D decomposition ' //
     &                  'of FFT (', nr, ') <  No. of processes (',
     &                  nodes, ').'
               Print *, 'Increase problem size'
            End If
            Call MPI_Bye
            Stop
         End If
         If ( mc == 0 ) Then
            If ( me == 0 ) Then
               Print *, 'Stop: No. of columns in 2-D ' //
     &                  'decomposition of FFT (', nc,
     &                  ') <  No. of processes (', nodes, ').'
               Print *, 'Increase problem size'
            End If
            Call MPI_Bye
            Stop
         End If
! ----------------------------------------------------------------------
! --- Check that the number of processors is a power of 2: nr and nc
!     are powers of 2, so 'nodes' must divide both exactly.

         If ( mr*nodes /= nr .OR. mc*nodes /= nc ) Then
            If ( me == 0 ) Then
               Print *, 'Stop: No. of processors should be a ' //
     &                  'power of 2.'
            End If
            Call MPI_Bye
            Stop
         End If
         ml = m1+m2
         nl = nc*nr
! --- Error tolerance handed to 'errchk'.
         err = ( 10.0_l_*nl*ml*nodes ) * 1.0e-10_l_
         Call sizoff( nr, nc )
         Allocate( carr(nr,mc),  cari(nr,mc),
     &             arr1r(nr,mc), arr1i(nr,mc),
     &             arr2r(nc,mr), arr2i(nc,mr),
     &             warr(nx),     wari(nx),
     &             ur(nx),       ui(nx) )
         Call gendat( carr, cari, nr, mc )
! ----------------------------------------------------------------------
! --- Repeat FFT 'nrep' times for this problem size.
!     'time2' is zeroed here and passed to every 'gtrans' call.

         Call MPI_Barrier( comm, ierr )
         time1 = MPI_Wtime()
         time2 = 0.0_l_
         Do irep = 1, nrep
            arr1r = carr
            arr1i = cari
! ----------------------------------------------------------------------
! --- Do 1st transposition.

            Call gtrans( arr1r, arr1i, arr2r, arr2i, nr, nc, mr, mc,
     &                   time2 )
! ----------------------------------------------------------------------
! --- Do 1st pass of MR NC-length FFTs per processor.
!     The call with first argument 0 precedes the per-column calls
!     with first argument 1 (presumably it sets up warr/wari for
!     this length -- confirm in 'cfft4').

            Call cfft4( 0, m2, ur, ui, arr2r, arr2i, warr, wari )
            Do i = 1, mr
               Call cfft4( 1, m2, ur, ui, arr2r(1,i), arr2i(1,i),
     &                     warr, wari )
            End Do
! ----------------------------------------------------------------------
! --- Multiply with twiddle factors.

            Call twiddle( arr2r, arr2i, nc, mr )
! ----------------------------------------------------------------------
! --- Do 2nd transposition.

            Call gtrans( arr2r, arr2i, arr1r, arr1i, nc, nr, mc, mr,
     &                   time2 )
! ----------------------------------------------------------------------
! --- Do 2nd pass of MC NR-length FFTs per processor.

            Call cfft4( 0, m1, ur, ui, arr1r, arr1i, warr, wari )
            Do i = 1, mc
               Call cfft4( 1, m1, ur, ui, arr1r(1,i), arr1i(1,i),
     &                     warr, wari )
            End Do
! ----------------------------------------------------------------------
! --- Do 3rd transposition.

            Call gtrans( arr1r, arr1i, arr2r, arr2i, nr, nc, mr, mc,
     &                   time2 )
         End Do
         time1 = MPI_Wtime() - time1
! ----------------------------------------------------------------------
! --- Check for errors and correct timing for filling of arrays:
!     the copies done at the top of each repetition are timed
!     separately and subtracted from the measured time.

         Call errchk( arr2r, arr2i, nc, mr, err, ok )
         corr = MPI_Wtime()
         Do irep = 1, nrep
            arr1r = carr
            arr1i = cari
         End Do
         corr = MPI_Wtime() - corr
         time1 = time1 - corr
! ----------------------------------------------------------------------
! --- Execution time is maximum time over all nodes.
!     The reduced values are defined on rank 0 only, so they are
!     used only there.

         Call MPI_Reduce( time1, timx1, 1, rtyp, MPI_Max, 0, comm,
     &                    ierr )
         Call MPI_Reduce( time2, timx2, 1, rtyp, MPI_Max, 0, comm,
     &                    ierr )
         If ( me == 0 ) Then
            time1 = timx1/ Real( nrep, l_ )
            time2 = timx2/ Real( nrep, l_ )
! ----------------------------------------------------------------------
! --- Calculate Mflop rates.

            Call nflops( ml, mflint, mfltrn )
            mflops = 1.0E-6_l_*( mfltrn + mflint ) / time1
            frac = 100.0_l_*( time2/time1 )
            Print 1010, nl, time1, mflops, time2, frac, ok
         End If
         Deallocate( carr, cari, arr1r, arr1i, arr2r, arr2i,
     &               warr, wari, ur, ui )
! ----------------------------------------------------------------------
! --- Get new problem.

      End Do

! ----------------------------------------------------------------------
! --- End of measurements: report results.

      Close ( 1 )
      If ( me == 0 ) Print 1020
      Call MPI_bye
! ----------------------------------------------------------------------
 1000 Format( /'Program mod2f computes FFT: No. of procs. = ', i3/
     &        73('-')/
     &       '                 FFT results, Radix-4 algorithm',/
     &        73('-')/
     &       ' Length |  Total Time  |    Speed     |  Comm. Time  |',
     &       '   Frac. | No. of |'/
     &       '   N =  |     (sec)    |  (Mflop/s)   |     (sec)    |',
     &       '    (%)  | errors |'/
     &        73('-') )
 1010 Format ( i8, '| ', g13.5, '| ', g13.5, '| ',  g13.5, '| ',  f7.3,
     &         ' | ', i6, 1x, '|' )
 1020 Format ( 73('-')/'Ran OK' )
! ----------------------------------------------------------------------
      End Program mod2f
