      Program mod2f
! **********************************************************************
! *** This program is part of the EuroBen Benchmark                  ***
! ***                                                                ***
! *** Copyright: EuroBen Benchmark Group p/o                         ***
! ***            Academic Computing Centre Utrecht                   ***
! ***            P.O. Box 80.011                                     ***
! ***            3508 TA Utrecht                                     ***
! ***            The Netherlands                                     ***
! ***                                                                ***
! *** Author of this program: Aad van der Steen                      ***
! *** Date Version 1.0        Summer 1996                            ***
! *** Date Version 2.0        Spring 1998                            ***
! *** Date Version 2.1        Autumn 1998                            ***
! *** Date Version 3.0        Autumn 2005                            ***
! **********************************************************************
!  ===============
!  MPI Version 3.0
!  ===============
!  NOTE(review): the banner above says MPI, but the only parallel
!  construct in this source is an OpenMP region that queries the
!  thread count; presumably this is the shared-memory variant of the
!  benchmark -- confirm.
! ----------------------------------------------------------------------
! --- Program 'mod2f' measures the speed of a 1-D complex-to-complex
!     FFT of total length nc*nr and reports time and Mflop/s for every
!     problem size listed in the input file 'mod2f.in'.
!
!     The data are held as an nr x nc array (real and imaginary parts
!     in separate arrays). As coded below, one repetition consists of:
!       1. transposition into an nc x nr array,
!       2. 'nr' column transforms of length 'nc',
!       3. multiplication with the twiddle factors,
!       4. transposition back into an nr x nc array,
!       5. 'nc' column transforms of length 'nr',
!       6. a final transposition of the result into the nc x nr array.
!     See for a description of this kind of implementation:
!
!     R.C. Agarwal, F.G. Gustavson, M. Zubair, "A High Performance
!        Parallel Algorithm for 1-D FFT", Proc. Supercomputing '94,
!        IEEE Press, 1994, 34--40.
!
! --- Input: each record of 'mod2f.in' holds 'n' and 'nrep'. 'n' is
!     handed to 'split', which returns the exponents 'm1' and 'm2';
!     'nrep' is the number of timed repetitions. Records with
!     nrep < 1 are skipped. Reading stops at end of file.
! ----------------------------------------------------------------------
!     Parameters:
!
!     NC*NR   -- Total length of FFT (nl).
!     NC      -- 2**M2, number of columns of the nr x nc work arrays.
!     NR      -- 2**M1, number of rows of the nr x nc work arrays.
!     NPROCS  -- No. of OpenMP threads reported in the output header.
!     M1      -- Exponent returned by 'split'; nr = 2**m1.
!     M2      -- Exponent returned by 'split'; nc = 2**m2.
! ----------------------------------------------------------------------
      Use                   numerics
      Implicit              None

      Integer, Parameter    :: inunit = 1
      Real(l_), Allocatable :: arr1r(:,:), arr1i(:,:),
     &                         arr2r(:,:), arr2i(:,:),
     &                         carr(:,:),  cari(:,:),
     &                         warr(:),    wari(:),
     &                         ur(:),      ui(:)
      Real(l_)              :: time1
      Integer               :: ml, nl, ok
      Real(l_)              :: corr, err, wclock
      Real(l_)              :: mflops
      Integer               :: i, ios, irep, m1, m2, mflint, mfltrn,
     &                         n,  nc, nr, nrep, nx
      Integer               :: nprocs, omp_get_num_threads
! ----------------------------------------------------------------------
! --- Identify the program and determine the number of OpenMP threads.
!     Only one thread stores the result: letting every thread of the
!     team write the shared variable would be a data race.

      Call state( 'mod2f   ' )
!$omp parallel shared(nprocs)
!$omp single
      nprocs = omp_get_num_threads()
!$omp end single
!$omp end parallel
      Print 1000, nprocs
! ----------------------------------------------------------------------
! --- Open the input file. It must already exist: without
!     Status = 'old' a missing file would silently be created empty
!     and the run would report success without measuring anything.

      Open ( inunit, File = 'mod2f.in', Status = 'old',
     &       Action = 'read', IoStat = ios )
      If ( ios /= 0 ) Then
         Print '(a)', 'mod2f: cannot open input file mod2f.in'
         Stop 1
      End If
! ----------------------------------------------------------------------
! --- Loop over the problem sizes until end of file (ios < 0) or a
!     read error (ios > 0).

      Do
         Read ( inunit, *, IoStat = ios ) n, nrep
         If ( ios /= 0 ) Exit
!
! ---    Without at least one repetition there is nothing to time and
!        the average below would divide by zero.
         If ( nrep < 1 ) Then
            Print '(a)', 'mod2f: skipping case with nrep < 1'
            Cycle
         End If
         Call split( n, m1, m2 )
         nc = 2**m2
         nr = 2**m1
! ----------------------------------------------------------------------
! ---    Derived sizes: 'ml' = m1 + m2, 'nl' = total length, 'nx' =
!        length of the 1-D work arrays (large enough for either pass).
!        'err' is the tolerance handed to 'check'.

         ml = m1+m2
         nl = nc*nr
         nx = Max( nc, nr )
         err = ( 10.0_l_*nl*ml ) * 1.0e-10_l_
         Allocate( carr(nr,nc),  cari(nr,nc),
     &             arr1r(nr,nc), arr1i(nr,nc),
     &             arr2r(nc,nr), arr2i(nc,nr),
     &             warr(nx),     wari(nx),
     &             ur(nx),       ui(nx) )
         Call gendat( carr, cari, nr, nc )
! ----------------------------------------------------------------------
! ---    Repeat FFT 'nrep' times for this problem size. Every
!        repetition starts from a fresh copy of the generated data.

         time1 = wclock()
         Do irep = 1, nrep
            arr1r = carr
            arr1i = cari
            Call ltrans( nr, nc, arr1r, arr1i, arr2r, arr2i )
! ----------------------------------------------------------------------
! ---       First pass over the 'nr' columns of the nc x nr arrays.
!           The preceding call with first argument 0 is presumably the
!           set-up of the tables in warr/wari -- confirm in 'cfft4'.

            Call cfft4( 0, m2, ur, ui, arr2r, arr2i, warr, wari )
            Do i = 1, nr
               Call cfft4( 1, m2, ur, ui, arr2r(1,i), arr2i(1,i),
     &                     warr, wari )
            End Do
! ----------------------------------------------------------------------
! ---       Multiply with twiddle factors.

            Call twiddle( arr2r, arr2i, nc, nr )
! ----------------------------------------------------------------------
! ---       Do transposition back into the nr x nc arrays.

            Call ltrans( nc, nr, arr2r, arr2i, arr1r, arr1i )
! ----------------------------------------------------------------------
! ---       Second pass over the 'nc' columns of the nr x nc arrays,
!           again preceded by a call with first argument 0.

            Call cfft4( 0, m1, ur, ui, arr1r, arr1i, warr, wari )
            Do i = 1, nc
               Call cfft4( 1, m1, ur, ui, arr1r(1,i), arr1i(1,i),
     &                     warr, wari )
            End Do
! ----------------------------------------------------------------------
! ---       Final transposition: the result that is checked below
!           lives in arr2r/arr2i.

            Call ltrans( nr, nc, arr1r, arr1i, arr2r, arr2i )
         End Do
         time1 = wclock() - time1
! ----------------------------------------------------------------------
! ---    Correct the timing for the filling of the arrays: time the
!        same 'nrep' copies on their own and subtract them, then
!        average over the repetitions.

         corr = wclock()
         Do irep = 1, nrep
            arr1r = carr
            arr1i = cari
         End Do
         corr = wclock() - corr
         time1 = time1 - corr
         time1 = time1/Real( nrep, l_ )
! ----------------------------------------------------------------------
! ---    Check for errors; 'ok' is printed in the "No. of Errors"
!        column of the results table.

         ok = 0
         Call check( arr2r, arr2i, nc, nr, err, ok )
! ----------------------------------------------------------------------
! ---    Calculate Mflop rates. The two operation counts are converted
!        to real before they are added so that their sum cannot
!        overflow the default integer range. If the corrected time is
!        not positive (timer resolution too coarse for this case) no
!        meaningful rate exists and 0 is reported instead of an
!        infinite or negative value.

         Call nflops( ml, mflint, mfltrn )
         If ( time1 > 0.0_l_ ) Then
            mflops = 1.0E-6_l_*( Real( mfltrn, l_ )
     &                         + Real( mflint, l_ ) )/time1
         Else
            mflops = 0.0_l_
         End If
         Print 1010, nl, time1, mflops, ok
         Deallocate( carr, cari, arr1r, arr1i, arr2r, arr2i,
     &               warr, wari, ur, ui )
! ----------------------------------------------------------------------
! ---    Get new problem.

      End Do
! ----------------------------------------------------------------------
! --- End of measurements. A positive 'ios' means the loop was left
!     because of a read error rather than end of file; do not claim
!     success in that case.

      Close ( inunit )
      If ( ios > 0 ) Then
         Print '(a)', 'mod2f: error while reading mod2f.in'
         Stop 1
      End If
      Print 1020
! ----------------------------------------------------------------------
 1000 Format( /'Program mod2f computes 1-D, complex-to-complex FFTs:'/
     &         'No. of procs. = ', i3                                /
     &         '------------------------------------------------'/
     &         '         FFT results, Radix-4 algorithm '/
     &         '------------------------------------------------'/
     &         ' Length |  Total Time  |    Speed     | No. of |'/
     &         '   N =  |     (sec)    |  (Mflop/s)   | Errors |'/
     &         '------------------------------------------------' )
 1010 Format ( i8, '| ', g13.5, '| ', g13.5, '| ', i6, ' |' )
 1020 Format ( '------------------------------------------------'/
     &         'Ran OK' )
! ----------------------------------------------------------------------
      End Program mod2f
