#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "fundefs.h"

/***********************************************************************
! *** This program is part of the EuroBen Benchmark                  ***
! ***                                                                ***
! *** Copyright: EuroBen Benchmark Group p/o                         ***
! ***            NCF/HPC Research                                    ***
! ***            L.J. Costerstraat 5                                 ***
! ***            6827 AR Arnhem                                      ***
! ***            The Netherlands                                     ***
! ***                                                                ***
! *** Author of this program: Aad van der Steen                      ***
! *** Date Version 1.0        Winter 2011                            ***
! **********************************************************************
!  ==================
!  OpenMP Version 1.0
!  ==================
! ----------------------------------------------------------------------
! --- Program 'mod2f' measures the speed of a parallel 1-D
!     complex-to-complex FFT on 'nodes' processors.
!
!     The FFT is factorised into 'nc' 1-D FFTs of length 'nr'.
!     After the first sequence of small FFTs the Real and Imaginary
!     parts of the sequences that are transformed are multiplied
!     with the appropriate twiddle factors.
!     Next the sequences are transposed and 'nr' 1-D FFTs of length
!     'nc' are done to complete the transformation process.
!     See for a description of this kind of implementation:
!
!     R.C. Agarwal, F.G. Gustavson, M. Zubair, "A High Performance
!        Parallel Algorithm for 1-D FFT", Proc. Supercomputing '94,
!        IEEE Press, 1994, 34--40.
!
! ----------------------------------------------------------------------
!     Parameters:
!
!     NC*NR   -- Total length of FFT.
!     NC      -- No. FFTs of length NR.
!     NODES   -- No. of processors used.
!     M1      -- No. of rows of the arrays holding the NR-length FFTs.
!     M2      -- No. of rows of the arrays holding the NC-length FFTs.
!
!     NOTE(review): NODES, M1 and M2 do not appear as variables in this
!     function; the thread count is held in 'nprocs' and the matrix
!     dimensions in 'nr' and 'nc'.
! ----------------------------------------------------------------------
!     Driver outline:
!
!     Reads pairs (n, nrep) from the file "mod2f.in". For every pair
!     the problem is split into an nr x nc layout (nr = 2^mr,
!     nc = 2^mc), the FFT pipeline is executed 'nrep' times, the time
!     spent copying the input arrays is subtracted, the result is
!     checked against the tolerance 'err' and the Mflop/s rate is
!     printed. Input lines with a non-positive 'n' or 'nrep' are
!     skipped with a message on stderr.
!
!     Returns EXIT_SUCCESS on normal completion and EXIT_FAILURE when
!     the input file cannot be opened or the work arrays 'ur'/'ui'
!     cannot be allocated.
! ----------------------------------------------------------------------*/
int main()
{
   int      n, nrep;
   int      mflint, mfltrn, ok;
   int      i, j, mr, mc, ml, nr, nc, nx;
   int      nprocs = 1;
   double   **arr1r, **arr1i, **arr2r, **arr2i, **carr, **cari;
   double   *ur, *ui;
   double   corr, err, mflops, time;
   FILE     *inl;
// ------------------------------------------------------------------------
   state( "mod2f" );
// -- Query the team size inside a parallel region; only one thread stores
//    the result so that the shared variable is not written concurrently.
#pragma omp parallel
   {
#pragma omp single
      nprocs = omp_get_num_threads();
   }
   printf( "No. of proc.s = %d\n\n", nprocs );
   prthead();
   inl = fopen( "mod2f.in", "r" );
   if( inl == NULL ) {
      perror( "mod2f: cannot open mod2f.in" );
      return EXIT_FAILURE;
   }
// -- Require both fields to be converted: comparing against EOF only would
//    loop forever on a malformed line, because fscanf then returns 0.
   while( fscanf( inl, "%d%d\n", &n, &nrep ) == 2 ) {
      if( n < 1 || nrep < 1 ) {
         fprintf( stderr,
                  "mod2f: skipping invalid input (n = %d, nrep = %d)\n",
                  n, nrep );
         continue;
      }
      split( n, &mr, &mc );
      nr  = 1 << mr;                       // --- Exact 2^mr, no pow() rounding.
      nc  = 1 << mc;                       // --- Exact 2^mc.
      nx  = max( nr, nc );
      ml  = mr + mc;
      err = ( 10.0*n*ml )*1.0e-10;         // --- Allowed error tolerance.
      carr  = makmat( nr, nc ); cari  = makmat( nr, nc );
      arr1r = makmat( nr, nc ); arr1i = makmat( nr, nc );
      arr2r = makmat( nc, nr ); arr2i = makmat( nc, nr );
      ur    = calloc( nx, sizeof *ur );
      ui    = calloc( nx, sizeof *ui );
      if( ur == NULL || ui == NULL ) {
         fprintf( stderr, "mod2f: out of memory for n = %d\n", n );
         free( ui ); free( ur );
         fclose( inl );
         return EXIT_FAILURE;             // --- Remaining storage is reclaimed at exit.
      }
// -------------------------------------------------------------------------
// -- Generate data.

      gendat( nr, nc, carr, cari );
// -------------------------------------------------------------------------
// -- Repeat FFT 'nrep' times for this problem size.
//    'i' counts the repetitions only; the FFT loops use their own index
//    'j' so that the repetition count is also correct when the OpenMP
//    pragmas are ignored (serial build).

      time = cclock();
      for( i = 0; i < nrep; i++ ) {
         cp_arr2d( nr, nc, carr, arr1r );
         cp_arr2d( nr, nc, cari, arr1i );
//---------------------------------------------------------------------------
//-- Do 1st transposition.
         ltrans( nr, nc, arr1r, arr1i, arr2r, arr2i );

//-- The call with first argument 0 precedes the parallel loop; presumably
//   it initialises 'ur'/'ui' for the transforms done with argument 1
//   (confirm against cfft4).
         cfft4( 0, mr, ur, ui, arr2r[0], arr2i[0] );
#pragma omp parallel for private(j)
         for( j = 0; j < nc; j++ ) {
            cfft4( 1, mr, ur, ui, arr2r[j], arr2i[j] );
         }
//---------------------------------------------------------------------------
//-- Multiply with twiddle factors.

         twiddle( nc, nr, arr2r, arr2i );
//---------------------------------------------------------------------------
//-- Do 2nd transposition.

         ltrans( nc, nr, arr2r, arr2i, arr1r, arr1i );
         cfft4( 0, mc, ur, ui, arr1r[0], arr1i[0] );
#pragma omp parallel for private(j)
         for( j = 0; j < nr; j++ ) {
            cfft4( 1, mc, ur, ui, arr1r[j], arr1i[j] );
         }
//---------------------------------------------------------------------------
//-- Do 3rd transposition.

         ltrans( nr, nc, arr1r, arr1i, arr2r, arr2i );
      }
      time = cclock() - time;
// -------------------------------------------------------------------------
// -- Check for errors and correct timing for filling of arrays.
//    The correction loop repeats exactly the copies done in the timed loop
//    (nr x nc elements each), so that their cost can be subtracted.

      corr = cclock();
      for( i = 0; i < nrep; i++ ) {
         cp_arr2d( nr, nc, carr, arr1r );
         cp_arr2d( nr, nc, cari, arr1i );
      }
      corr = cclock() - corr;
      time = ( time - corr )/(double)nrep;
      ok   = check( nc, nr, arr2r, arr2i, err );
// -------------------------------------------------------------------------
// -- Calculate Mflop rates.
//    The two counts are added in double precision to avoid int overflow.

      nflops( ml, &mflint, &mfltrn );
      mflops = 1.0e-6*( (double)mflint + (double)mfltrn )/time;
      prtspeed( n, time, mflops, ok );
      free( ui ); free( ur );
      delmat( nc, arr2i ); delmat( nc, arr2r );
      delmat( nr, arr1i ); delmat( nr, arr1r );
      delmat( nr, cari ) ; delmat( nr, carr );
   }
   fclose( inl );
   printf( "-----------------------------------------------\nRan OK\n" );
   return EXIT_SUCCESS;
}
