#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "fundefs.h"

/*----------------------------------------------------------------------
  --- tfqmr does an iterative solve of Ax = b (judging by the name and
      the theta/tau/eta recurrences: a transpose-free QMR iteration),
      where A is in CRS format:
      int    indx[nel],
      int    rowp[n+1], and
      double matvals[nel].
      int    n          : Length of all vectors.
      int    nel        : Number of stored matrix entries.
      int    m          : Degree of the preconditioning polynomial;
                          only passed on to 'lpolyn'.
      double b[n]       : The right-hand side.
      double x[n]       : On input the initial guess of the solution.
                          On convergence 'x' contains the solution.
      double q[n]       : Documented as a (left) preconditioning vector;
                          it is NOT referenced by this routine.
      double gamma[m+1] : Polynomial coefficients used in the
                          preconditioning; only passed on to 'lpolyn'.
      int    maxit      : The maximum number of iterations allowed.
      double tol        : Tolerance used as a stop criterion.
      double *exnrm     : On exit, the norm of the most recently
                          computed (preconditioned) residual.
      void lpolyn(...)  : Name of subroutine performing the
                          preconditioning.
  --- Stop criterion: as soon as the estimate sqrt(im+1)*tau drops below
      'tol', the true residual b - Ax is recomputed and preconditioned;
      the iteration stops when its norm is below tol*||b||.
  --- Break-downs (sigma, tau0, alpha or rho0 equal to zero) and failed
      allocations print a message and abort().
  --- The global counter 'flops' is updated with an operation estimate.
  ----------------------------------------------------------------------*/
void tfqmr( int n, int nel, int m, int *indx, int *rowp, double *matvals,
            double *q, double *x, double *b, double *gamma, int maxit,
            double tol, double *exnrm )
{
   extern long int flops;
   int    i, im, im0, j;
   int    iter = 0;   // -- Stays 0 when maxit < 1 (i-loop never runs).
   double alpha, beta, c, eta, eta0, kappa, rho, rho0, rhstp, sigma,
          tau, tau0, theta, theta0;
   double *d, *g, *h, *p, *r, *rt, *v, *w, *y, *y0, *z;
// ---------------------------------------------------------------------
   (void)q;           // -- Part of the interface, unused here.
   d  = calloc( n, sizeof(double) );
   g  = calloc( n, sizeof(double) );
   h  = calloc( n, sizeof(double) );
   p  = calloc( n, sizeof(double) );
   r  = calloc( n, sizeof(double) );
   rt = calloc( n, sizeof(double) );
   v  = calloc( n, sizeof(double) );
   w  = calloc( n, sizeof(double) );
   y  = calloc( n, sizeof(double) );
   y0 = calloc( n, sizeof(double) );
   z  = calloc( n, sizeof(double) );
// -- calloc( 0, ... ) may legitimately return NULL, so only treat NULL
//    as a failure when work space was actually requested.
   if ( n > 0 && ( !d || !g || !h || !p || !r || !rt || !v || !w ||
                   !y || !y0 || !z ) ) {
      printf( "Stop: out of memory\n" );
      abort();
   }
// -- Initial preconditioned residual: r = lpolyn( b - A*x ).
   spmxv( n, nel, indx, rowp, matvals, x, w );
#pragma omp parallel for
   for( j = 0; j < n; ++j ) {
      z[j] = b[j] - w[j];
   }
   lpolyn( n, nel, m, indx, rowp, matvals, z, r, gamma );
#pragma omp parallel for
   for( j = 0; j < n; ++j ) {
      w[j] = r[j];
      y[j] = r[j];
   }
// -- g = v = lpolyn( A*y ); d starts at zero.
   spmxv( n, nel, indx, rowp, matvals, y, z );
   lpolyn( n, nel, m, indx, rowp, matvals, z, g, gamma );
#pragma omp parallel for
   for( j = 0; j < n; ++j ) {
      v[j] = g[j];
      d[j] = 0.0;
   }
   tau    = sqrt( nrm2( n, r ) );
   *exnrm = tau;      // -- Defined result even when maxit < 1.
   theta  = 0.0;
   eta    = 0.0;
#pragma omp parallel for
   for( j = 0; j < n; ++j ) {
      rt[j] = r[j];
   }
   rho   = nrm2( n, r );// -- We can use 'nrm2' instead of dotpr
                        //    because rt = r.
   rhstp = tol*sqrt( nrm2( n, b ) );
   im0   = 1;
   for( i = 1; i <= maxit; ++i ) {
      iter  = i;
      sigma = dotpr( n, rt, v );
      if ( sigma == 0.0 ) {
         printf( "Stop: sigma = 0.0\n" );
         abort();
      }
      alpha = rho/sigma;
// -- Keep the old y in y0 and advance y; h = lpolyn( A*y ) for new y.
#pragma omp parallel for
      for( j = 0; j < n; ++j ) {
         y0[j] = y[j];
         y[j] -= alpha*v[j];
      }
      spmxv( n, nel, indx, rowp, matvals, y, z );
      lpolyn( n, nel, m, indx, rowp, matvals, z, h, gamma );
// -- Two half-steps per outer iteration: the first uses (y0,g) from
//    before the update, the second uses the new (y,h) copied into
//    (y0,g) at the end of the first pass.
      for( im = im0; im <= im0+1; ++im ) {
#pragma omp parallel for
         for( j = 0; j < n; ++j ) {
            w[j] -= alpha*g[j];
         }
         theta0 = theta;
         tau0   = tau;
         if ( tau0 == 0.0 ) {
            printf( "Stop: tau0 = 0.0\n" );
            abort();
         }
         theta = sqrt( nrm2( n, w ) )/tau;
         c     = 1.0/sqrt( 1.0 + theta*theta );
         tau   = tau0*theta*c;
         eta0  = eta;
         eta   = c*c*alpha;
         if ( alpha == 0.0 ) {
            printf( "Stop: alpha = 0.0\n" );
            abort();
         }
#pragma omp parallel for
         for( j = 0; j < n; ++j ) {
            d[j] = y0[j] + (theta0*theta0*eta0/alpha)*d[j];
            x[j] += eta*d[j];
         }
// -- 'r' is only rewritten by the convergence check below, so this
//    reports the norm of the most recently computed residual. It is
//    kept in place so the amount of work per step is unchanged.
         *exnrm = sqrt( nrm2( n, r ) );
         kappa = sqrt( (double)( im + 1 ) )*tau;
         if ( kappa < tol ) {
// -- The estimate is small enough: verify with the true residual.
            spmxv( n, nel, indx, rowp, matvals, x, p );
#pragma omp parallel for
            for( j = 0; j < n; ++j ) {
               z[j] = b[j] - p[j];
            }
            lpolyn( n, nel, m, indx, rowp, matvals, z, r, gamma );
            *exnrm = sqrt( nrm2( n, r ) );
            if ( *exnrm < rhstp ) goto L10;  // <--- Convergence
            flops += n + 9;
         }
#pragma omp parallel for
         for( j = 0; j < n; ++j ) {
            y0[j] = y[j];
            g[j]  = h[j];
         }
      }                                     // -- End of im-loop
      rho0 = rho;
      rho  = dotpr( n, rt, w );
      if ( rho0 == 0.0 ) {
         printf( "Stop: rho0 = 0.0\n" );
         abort();
      }
      beta = rho/rho0;
#pragma omp parallel for
      for( j = 0; j < n; ++j ) {
         y[j] = w[j] + beta*y0[j];
      }
      spmxv( n, nel, indx, rowp, matvals, y, z );
      lpolyn( n, nel, m, indx, rowp, matvals, z, g, gamma );
#pragma omp parallel for
      for( j = 0; j < n; ++j ) {
         v[j] = g[j] + beta*( h[j] + beta*v[j] );
      }
      im0 += 2;
   }                                        // -- End of i-loop
/*---------------------------------------------------------------------
  -- Normally we would end up here and, with no convergence, issue a
     warning. Because we only want to see the residual value and
     the speed for benchmarking purposes, the following lines are
     deliberately disabled:

    if ( iter >= maxit ) {
       iter = maxit;
       printf( "No convergence in %d iterations;\n", maxit );
       printf( "Norm of residual = %e\n", *exnrm );
       abort();
    }
  --------------------------------------------------------------------- */
L10: flops +=  n + 10 + iter*( 22*n + 79 ); // -- Update # of flops.
     free( d ); free( g ); free( h ); free( p ); free( r ); free( rt );
     free( v ); free( w ); free( y ); free( y0 ); free( z );
//---------------------------------------------------------------------
} //-- End of tfqmr
