#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "fundefs.h"

void rgmres( int n, int nel, int m, int *indx, int *rowp, double *matvals,
             double *q, double *x, double *b, double *gamma, int maxit,
             double tol, double *exnrm )
/*----------------------------------------------------------------------
  --- rgmres does an iterative (restarted, GMRES-style) solve of Ax = b,
      where A is in CRS format:
      int    indx[nel],
      int    rowp[n+1], and
      double matvals[nel].
      double b[n]       : The right-hand side.
      double x[n]       : On input the initial guess of the solution.
                          On convergence 'x' contains the solution.
      double q[n]       : Documented as a (left) preconditioning vector;
                          it is not referenced by this routine.
      double gamma[m+1] : Polynomial coefficients used in the
                          preconditioning (handed on to lpolyn).
      int  maxit        : The maximum number of outer (restart)
                          iterations allowed; each one performs at most
                          'ib' inner steps.
  --- double tol        : Relative tolerance used as a stop criterion:
                          the iteration stops as soon as the residual
                          estimate drops below tol*||b||.
      double exnrm      : Contains the last residual estimate on exit
                          (left untouched when maxit < 1).

      The routine calls abort() when the residual norm 'beta' or a
      sub-diagonal entry rm[j+1][j] is exactly zero, and when workspace
      cannot be allocated.
  ----------------------------------------------------------------------*/
{
   extern long int flops;
   enum { ib = 10, ir = 50 };        // -- Restart length, max. size of rm.
   int       i, iter = 0, j, jc = 0, k, k0, k1, l;
   int       conv = 0;
   double    g[ir+2], rho[ir], **rm;
   double    alpha, beta, eta, rhstp, tau1, tau2;
   double    *r, *v, *w, *z;
   size_t const len = ( n > 0 ) ? (size_t) n : 1;
// ---------------------------------------------------------------------
// -- The basis 'v' holds ib+1 vectors of length n: the start vector plus
//    one new vector per inner step. It lives on the heap because ib*n
//    doubles can easily exceed the stack size.
   r  = calloc( len, sizeof *r );
   w  = calloc( len, sizeof *w );
   z  = calloc( len, sizeof *z );
   v  = calloc( (size_t)( ib+1 )*len, sizeof *v );
   rm = makmat( ir+1, ir+1 );
   if ( r == NULL || w == NULL || z == NULL || v == NULL || rm == NULL ) {
      fprintf( stderr, "Stop: rgmres could not allocate workspace\n" );
      abort();
   }

// -- Stop threshold: tol relative to the norm of the right-hand side.
//    (nrm2 is wrapped in sqrt() throughout, so it is presumably the sum
//    of squares.)
   rhstp = tol*sqrt( nrm2( n, b ) );

// -- Initial residual r = lpolyn( b - A*x ).
   spmxv( n, nel, indx, rowp, matvals, x, w );
#pragma omp parallel for
   for( l = 0; l < n; ++l ) {
      z[l] = b[l] - w[l];
   }
   lpolyn( n, nel, m, indx, rowp, matvals, z, r, gamma );
   beta = sqrt( nrm2( n, r ) );

   for( i = 1; i <= maxit; ++i ) {
      iter = i;
//    -- g[j+1] always carries the running residual estimate; it is
//       pre-seeded with beta so that the update below can scale it.
      g[0] = beta;
      g[1] = beta;
      if ( beta == 0.0 ) {
         printf( "Stop: beta = 0.0\n" );
         abort();
      }
//    -- First basis vector: v_0 = r/beta.
#pragma omp parallel for
      for( l = 0; l < n; ++l ) {
         v[l] = r[l]/beta;
      }
      k0 = 0;                                  // -- Offset of v_j in v.
      for( j = 0; j < ib; ++j ) {
         jc = j;
//       -- z = lpolyn( A*v_j ).
         spmxv( n, nel, indx, rowp, matvals, v+k0, w );
         lpolyn( n, nel, m, indx, rowp, matvals, w, z, gamma );

//       -- Column j of rm: projections of z on v_0 .. v_j.
         k1 = 0;
         for( k = 0; k <= j; ++k ) {
            rm[k][j] = dotpr( n, v+k1, z );
            k1 += n;
         }

//       -- w = z - sum_k rm[k][j]*v_k.
         k1 = 0;
         setval( n, w, 0.0 );
         for( k = 0; k <= j; ++k ) {
#pragma omp parallel for
            for( l = 0; l < n; ++l ) {
               w[l] += rm[k][j]*v[k1+l];
            }
            k1    += n;
            flops += 2*n;
         }
#pragma omp parallel for
         for( l = 0; l < n; ++l ) {
            w[l] = z[l] - w[l];
         }
         rm[j+1][j] = sqrt( nrm2( n, w ) );
         if ( rm[j+1][j] == 0.0 ) {
            printf( "Stop: rm[j+1][j] = 0\n" );
            abort();
         }

//       -- Store the normalised vector as v_{j+1}. The offset must be
//          advanced BEFORE the store: v_0 .. v_j are still needed for
//          the later projections and for the update of x.
         k0 += n;
#pragma omp parallel for
         for( l = 0; l < n; ++l ) {
            v[k0+l] = w[l]/rm[j+1][j];
         }

//       -- Apply the rotations of the previous steps to column j.
         for( k = 0; k < j; ++k ) {
            rotf( rho[k], &alpha, &eta );
            tau1 = rm[k][j];
            tau2 = rm[k+1][j];
            rm[k][j]   = alpha*tau1 - eta*tau2;
            rm[k+1][j] = eta*tau1   + alpha*tau2;
            flops += 6;
         }

//       -- New rotation for rows j and j+1; rho[j] keeps it in the
//          encoded form that rotf reads back above.
         givens( rm[j][j], rm[j+1][j], &alpha, &eta );
         tau1 = rm[j][j];
         tau2 = rm[j+1][j];
         rm[j][j]   = alpha*tau1 - eta*tau2;
         rm[j+1][j] = eta*tau1   + alpha*tau2;
         flops += 6;
         rho[j] = rotb( alpha, eta );
         g[j]   = g[j]*alpha;
         g[j+1] = g[j+1]*eta;
         g[j+2] = g[j+1];

//    -- Test for convergence.

         *exnrm = fabs( g[j+1] );
         conv   = ( *exnrm < rhstp ) ? 1 : 0;
         if ( conv ) break;
      }                                             // <--- End of j-loop

//    -- Solve the small system held in rm/g and update x with the
//       basis vectors v_0 .. v_jc.
      lsqslv( jc, ir+2, rm, g );
      k1 = 0;
      for( k = 0; k <= jc; ++k ) {
#pragma omp parallel for
         for( l = 0; l < n; ++l ) {
            x[l] += g[k]*v[k1+l];
         }
         k1    += n;
         flops += 2*n;
      }
      if ( conv ) break;              // -- Converged: go to the cleanup.

//    -- Restart: recompute r = lpolyn( b - A*x ) and its norm.
      spmxv( n, nel, indx, rowp, matvals, x, w );
#pragma omp parallel for
      for( l = 0; l < n; ++l ) {
         z[l] = b[l] - w[l];
      }
      lpolyn( n, nel, m, indx, rowp, matvals, z, r, gamma );
      beta = sqrt( nrm2( n, r ) );
   }                                               // <--- End of i-loop
/*----------------------------------------------------------------------
  --- Without convergence we would normally issue a warning here.
      Because we only want to see the residual value and the speed for
      benchmarking purposes, the following lines are commented out:

      if ( iter >= maxit ) {
           iter = maxit;
           printf( "No convergence in %d iterations\n", maxit );
           printf( "Norm of residual = %11.5g\n", *exnrm );
           abort();
      }
  -----------------------------------------------------------------------*/
// -- Single exit: every buffer, including rm, is released on both the
//    converged and the non-converged path.
   delmat( ir+1, rm );
   if ( !conv ) {
//    -- NOTE(review): as before, the remaining flops are only counted
//       when the iteration limit is reached; confirm whether the
//       converged path should add them too.
      flops += n + 9 + (long) iter*( 2L*n + 9 );     // -- Rest of flops.
   }
   free( r ); free( v ); free( w ); free( z );
} // -- End of rgmres
