#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Allocate `count` zero-initialised doubles. At least one element is
// requested so that a zero-length request still yields a valid pointer.
// Terminates the program with a message when the allocation fails.
static double *vec_zero( size_t count )
{
   double *v = calloc( count ? count : 1, sizeof *v );

   if( v == NULL ) {
      fprintf( stderr, "In timing routine: cannot allocate %zu doubles\n",
               count );
      exit( EXIT_FAILURE );
   }
   return v;
}

// Allocate `count` doubles and set every element to `value`.
// (memset cannot be used for this: it fills bytes, not doubles.)
static double *vec_fill( size_t count, double value )
{
   double *v = vec_zero( count );
   size_t  i;

   for( i = 0; i < count; i++ ) {
      v[i] = value;
   }
   return v;
}

// Allocate `count` zero-initialised ints for an index vector.
// Terminates the program with a message when the allocation fails.
static int *idx_zero( size_t count )
{
   int *v = calloc( count ? count : 1, sizeof *v );

   if( v == NULL ) {
      fprintf( stderr, "In timing routine: cannot allocate %zu ints\n",
               count );
      exit( EXIT_FAILURE );
   }
   return v;
}

// ---------------------------------------------------------------------
// Time one of 31 vector kernels.
//
//    kernno : kernel selector, 1..31 (see the case labels below).
//    length : number of loop iterations per sweep; the strided kernels
//             allocate 3 or 4 times as many elements.
//    repeat : number of sweeps that are timed.
//
// Returns the cclock() interval spent in `repeat` sweeps, minus the
// cclock() interval of `repeat` bare calls to dummy(), so that the call
// overhead is removed from the result. The unit is whatever cclock()
// returns (presumably seconds -- confirm against its definition).
// For an unknown kernel number a message is printed and 0.0 is returned.
// The program is terminated if a work array cannot be allocated.
// ---------------------------------------------------------------------
double timer( int kernno, int length, int repeat )
{
   int          j, k;
   int          *indx = NULL;
   double       *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL;
   double       s0 = 0.0;
   double       corr, overhead, elapsed = 0.0;
   // Element count for allocations; a negative length is treated as 0.
   size_t const n = length > 0 ? (size_t)length : 0;
   double const c0 = 0.1, c1 = 0.2, c2 = 0.5, c3 = 0.8, c4 = 0.9,
                c5 = 1.0, c6 = 1.1, c7 = 1.2, c8 = 0.4, c9 = 1.5;

   double       cclock( void );
   // NOTE(review): the two prototypes below are inferred from the call
   // sites in this function (no header declares them here); confirm
   // against the actual definitions. Their results, if any, are unused.
   void         dummy( double *, double *, double, int );
   void         permut( int, int * );
// ---------------------------------------------------------------------
   switch( kernno ) {
      case 1 :
         // --- Broadcast.
         a = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = c1;
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 2 :
         // --- Copy.
         a = vec_fill( n, c0 );
         b = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               b[j] = a[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 3 :
         // --- Addition.
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         c = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               c[j] = a[j] + b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 4 :
         // --- Subtraction.
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         c = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               c[j] = a[j] - b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 5 :
         // --- Multiplication.
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         c = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               c[j] = a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 6 :
         // --- Division.
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         c = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               c[j] = a[j]/b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 7 :
         // --- Dotproduct.
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            s0 = 0.0;
#pragma omp parallel for reduction(+:s0)
            for( j = 0; j < length; j++ ) {
               s0 = s0 + a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 8 :
         // --- DAXPY: x[j] = x[j] + const*y[j].
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 9 :
         // --- General vector update: y[j] = x1[j] + c3*x2[j].
         a = vec_fill( n, c0 );
         b = vec_fill( n, c8 );
         c = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               c[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 10 :
         // --- Flat rotation: y[j] = x1[j]*x2[j] + x3[j]*x4[j]
         a = vec_fill( n, c1 );
         b = vec_fill( n, c2 );
         c = vec_fill( n, c3 );
         d = vec_fill( n, c4 );
         e = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               e[j] = a[j]*c[j] + b[j]*d[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 11 :
         // --- 1st order recursion.
         // b[j] depends on b[j-1], so the loop must run sequentially;
         // an "omp parallel for" here would be a data race.
         a = vec_fill( n + 1, c1 );
         b = vec_fill( n + 1, c2 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            for( j = 1; j < length + 1; j++ ) {
               b[j] = a[j] - b[j-1];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 12 :
         // --- 2nd order recursion.
         // b[j] depends on b[j-1] and b[j-2]: sequential for the same
         // reason as the 1st order recursion.
         a = vec_fill( n + 2, c1 );
         b = vec_fill( n + 2, c2 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            for( j = 2; j < length + 2; j++ ) {
               b[j] = a[j] + b[j-1] - b[j-2];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 13 :
         // --- 2nd difference (3-point stencil on a, result in b).
         a = vec_fill( n + 2, c1 );
         b = vec_zero( n + 2 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 1; j < length + 1; j++ ) {
               b[j] = a[j+1] + 2*a[j] - a[j-1];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 14 :
         // --- 9th degree polynomial evaluation (Horner scheme).
         a = vec_fill( n, c0 );
         b = vec_zero( n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               b[j] = c0 + a[j]*(c1 + a[j]*(c2 + a[j]*(c3 + a[j]*
                     (c4 + a[j]*(c5 + a[j]*(c6 + a[j]*(c7 + a[j]*
                     (c8 + a[j]*c9))))))));
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 15 :
         // --- Multiplication with stride 3.
         a = vec_fill( 3*n, c0 );
         b = vec_fill( 3*n, c8 );
         c = vec_zero( 3*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 3*length; j += 3 ) {
               c[j] = a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 16 :
         // --- Multiplication with stride 4.
         a = vec_fill( 4*n, c0 );
         b = vec_fill( 4*n, c8 );
         c = vec_zero( 4*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 4*length; j += 4 ) {
               c[j] = a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 17 :
         // --- Division with stride 3.
         a = vec_fill( 3*n, c0 );
         b = vec_fill( 3*n, c8 );
         c = vec_zero( 3*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 3*length; j += 3 ) {
               c[j] = a[j]/b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 18 :
         // --- Division with stride 4.
         a = vec_fill( 4*n, c0 );
         b = vec_fill( 4*n, c8 );
         c = vec_zero( 4*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 4*length; j += 4 ) {
               c[j] = a[j]/b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 19 :
         // --- Dotproduct with stride 3.
         a = vec_fill( 3*n, c0 );
         b = vec_fill( 3*n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            s0 = 0.0;
#pragma omp parallel for reduction(+:s0)
            for( j = 0; j < 3*length; j += 3 ) {
               s0 = s0 + a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 20 :
         // --- Dotproduct with stride 4.
         a = vec_fill( 4*n, c0 );
         b = vec_fill( 4*n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            s0 = 0.0;
#pragma omp parallel for reduction(+:s0)
            for( j = 0; j < 4*length; j += 4 ) {
               s0 = s0 + a[j]*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 21 :
         // --- DAXPY: x[j] = x[j] + const*y[j] with stride 3.
         a = vec_fill( 3*n, c0 );
         b = vec_fill( 3*n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 3*length; j += 3 ) {
               a[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 22 :
         // --- DAXPY: x[j] = x[j] + const*y[j] with stride 4.
         a = vec_fill( 4*n, c0 );
         b = vec_fill( 4*n, c8 );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 4*length; j += 4 ) {
               a[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 23 :
         // --- General vector update: y[j] = x1[j] + c3*x2[j] with stride 3.
         a = vec_fill( 3*n, c0 );
         b = vec_fill( 3*n, c8 );
         c = vec_zero( 3*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 3*length; j += 3 ) {
               c[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 24 :
         // --- General vector update: y[j] = x1[j] + c3*x2[j] with stride 4.
         a = vec_fill( 4*n, c0 );
         b = vec_fill( 4*n, c8 );
         c = vec_zero( 4*n );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < 4*length; j += 4 ) {
               c[j] = a[j] + c3*b[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 25 :
         // --- Vector scatter.
         // NOTE(review): the parallel loop is only race-free if permut()
         // fills indx with distinct values -- confirm.
         a    = vec_fill( n, c0 );
         b    = vec_zero( n );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               b[indx[j]] = a[j];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 26 :
         // --- Vector gather.
         a    = vec_fill( n, c0 );
         b    = vec_zero( n );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               b[j] = a[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 27 :
         // --- Indirect indexed multiply.
         a    = vec_zero( n );
         b    = vec_fill( n, c0 );
         c    = vec_fill( n, c8 );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = b[indx[j]]*c[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 28 :
         // --- Indirect indexed division.
         a    = vec_zero( n );
         b    = vec_fill( n, c0 );
         c    = vec_fill( n, c8 );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = b[indx[j]]/c[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 29 :
         // --- Indirect indexed dotproduct.
         a    = vec_fill( n, c8 );
         b    = vec_fill( n, c0 );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
            s0 = 0.0;
#pragma omp parallel for reduction(+:s0)
            for( j = 0; j < length; j++ ) {
               s0 = s0 + a[indx[j]]*b[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 30 :
         // --- Indirect indexed DAXPY.
         a    = vec_fill( n, c8 );
         b    = vec_fill( n, c0 );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = a[j] + c3*b[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      case 31 :
         // --- Indirect indexed general vector update.
         a    = vec_zero( n );
         b    = vec_fill( n, c0 );
         c    = vec_fill( n, c2 );
         indx = idx_zero( n );
         permut( length, indx );
         elapsed = cclock();
         for( k = 0; k < repeat; k++ ) {
            dummy( &s0, a, c0, length );
#pragma omp parallel for
            for( j = 0; j < length; j++ ) {
               a[j] = c[indx[j]] + c3*b[indx[j]];
            }
         }
         elapsed = cclock() - elapsed;
         break;
      default :
         // Nothing was allocated or timed: report and bail out instead
         // of returning an indeterminate value.
         printf( "In timing routine: Unknown kernel # %d\n", kernno );
         return( 0.0 );
   }

   // Measure the cost of the dummy() calls alone. The work arrays are
   // still allocated at this point, so dummy() receives a valid pointer.
   corr = cclock();
   for( k = 0; k < repeat; k++ ) {
      dummy( &s0, a, c0, length );
   }
   // Form the small interval first; subtracting the raw clock value from
   // `elapsed` would lose precision when cclock() returns large numbers.
   overhead = cclock() - corr;

   // free( NULL ) is a no-op, so unused arrays need no special casing.
   free( a ); free( b ); free( c ); free( d ); free( e );
   free( indx );

   return( elapsed - overhead );
}
