      Subroutine d_psrs( np, mx, keys, npnew )
! ----------------------------------------------------------------------
      Use      numerics
      Use      dist_module
      Implicit None

      Include 'mpif.h'
! ----------------------------------------------------------------------
! --- Parallel Sort by Regular Sampling algorithm as described by
!     Xiaobo Li, et. al.
!
! --- This implementation is based on the PSRS version for the 
!     HP/Convex SPP-1000 of C. Mobarry/GSFC & J. Crawford/VSEP
!     at NASA.
! --- Adapted to be used as a generic subroutine for 8-byte Real data.
!     In this Fortran 90 implementation no preprocessing is needed.
! --- npnew has been added to the parameter list to enable global
!     combining of the sorted list.
! --- A bug has been removed out of the merging routine that caused
!     the keys() array not to be rearranged.
! ----------------------------------------------------------------------
! --- keys() is the array of keys to be sorted.
!     w1() is a work array.
! ----------------------------------------------------------------------
      Integer  :: np, mx, npnew
      Real(l_) :: keys(mx)

      Real(l_) :: w1(mx)
      Integer  :: irnkl(mx)
      Integer  :: islen(nodes), irlen(nodes)

! --- fencepost index and key values for shuffle:
      Integer  :: fposts(nodes+1), gposts(nodes+1)
      Integer  :: itabr(nodes), itabl(nodes+1)
      Real(l_) :: work(nodes*(nodes+1))
      Real(l_) :: fpval(nodes+1)
      Real(l_) :: lmax, ak

      Integer  :: stat(MPI_Status_Size)
      Integer  :: comm, datyp1, datyp2
      Real(l_) :: buf(nodes)
      Integer  :: ier, itag
      Integer  :: i, j, k, step
! ----------------------------------------------------------------------
      itag   = 0
      datyp1 = MPI_Integer
      datyp2 = MPI_Real8
      comm   = MPI_Comm_World
! ----------------------------------------------------------------------
! --- Local sorts. Note that indx() is a local index on the process.

      w1 = keys
      Call dqsort( w1, np , 1, np )
      If ( nodes == 1 ) Then       ! --- Serial sort: we're done.
         keys  = w1
         npnew = np
         Return
      End If

! --- w1() is now the sorted keys.

      lmax = w1(np)

! --- Choose nodes evenly spaced values out of every bin.
!     Store them all in work().

      k    = 1
      step = np/nodes
      Do i = 1,nodes
         work(i) = w1(k)
         k       = k + step
      End Do

! --- work(1:nodes) are the sampled key values for the fenceposts.

      itag = itag + 1

      If ( me /= 0 ) Then
         Call MPI_Send( work, nodes, datyp2, 0, itag, comm, ier )
      Else
         Do i = 1, nodes - 1
            Call MPI_Recv( buf, nodes, datyp2, MPI_Any_Source,
     &                     itag, comm, stat, ier)
            Do j = 1, nodes
               work(stat(MPI_Source)*nodes+j) = buf(j)
            End Do
         End Do
      End If

! --- Insertion sort the fencepost values of keys and indexes.

      If ( me == 0 ) Then
         Do i = 2, nodes*nodes
            ak = work(i)
            Do j = i, 2, -1
               If ( work(j-1) <= ak ) Go To 10
               work(j) = work(j-1)
            End Do
            j = 1
   10       Continue
            work(j) = ak
         End Do

! --- After the insertion sort work() contains the sorted sampled
!     keys for the fenceposts. We put them in fpval and braodcast them.

         k = 1
         Do i = 1, nodes*nodes, nodes
            fpval(k) = work(i)
            k = k + 1
         End Do
      End If
      Call MPI_BCast( fpval, nodes+1, datyp2, 0, comm, ier )

      fpval(nodes+1) = lmax + 1

! --- Determine segment boundaries. Within each bin, fposts(i) is the
!     start of the i-th shuffle segment.

      fposts(1) = 1
      k         = 2
      Do i = 1, np

! --- The first element may be greater than several fencepost values,
!     so we must use a do-while loop.

         Do 
            If ( w1(i) < fpval(k) ) Exit
            fposts(k) = i
            k = k + 1
         End Do
      End Do

! --- The last element may not be greater than the last fencepost value,
!     so we must assign an appropriate value to every fencepost past the
!     last.

      Do i = k, nodes+1
         fposts(i) = np + 1
      End Do

! --- Every process needs fposts() values from every other process, so
!     we will give each process a copy of all the fposts's in work().

      Do i = 1, nodes
         islen(i) = fposts(i+1) - fposts(i)
      End Do

      Call MPI_Alltoall(islen, 1, datyp1, irlen, 1, datyp1, comm, ier )

! --- Make sure that "fposts" and "gposts" are zero based for
!     MPI_Alltoallv. fposts and gposts are the addresses of the segment
!     boundaries.

      fposts(1) = 0
      gposts(1) = 0
      Do i = 1, nodes
         fposts(i+1) = fposts(i) + islen(i)
         gposts(i+1) = gposts(i) + irlen(i)
      End Do

      npnew = gposts(nodes+1)

      Call MPI_Alltoallv(
     &     w1  , islen, fposts, datyp2,
     &     keys, irlen, gposts, datyp2,
     &     comm, ier )

!--- Set up the information for the merge:

      Do i = 1, nodes + 1
         itabl(i) = gposts(i)
      End Do

! --- Merge the segments within each bin.

      Call d_merge( npnew, nodes, mx, keys, irnkl, itabl, itabr )
! ----------------------------------------------------------------------
      End Subroutine d_psrs
