LCOV - code coverage report
Current view: top level - utils/pilgrim - mod_comm.F90 (source / functions) Hit Total Coverage
Test: coverage.info Lines: 641 1842 34.8 %
Date: 2025-03-14 01:21:06 Functions: 26 52 50.0 %

          Line data    Source code
       1             : #define MOD_ASSUMED_SIZE
       2             : !#define MOD_SPECIFIED_SHAPE
       3             : #if !defined( STAND_ALONE )
       4             : #define NOR4 ! Activate to effectively eliminate real*4 window
       5             : #endif
       6             : !BOP
       7             : !
       8             : ! !MODULE: mod_comm --- SPMD parallel decompostion/communication module
       9             :       module mod_comm
      10             : !
      11             : ! !DESCRIPTION:
      12             : !
      13             : !  \paragraph{Overview}
      14             : !
      15             : !    This module contains SPMD parallelism decomposition and
      16             : !    communication routines.  This library was originally written by
      17             : !    W. Putman and S.-J. Lin for simple gridded communications in the
      18             : !    Finite-Volume General Circulation Model (FVGCM).  Most of the
      19             : !    member functions are specific to the type of gridded data, 
      20             : !    ghost communication and decompositions used in FVGCM (which 
      21             : !    are, however, very common in atmospheric models).
      22             : !
      23             : !    The module was extended for irregular communication
      24             : !    by W. Sawyer and A. Mirin. It is now
      25             : !    a more general tool and has been incorporated into the Parallel
      26             : !    Library for Grid Manipulations (PILGRIM) which is used in the
      27             : !    Community Atmospheric Model (CAM) and the Physical-space
      28             : !    Statistical Analysis System (PSAS).
      29             : !
      30             : !    **********************************************************
      31             : !    The storage associated with the irregular communications
      32             : !    is based on CAM requirements. It runs the risk of functioning
      33             : !    improperly when used in another code.
      34             : !    **********************************************************
      35             : !
      36             : !    Irregular communication is based on the {\tt blockdescriptor}
      37             : !    derived type, which defines a set of parcels which are to be
      38             : !    send to (or received from) another PE.  The irregular 
      39             : !    communication routines operate on arrays of block descriptors
      40             : !    whose length is equal to number of PEs involved in the
      41             : !    communication.  This means the irregular communication primitives
      42             : !    are merely non-blocking (potentially) all-to-all primitives.
      43             : ! 
      44             : !    This package is based on standard MPI-1 communications, and OpenMP
      45             : !    may be implemented. MPI-2 and SHMEM support have been removed.
      46             : !
      47             : !
      48             : !  \paragraph{Use of Global Arrays}
      49             : !
      50             : !    The module uses the concept of global arrays (coined from former
      51             : !    usage of shared memory arenas in the "multi-level parallelism"
      52             : !    (MLP) paradigm).  Global arrays are merely buffers into which
      53             : !    data are packed for the transfer to other PEs and are not
      54             : !    necessarily of global extent. All such arrays are
      55             : !    1-dimensional; they are accessed as needed with offset vars.
      56             : !
      57             : !  \paragraph{Use of Windows}
      58             : !
      59             : !       All implementations use real*8, real*4, and integer*4 windows
      60             : !       which are used with global arrays as follows:
      61             : !
      62             : !       \begin{itemize}
      63             : !         \item   r8\_win -> ga\_r8 - for use with real*8 types
      64             : !         \item   r4\_win -> ga\_r4 - for use with real*4 types
      65             : !         \item   i4\_win -> ga\_i4 - for use with integer*4 types
      66             : !       \end{itemize}
      67             : !
      68             : !       note: MPI routines need 2 buffers per GA, ga\_<type>\_s & ga\_<type>\_r
      69             : !             ga\_<type>\_r is used for the windows
      70             : !
      71             : !  \paragraph{Compilation}
      72             : !
      73             : !    This module contains several precompile options:
      74             : !
      75             : !    \begin{itemize}
      76             : !      \item {\tt STAND_ALONE}:  Use as stand-alone library (if
      77             : !                                defined) or as part of CAM (if 
      78             : !                                undefined)
      79             : !      \item {\tt MODCM_TIMING}: Turn on CAM timing routines (only
      80             : !                                available if compiled in CAM framework)
      81             : !      \item {\tt _OPENMP}:      Implicit token (controlled by
      82             : !                                compiler) to enable OpenMP
      83             : !    \end{itemize}
      84             : !
      85             : !    
      86             : !  \paragraph{Usage}
      87             : !
      88             : !    NOTE - must call PILGRIM routine parinit to initialize before
      89             : !    making any other calls.
      90             : !
      91             : !    The public members of this module are:
      92             : !
      93             : !      \begin{itemize}
      94             : !         \item {\tt mp\_init}:          Initialize module
      95             : !         \item {\tt mp\_exit}:          Exit module
      96             : !         \item {\tt mp\_send4d\_ns}:    Ghost 4D array on north/south
      97             : !         \item {\tt mp\_recv4d\_ns}:    Complete 4D N/S ghost operation
      98             : !         \item {\tt mp\_send2\_ns}:     Ghost 2 3D arrays on north/south
      99             : !         \item {\tt mp\_recv2\_ns}:     Complete 2x3D N/S ghost operation
     100             : !         \item {\tt mp\_send3d}:        Send 3D general ghost region
     101             : !         \item {\tt mp\_recv3d}:        Complete 3D general ghost operation
     102             : !         \item {\tt mp\_send3d\_2}:     Send 2x3D general ghost regions
     103             : !         \item {\tt mp\_recv3d\_2}:     Complete 2x3D general ghost operation
     104             : !         \item {\tt get\_partneroffset}:Offset for remote write
     105             : !         \item {\tt mp\_sendirr}:       Initiate all-to-all send of parcels
     106             : !         \item {\tt mp\_recvirr}:       Complete all-to-all chunk commun.
     107             : !       \end{itemize}
     108             : !
     109             : !     There are variants of some of these routines for r4 and i4 data types.
     110             : !     There are other public routines, but these are only used internally
     111             : !     in PILGRIM, and they should not be called by user applications.
     112             : !
     113             : ! !REVISION HISTORY:
     114             : !    2001.09.01   Lin
     115             : !    2002.04.16   Putman  Modified for Global Array code
     116             : !    2002.04.16   Putman  Added ProTeX documentation
     117             : !    2002.05.28   Putman  Added use of precision module
     118             : !    2003.06.24   Sawyer  Minor additions for use with mod_irreg
     119             : !    2004.01.08   Sawyer  Removed older functionality, no longer needed
     120             : !    2004.02.10   Mirin   Major restructuring and simplification. Documentation
     121             : !    2004.03.06   Sawyer  Additional documentation; cosmetics
     122             : !    2005.03.20   Sawyer  Added extensive support for real*4
     123             : !    2005.10.12   Worley  Improved vectorization of buffer copies and general clean-up
     124             : !    2006.05.15   Mirin   Make dynamic allocation the default; general clean-up.
     125             : ! !USES:
     126             : #if defined( STAND_ALONE )
     127             : # define iulog 6
     128             : #else
     129             :       use cam_logfile, only: iulog
     130             : #endif
     131             : 
     132             : !
     133             : ! Performance bug work around for Gemini interconnect
     134             : !
     135             : #ifdef _NO_MPI_RSEND
     136             : #define MPI_RSEND MPI_SEND
     137             : #define mpi_rsend mpi_send
     138             : #define MPI_IRSEND MPI_ISEND
     139             : #define mpi_irsend mpi_isend
     140             : #endif 
     141             : 
     142             : !
     143             : ! Mod_comm has option for stand-alone use as well as within CAM
     144             : !
     145             : 
     146             : #if defined ( SPMD )
     147             : 
     148             : #if defined( STAND_ALONE )
     149             : # define r8 selected_real_kind(12)
     150             : # define r4 selected_real_kind( 6)
     151             : # define i8 selected_int_kind(13)
     152             : # define i4 selected_int_kind( 6)
     153             : # define PLON        144
     154             : # define PLAT         91
     155             : # define PLEV         26
     156             : # define PCNST         1
     157             : #else
     158             :       use shr_kind_mod, only : r8 => shr_kind_r8, r4 => shr_kind_r4,  &
     159             :                                i8 => shr_kind_i8, i4 => shr_kind_i4
     160             : #endif
     161             : #if defined( MODCM_TIMING )
     162             :       use perf_mod
     163             : #endif
     164             : 
     165             :       implicit none
     166             : 
     167             : #include "mpif.h"
     168             : 
     169             : ! !PUBLIC MEMBER FUNCTIONS:
     170             :       public mp_init, mp_exit,                                             &
     171             :              mp_send4d_ns, mp_recv4d_ns, mp_send4d_ns_r4, mp_recv4d_ns_r4, &
     172             :              mp_send2_ns, mp_recv2_ns, mp_send3d_2, mp_recv3d_2,           &
     173             :              mp_send3d, mp_recv3d, mp_sendirr, mp_recvirr,                 &
     174             :              mp_sendirr_r4, mp_recvirr_r4, mp_sendirr_i4, mp_recvirr_i4,   &
     175             :              mp_swapirr, mp_swapirr_i4, mp_barrier,                        &
     176             :              get_partneroffset, mp_r8, mp_r4, mp_i4,                       &
     177             :              mp_sendtrirr, mp_recvtrirr, mp_swaptrirr
     178             :       public modcam_method, modcam_geopk, modcam_gatscat, modcam_npryz, modcam_maxirr
     179             : 
     180             : ! !PRIVATE MEMBER FUNCTIONS:
     181             :       private ceil2    ! copy of routine in atm/cam/src/utils/spmdutils
     182             :       private pair     ! copy of routine in atm/cam/src/utils/spmdutils
     183             : 
     184             : !------------------------------------------------------------------------------
     185             : !  type declaration for describing an arbitrary number of contiguous parcels
     186             : !  this is for irregular communications
     187             : !------------------------------------------------------------------------------
     188             :       type blockdescriptor
     189             :          integer              :: method             ! transpose method
     190             :          integer              :: type               ! Ptr to MPI derived type
     191             :          integer, pointer     :: displacements(:)   ! Offsets in local segment
     192             :          integer, pointer     :: blocksizes(:)      ! Block sizes to transfer
     193             :          integer              :: partneroffset      ! Aggregated partner offset
     194             :          integer              :: partnertype        ! Ptr to partner's MPI derived type
     195             :          integer              :: Nparcels           ! size( displacements )
     196             :          integer              :: Tot_Size           ! sum ( blocksizes )
     197             :       end type blockdescriptor
     198             : 
     199             : ! Transpose methods (method)
     200             : !      0 for contiguous temporary buffer
     201             : !      1 for direct communication (derived types)
     202             : 
     203             : ! The variables immediately below refer specifically to mpi derived types
     204             :       INTEGER, ALLOCATABLE, SAVE :: InHandle(:, :)
     205             :       INTEGER, ALLOCATABLE, SAVE :: OutHandle(:, :)
     206             :       INTEGER, SAVE :: BegTrf = 0  ! Ongoing overlapped begintransfer #
     207             :       INTEGER, SAVE :: EndTrf = 0  ! Ongoing overlapped endtransfer #
     208             :       INTEGER, SAVE :: MaxTrf = 0  ! Max no. active Mp_sendirr derived type messages
     209             : 
     210             : ! !PUBLIC DATA MEMBERS:
     211             :       integer, SAVE:: gid                         ! PE id
     212             :       integer(i4), SAVE:: masterpro = 0           ! Master process id 
     213             :       integer(i4), SAVE:: numpro                  ! Permanent No. of PEs
     214             :       integer(i4), SAVE:: numcomm                 ! Local No. of PEs
     215             :       integer(i4), SAVE:: numcpu                  ! No. of threads
     216             :       integer, SAVE:: commglobal                  ! Global Communicator
     217             :       integer, SAVE:: Max_Nparcels = 0            ! Maximum number of parcels in
     218             :                                                   !  single blockdescriptor
     219             : 
     220             : !------------------------------------------------------------------------------
     221             : !  Local parameters
     222             : !------------------------------------------------------------------------------
     223             :       integer, parameter:: nbuf = 2               ! Max No. of sends per call
     224             : ! mp_send4d_ns has two sends per call (full border regions to north and south)
     225             : ! mp_send2_ns has four sends per call (2 directions and 2 variables); however,
     226             : !  only one ghost latitude is sent, so nbuf=2 suffices as long as nghost
     227             : !  is greater than 1.
     228             : ! mp_send3d has one send per call (border region in one direction).
     229             : ! mp_send3d_2 has two sends per call (2 variables, border region in one direction).
     230             :       integer, parameter:: nghost = 3             ! No. of ghost indices
     231             :       integer, parameter:: max_nq = 1             ! No. of tracers simultaneously
     232             :                                                   !  border communicated; can be
     233             :                                                   !  overridden with dynamic storage
     234             :       integer, parameter:: max_trac = PCNST       ! No. of tracers
     235             :       integer, parameter:: max_call = 2           ! Max No. of back-to-back...
     236             :                                                   ! ...mp_send calls
     237             : ! Currently, CAM has at most two overlapping border communication calls
     238             : ! The above variable is relevant for contiguous irregular communications
     239             : 
     240             :       integer, parameter:: idimsize = PLON*nghost*(PLEV+1)*max_nq
     241             :                                                   ! Size of MPI buffer region
     242             :                                                   ! in mp_send/mp_recv calls, used
     243             :                                                   ! to determine offset in GA
     244             :       integer, parameter:: platg = PLAT + 2*nghost
     245             :       integer, parameter :: mp_r4 = MPI_REAL
     246             :       integer, parameter :: mp_r8 = MPI_DOUBLE_PRECISION
     247             :       integer, parameter :: mp_i4 = MPI_INTEGER
     248             : 
     249             : !------------------------------------------------------------------------------
     250             : !  Local variables
     251             : !------------------------------------------------------------------------------
     252             : 
     253             :       integer, SAVE:: max_irr = 0       ! Max No. active Mp_sendirr calls with window
     254             :       integer ierror
     255             :       integer, SAVE:: sizet1, sizer8, sizer4, sizei4
     256             : 
     257             : ! CAM-specific variables
     258             :       integer, SAVE:: tracmax, tracbmax, dpvarmax, totvar
     259             :       integer, SAVE:: phys_transpose_mod
     260             :       integer, SAVE:: idimsizz
     261             :       integer, SAVE:: modcam_method, modcam_geopk, modcam_gatscat
     262             :       integer, SAVE:: modcam_npryz(4), modcam_tagoffset, modcam_maxirr
     263             :       integer, parameter :: phys_transpose_modmin = 11
     264             :       integer, parameter :: phys_transpose_vars = 7
     265             :       data phys_transpose_mod / -1 /
     266             :       data modcam_method / -1 /
     267             :       data modcam_geopk / -1 /
     268             :       data modcam_gatscat / -1 /
     269             :       data modcam_npryz / -1, -1, -1, -1 /
     270             :       data modcam_tagoffset / 0 /
     271             :       data modcam_maxirr / -1 /
     272             : !
     273             : ! tracmax is the maximum number of tracers simultaneously transposed within dynamics (set to 1)
     274             : !    (except in dynamics-physics transposes)
     275             : ! tracbmax is the maximum number of tracers simultaneously border communicated
     276             : ! dpvarmax is the number of variables communicated in dynamics-physics transposes 
     277             : ! totvar is the maximum number of variables simultaneously transposed
     278             : ! phys_transpose_mod is the communication method for dynamics/physics transposes; admissable values
     279             : !     are >= phys_transpose_modmin; it is communicated from CAM when such transposes
     280             : !     are requested.
     281             : ! phys_transpose_vars is the number of non-tracer variables transposed between dynamics and
     282             : !     physics instantiations in CAM.
     283             : ! modcam_method, modcam_geopk and modcam_gatscat correspond to mod_method, mod_geopk and
     284             : !     mod_gatscat in CAM.
     285             : ! modcam_npryz corresponds to npr_yz in CAM.
     286             : ! modcam_maxirr corresonds to mod_maxirr in CAM.
     287             : 
     288             : !------------------------------------------------------------------------------
     289             : !  Variables to control global array locations and window synchronization
     290             : !------------------------------------------------------------------------------
     291             :       integer win_count                 ! Counts No. of windows in use
     292             :       integer igosouth, igonorth        ! Index of latitudinal send direction
     293             :       integer ifromsouth, ifromnorth    ! Index of latitudinal recv direction
     294             : 
     295             : !------------------------------------------------------------------------------
     296             : !  Local type declaration for mp_windows
     297             : !------------------------------------------------------------------------------
     298             :       type window
     299             :          integer :: id            ! Window id
     300             :          integer :: size          ! Size of global window (point based)
     301             :          integer :: ncall_s       ! Count send calls on window
     302             :          integer :: ncall_r       ! Count recv calls on window
     303             :          integer :: offset_s      ! Starting position in GA send
     304             :          integer :: offset_r      ! Starting position in GA recv
     305             :          integer :: dest          ! For use with send calls
     306             :          integer :: src           ! For use with recv calls
     307             :          integer :: size_r        ! Size of incoming message
     308             :          integer :: nsend         ! Send counter
     309             :          integer :: nrecv         ! Receive post counter
     310             :          integer :: nread         ! Receive confirm counter
     311             :          integer, pointer :: sqest(:) ! Send handle
     312             :          integer, pointer :: rqest(:) ! Receive handle
     313             :      end type window
     314             : 
     315             : !------------------------------------------------------------------------------
     316             : ! Beginning Global Array variable declaration:
     317             : !------------------------------------------------------------------------------
     318             : 
     319             :       type (window) :: r8_win
     320             :       type (window) :: r4_win
     321             :       type (window) :: i4_win
     322             :       type (window) :: t1_win
     323             : 
     324             : ! Upper bound on ratio of local to average storage over subdomains.
     325             : ! This takes into account different sized subdomains.
     326             : 
     327             :       real*8, parameter :: alloc_slack_factor = 1.2_r8
     328             : 
     329             : !
     330             : !   window variable declarations
     331             : !
     332             :       real(r8), allocatable,    SAVE:: ga_t1_r(:)
     333             :       real(r8), allocatable,    SAVE:: ga_t1_s(:)
     334             :       real(r8), allocatable,    SAVE:: ga_r8_r(:)
     335             :       real(r8), allocatable,    SAVE:: ga_r8_s(:)
     336             :       real(r4), allocatable,    SAVE:: ga_r4_r(:)
     337             :       real(r4), allocatable,    SAVE:: ga_r4_s(:)
     338             :       integer(i4), allocatable, SAVE:: ga_i4_r(:)
     339             :       integer(i4), allocatable, SAVE:: ga_i4_s(:)
     340             : !
     341             : !   auxiliary variable declarations
     342             : !
     343             :       integer, SAVE:: Status(MPI_STATUS_SIZE)
     344             :       integer, allocatable, SAVE:: Stats(:)
     345             : !
     346             : !EOP
     347             : !------------------------------------------------------------------------------
     348             :       contains
     349             : !------------------------------------------------------------------------------
     350             : !BOP
     351             : ! !ROUTINE: mp_init --- Initialize SPMD parallel communication
     352             : !
     353             : ! !INTERFACE:
     354        1536 :       subroutine mp_init( comm, npryzxy, mod_method, mod_geopk, mod_gatscat, mod_maxirr )
     355             : !
     356             : ! !INPUT PARAMETERS:
     357             :       integer, optional :: comm                        ! communicator
     358             :       integer, optional, intent(in) :: npryzxy(4)      ! 2D decomposition
     359             :       integer, optional, intent(in) :: mod_method      ! CAM optimization
     360             :       integer, optional, intent(in) :: mod_geopk       ! CAM optimization
     361             :       integer, optional, intent(in) :: mod_gatscat     ! CAM optimization
     362             :       integer, optional, intent(in) :: mod_maxirr      ! CAM optimization
     363             : ! !DESCRIPTION:
     364             : !
     365             : !     Initialize SPMD parallel communication.  It is recommended that
     366             : !     COMM (main communicator) and NPRYZXY (2D decomposition) be set.
     367             : !
     368             : !     Set the mod* variables only if you are acquainted with their 
     369             : !     meaning (default is 0).
     370             : !
     371             : ! !REVISION HISTORY: 
     372             : !    2001.09.01   Lin
     373             : !    2002.02.15   Putman        Modified for Global Array code
     374             : !    2002.04.09   Putman        Added ProTeX documentation
     375             : !    2002.08.06   Sawyer        Added optional communicator input argument
     376             : !    2006.06.15   Sawyer        Added CAM-dependent optional arguments
     377             : !
     378             : !EOP
     379             : !------------------------------------------------------------------------------
     380             : !BOC
     381             : !
     382             : ! !LOCAL VARIABLES:
     383             :       integer mysize
     384             :       integer using_window, vertical_lines, latitude_lines
     385             :       integer local_dynamic_storage, numpro_use
     386             :       real*8 geopkrat, one, ghostrat
     387             : 
     388             : ! Initialize MPI; allow for general communicator
     389        1536 :       if ( present(comm) ) then
     390        1536 :         call mpi_start( comm )
     391             :       else
     392           0 :         call mpi_start( MPI_COMM_WORLD )
     393             :       endif
     394             : ! Initialize OpenMP
     395        1536 :       call omp_start
     396             : !
     397             : ! Adopt 2D decomposition if provided.
     398             : !
     399        1536 :       modcam_npryz = (/ 1,1,1,1 /)    ! Default value (sequential)
     400        1536 :       if ( present( npryzxy ) ) then
     401        1536 :           modcam_npryz(1:4) = npryzxy(1:4)
     402        1536 :           modcam_tagoffset = modcam_npryz(3) * modcam_npryz(4)
     403             :       endif
     404        1536 :       if (gid .eq. 0) then
     405           2 :         write (iulog,*) 'MOD_COMM - modcam_npryz = ', &
     406           2 :                modcam_npryz(1), modcam_npryz(2),     &
     407           4 :                modcam_npryz(3), modcam_npryz(4)
     408           2 :         write (iulog,*) 'MOD_COMM - modcam_tagoffset = ', modcam_tagoffset
     409             :       endif
     410             : 
     411             : !
     412             : ! Set CAM optimization variables
     413             : !
     414             : ! modcam_method refers to irregular communications for transposes
     415             : ! modcam_geopk refers to irregular communications for the geopotential
     416             : ! modcam_gatscat refers to irregular communications for gather/scatters
     417             : ! For any of these, a value of 0 means source data will be gathered into a contiguous
     418             : !  buffer (window), communicated to a contiguous buffer (window) in the target, and
     419             : !  then scattered to its final destination; a value of 1 means MPI derived types will
     420             : !  be used (hence not requiring window storage).
     421             : ! modcam_maxirr refers to maximum number of irregular communications to be active at once
     422        1536 :       modcam_method  = 0   ! Default value
     423        1536 :       modcam_geopk   = 0   ! Default value
     424        1536 :       modcam_gatscat = 0   ! Default value
     425        1536 :       modcam_maxirr = 1   ! Default value
     426        1536 :       if ( present( mod_method ) )  modcam_method  = mod_method
     427        1536 :       if ( present( mod_geopk ) )   modcam_geopk   = mod_geopk
     428        1536 :       if ( present( mod_gatscat ) ) modcam_gatscat = mod_gatscat
     429        1536 :       if ( present( mod_maxirr ) ) modcam_maxirr = mod_maxirr
     430             : 
     431        1536 :       if (gid .eq. 0) then
     432           2 :         write(iulog,*) 'MOD_COMM - modcam_method modcam_geopk modcam_gatscat modcam_maxirr = ',    &
     433           4 :         modcam_method, modcam_geopk, modcam_gatscat, modcam_maxirr
     434             :       endif
     435             : 
     436             : !
     437             : ! End CAM optimizations
     438             : !
     439             : 
     440        1536 :       MaxTrf = modcam_maxirr
     441        1536 :       max_irr = modcam_maxirr
     442             : 
     443        1536 :       win_count = 0
     444             : !
     445             : !*************************************************************************
     446             : ! local_dynamic_storage is set to 1 when window storage is based on locally dimensioned
     447             : !  arrays, 0 otherwise; this occurs when modcam_gatscat equals 1, as it is only the
     448             : !  gather/scatters that require global storage.
     449             : !*************************************************************************
     450             : !
     451             :       local_dynamic_storage = 0
     452             :       if (modcam_gatscat .eq. 1) local_dynamic_storage = 1
     453             : 
     454             : !*************************************************************************
     455             : ! Override original strategy, as only single 2D lat-lon variables (rather
     456             : !  than 3D with multiple tracers) are used for gather/scatters;
     457             : !  set local_dynamic_storage to 1 always, and then allow for gather
     458             : !  of 2D lat-lon variable in inidat.
     459             : !*************************************************************************
     460             : 
     461        1536 :       local_dynamic_storage = 1
     462             : 
     463        4608 :       allocate( Stats(MAX(nbuf,numpro)*MAX(max_call,max_irr)*MPI_STATUS_SIZE) )
     464        6144 :       allocate( InHandle(numpro,MaxTrf) )
     465        4608 :       allocate( OutHandle(numpro,MaxTrf) )
     466             : 
     467        1536 :       idimsizz = idimsize
     468             :       if (local_dynamic_storage .eq. 1) then
     469        1536 :          if (gid .eq. 0) write(iulog,*) 'Using local dynamic storage for mod_comm window'
     470             :       else
     471             :          if (gid .eq. 0) write(iulog,*) 'Using global dynamic storage for mod_comm window'
     472             :       endif
     473             : !
     474             : ! Dynamically allocate target global arrays
     475             : !
     476             : !*************************************************************************
     477             : ! Compute additional storage due to ghost latitudes being included in some
     478             : !   transposes. Allow 3 ghost points on each side. The required storage
     479             : !   could be (6+L)/L times the original storage, where L is the number of
     480             : !   latitude lines in the subdomain. Ghost points can also occur in the
     481             : !   vertical due to edge quantities, but this would not occur simultaneously
     482             : !   with ghost points in latitude; the extra storage due to vertical ghost
     483             : !   points is not nearly as great as with latitude.
     484             : !*************************************************************************
     485        1536 :       using_window = 1   !  This is a local variable
     486        1536 :       if (modcam_method .eq. 1) using_window = 0
     487        1536 :       one = real(1,r8)
     488           0 :       ghostrat = one
     489             :       if (using_window .eq. 1 .and. local_dynamic_storage .eq. 1) then
     490        1536 :          latitude_lines = real(PLAT,r8)/real(modcam_npryz(1),r8)
     491        1536 :          ghostrat = real(6+latitude_lines,r8)/real(latitude_lines,r8)
     492             :       endif
     493        1536 :       if (gid .eq. 0) write(iulog,*) 'Mod_comm - ghostrat = ', ghostrat
     494             : 
     495             : !*************************************************************************
     496             : ! Compute extent to which required window storage for geopotential computation
     497             : !   exceeds that of transpose - relevant only for local dynamic storage,
     498             : !   since with global storage there will be enough space anyway; also,
     499             : !   this applies only when using window; further, this applies only when
     500             : !   the CAM variable geopktrans equals 1, though we do not test for that here.
     501             : ! The geopotential calculation sends a latitude line to every other process
     502             : !   either vertically above or below the given process; there can be
     503             : !   at most modcam_npryz(2)-1 such target processes; compared to transposes
     504             : !   (which send all vertical lines), the amount of data sent is expressed
     505             : !   as the ratio geopkrat; our concern is making the window (whose size
     506             : !   is computed based on transposes) large enough, so we must multiply its
     507             : !   size by geopkrat; we never shrink the window, so geopkrat >= 1.
     508             : !*************************************************************************
     509        1536 :       using_window = 1   !  This is a local variable
     510        1536 :       if (modcam_geopk .eq. 1) using_window = 0
     511        1536 :       one = real(1,r8)
     512           0 :       geopkrat = one
     513             :       if (using_window .eq. 1 .and. local_dynamic_storage .eq. 1) then
     514        1536 :          vertical_lines = ceiling(real(PLEV,r8)/real(modcam_npryz(2),r8))
     515        1536 :          geopkrat = real(modcam_npryz(2)-1,r8)/real(vertical_lines,r8)
     516        1536 :          geopkrat = max(geopkrat,one)
     517             :       endif
     518        1536 :       if (gid .eq. 0) write(iulog,*) 'Mod_comm - geopkrat = ', geopkrat
     519             : 
     520             : !*************************************************************************
     521             : ! beginning of CAM totvar computation
     522             : !*************************************************************************
     523             : 
     524             : ! CAM contains two kinds of transposes. The most commonly referred to transposes
     525             : !  refer to those which connect the xy and yz decompositions. Depending on
     526             : !  the physics decomposition, CAM might additionally compute transposes between
     527             : !  the dynamics and physics; this depends on the variable phys_loadbalance.
     528             : !  Furthermore, these transposes might or might not be computed using mod_comm.
     529             : !  The former transposes are generally performed one variable at a time; the
     530             : !  latter transposes combine all variables to be transposed, including the
     531             : !  full complement of tracers. The maximum number of variables to be 
     532             : !  simultaneously subject to irregular communications is dependent on
     533             : !  whether or not mod_comm is used to compute dynamics-physics transposes
     534             : !  and could depend on the number of tracers.
     535             : 
     536             : ! Compute maximum number of variables to be simultaneously subject
     537             : !  to irregular communications (e.g., transposed variables based on CAM)
     538             : !  and store in the variable 'totvar'.
     539             : 
     540             : ! Tracmax is the number of tracers simultaneously transposed within dynamics;
     541             : ! Tracbmax is the number of tracers simultaneously border comunicated within trac2d;
     542             : !  both of these are currently hardwired to 1.
     543        1536 :       tracmax = 1
     544        1536 :       tracbmax = 1
     545        1536 :       totvar = tracmax
     546             : 
     547             : ! Now consider dynamics-physics transposes in CAM dp_coupling (dpvarmax)
     548             : !  If phys_transpose_mod is still -1, that means it has not been updated
     549             : !  by CAM and hence mod_comm will not be used for dynamics-physics transposes.
     550             : ! (NOTE: phys_transpose_mod is computed in phys_grid_setopts in phys_grid.F90.)
     551             : 
     552             : ! Also note that the logic involving phys_transpose_mod and phys_transpose_modmin
     553             : !  must remain consistent with the coding in phys_grid.F90. Additionally,
     554             : !  phys_transpose_vars must remain consistent with the coding in dp_coupling.F90.
     555             : !  (See above declaration and initialization for CAM-specific variables.)
     556             : 
     557             : ! (begin dpvarmax calculation)
     558             : 
     559        1536 :       if (phys_transpose_mod .eq. -1) then
     560        1536 :          if (gid .eq. 0) write(iulog,*)       &
     561           2 :            '(MOD_COMM) - mod_comm not being used for dynamcis-physics transposes'
     562        1536 :          dpvarmax = 0
     563             : !
     564             : ! If phys_transpose_mod is >= phys_transpose_modmin, that is a signal that mod_comm is to be used
     565             : !  for dynamics/physics transposes in CAM. In that case, one must allocate enough window
     566             : !  storage for those transposes. Presently, the number of such simultaneously transposed
     567             : !  variables equals phys_transpose_vars plus the number of constituents.
     568             : !
     569           0 :       elseif (phys_transpose_mod .ge. phys_transpose_modmin) then
     570           0 :          dpvarmax = phys_transpose_vars + max_trac
     571             :       else
     572           0 :          dpvarmax = 0
     573             :       endif
     574             : 
     575             : ! (end dpvarmax calculation)
     576             : 
     577             : ! totvar is the maximum of (1) the number of tracers to be simultaneously transposed
     578             : !  within the dynamics, and (2) the number of variables to be transposed between
     579             : !  dynamics and physics instantiations in CAM
     580             : 
     581        1536 :       totvar = max(totvar, dpvarmax)
     582             : 
     583             : !*************************************************************************
     584             : ! end of CAM totvar computation
     585             : !*************************************************************************
     586             : 
     587        1538 :       if (gid .eq. 0) write(iulog,*) 'Mod_comm - tracmax dpvarmax totvar tracbmax = ',     &
     588           4 :           tracmax, dpvarmax, totvar, tracbmax 
     589             : 
     590        1536 :       idimsizz = (idimsize/max_nq)*tracbmax
     591        1536 :       sizet1 = idimsizz*nbuf*max_call
     592             : ! Adjust window sizes for geopotential and/or ghost points
     593        1536 :       sizer8 = PLON*platg*(PLEV+1)*totvar*max(geopkrat,ghostrat)*max_irr
     594        1536 :       sizer4 = PLON*platg*(PLEV+1)*totvar*max(geopkrat,ghostrat)*max_irr
     595        1536 :       sizei4 = PLON*PLAT*PLEV*max_irr
     596             : 
     597             : ! Compute local storage requirement for irregular communications by dividing
     598             : !    global requirement by the number of tasks. Allow slack factor to account
     599             : !    for nonuniformity of decomposition and ghost zones. Not valid for global
     600             : !    operations such as gathers and scatters when local windows are used.
     601             :       if (local_dynamic_storage .eq. 1) then
     602        1536 :          numpro_use = modcam_npryz(1) * modcam_npryz(2)
     603        1536 :          sizer8 = ceiling( alloc_slack_factor*real(sizer8,r8)/real(numpro_use,r8) )
     604             : 
     605             : ! Allow for gather of single 2D lat-lon variable in inidat.
     606        1536 :          if (modcam_gatscat .eq. 0) sizer8 = max( sizer8, PLON*PLAT*max_irr )  
     607             : 
     608        1536 :          sizer4 = ceiling( alloc_slack_factor*real(sizer4,r8)/real(numpro_use,r8) )
     609             : ! The only i4 irregular communications in CAM occur in io_dist.
     610        1536 :          sizei4 = 1
     611             :       endif
     612             : 
     613             : # if defined ( NOR4 )
     614        1536 :       sizer4 = 1
     615        1536 :       if (gid .eq. 0) write(iulog,*) 'Mod_comm - r4 windows disabled'
     616             : # endif
     617             : 
     618        1536 :       using_window = 1   !  This is a local variable
     619        1536 :       if (modcam_method .eq. 1 .and. modcam_geopk .eq. 1) using_window = 0
     620             :       if (using_window .eq. 0) then
     621           0 :          if (gid .eq. 0) write(iulog,*) 'Mod_comm - r8 and r4 windows set to trivial size'
     622           0 :          sizer8 = 1
     623           0 :          sizer4 = 1
     624             :       endif
     625             : 
     626             : ! Allocate global storage
     627             : 
     628        4608 :       allocate( ga_t1_r(sizet1) )
     629        3072 :       allocate( ga_t1_s(sizet1) )
     630        4608 :       allocate( ga_r8_r(sizer8) )
     631        3072 :       allocate( ga_r8_s(sizer8) )
     632        4608 :       allocate( ga_r4_r(sizer4) )
     633        3072 :       allocate( ga_r4_s(sizer4) )
     634        4608 :       allocate( ga_i4_r(sizei4) )
     635        3072 :       allocate( ga_i4_s(sizei4) )
     636             : 
     637             : ! Initialize windows
     638             : 
     639        1536 :         mysize = sizet1
     640        1536 :         call win_init_r8(comm, t1_win, ga_t1_r, mysize)
     641        1536 :         if (gid .eq. 0) write(iulog,*) 'Mod_comm t1_win window size = ', mysize
     642             : 
     643        1536 :         mysize = sizer8
     644        1536 :         call win_init_r8(comm, r8_win, ga_r8_r, mysize)
     645        1536 :         if (gid .eq. 0) write(iulog,*) 'Mod_comm r8_win window size = ', mysize
     646             : 
     647        1536 :         mysize = sizer4
     648        1536 :         call win_init_r4(comm, r4_win, ga_r4_r, mysize)
     649        1536 :         if (gid .eq. 0) write(iulog,*) 'Mod_comm r4_win window size = ', mysize
     650             : 
     651        1536 :         mysize = sizei4
     652        1536 :         call win_init_i4(comm, i4_win, ga_i4_r, mysize)
     653        1536 :         if (gid .eq. 0) write(iulog,*) 'Mod_comm i4_win window size = ', mysize
     654             : 
     655        1536 :         igosouth   = 0
     656        1536 :         igonorth   = 1
     657        1536 :         ifromsouth = 1
     658        1536 :         ifromnorth = 0
     659             : 
     660             : !EOC
     661        1536 :       end subroutine mp_init
     662             : !------------------------------------------------------------------------------
     663             : !------------------------------------------------------------------------------
     664             : !BOP
     665             : ! !ROUTINE: mp_exit --- End SPMD parallel communication
     666             : !
     667             : ! !INTERFACE:
     668           0 :       subroutine mp_exit( comm )
     669             : ! !INPUT PARAMETERS:
     670             :       integer, intent(in) :: comm      !  communicator
     671             : ! !DESCRIPTION:
     672             : !
     673             : !     End SPMD parallel communication
     674             : !
     675             : ! !REVISION HISTORY: 
     676             : !    2001.09.01   Lin
     677             : !    2002.02.15   Putman        Modified for Global Array code
     678             : !    2002.04.09   Putman        Added ProTeX documentation
     679             : !
     680             : !EOP
     681             : !------------------------------------------------------------------------------
     682             : !BOC
     683           0 :         call MPI_FINALIZE (ierror)
     684           0 :         return
     685             : !EOC
     686             :       end subroutine mp_exit
     687             : !------------------------------------------------------------------------------
     688             : !------------------------------------------------------------------------------
     689             : !BOP
     690             : ! !ROUTINE: omp_start --- Start openMP parallelism
     691             : !
     692             : ! !INTERFACE:
     693        1536 :       subroutine omp_start
     694             : ! !DESCRIPTION:
     695             : !
     696             : !     Start openMP parallelism
     697             : !
     698             : ! !REVISION HISTORY: 
     699             : !    02.02.15   Putman
     700             : !    02.04.09   Putman        Added ProTeX documentation
     701             : !
     702             : !EOP
     703             : !------------------------------------------------------------------------------
     704             : !BOC
     705             : !
     706             : ! !LOCAL VARIABLES:
     707             :         integer ios, n, nowpro, nowcpu
     708             : 
     709             : ! Compute number of OpenMP threads
     710             : 
     711             : #if defined(_OPENMP)
     712             : 
     713             :         integer omp_get_num_threads
     714             : !$omp parallel
     715             :         numcpu = omp_get_num_threads()
     716             : !$omp end parallel
     717             : 
     718             : #else
     719        1536 :         numcpu = 1
     720             : #endif
     721             : 
     722             : !EOC
     723        1536 :       end subroutine omp_start
     724             : !------------------------------------------------------------------------------
     725             : 
     726             : !------------------------------------------------------------------------------
     727             : !BOP
     728             : ! !ROUTINE: mpi_start --- Start MPI parallelism
     729             : !
     730             : ! !INTERFACE:
     731        1536 :       subroutine mpi_start( comm )
     732             : ! !INPUT PARAMETERS:
     733             :       integer :: comm      !  communicator
     734             : ! !DESCRIPTION:
     735             : !
     736             : !     Start MPI parallelism
     737             : !
     738             : ! !REVISION HISTORY: 
     739             : !    02.02.15   Putman
     740             : !    02.04.09   Putman        Added ProTeX documentation
     741             : !    02.08.06   Sawyer  Added communicator input arguments
     742             : !
     743             : !EOP
     744             : !------------------------------------------------------------------------------
     745             : !BOC
     746             : !
     747             : ! !LOCAL VARIABLES:
     748             :         logical flag
     749             :         integer npthreads
     750             : 
     751        1536 :         call MPI_INITIALIZED( flag, ierror )
     752        1536 :         if ( .not. flag ) then
     753           0 :           call MPI_INIT( ierror )
     754           0 :           comm = MPI_COMM_WORLD
     755             :         endif
     756             : 
     757        1536 :         call MPI_COMM_RANK (comm, gid, ierror)
     758        1536 :         call MPI_COMM_SIZE (comm, numpro, ierror)
     759        1536 :         call MPI_COMM_DUP  (comm, commglobal, ierror)
     760             : !EOC
     761        1536 :       end subroutine mpi_start
     762             : !------------------------------------------------------------------------------
     763             : 
     764             : !------------------------------------------------------------------------------
     765             : !BOP
     766             : ! !ROUTINE: win_init_r8 --- Initialize real*8 communication window
     767             : !
     768             : ! !INTERFACE:
     769        3072 :       subroutine win_init_r8(comm, win, ga, isize)
     770             : ! !INPUT PARAMETERS:
     771             :         integer, intent(in) :: comm      !  communicator
     772             :         integer, intent(in) :: isize
     773             :         real(r8), intent(in) :: ga(isize)
     774             : ! !OUTPUT PARAMETERS:
     775             :         type (window), intent(inout) :: win
     776             : ! !DESCRIPTION:
     777             : !
     778             : !     Initialize real*8 communication window
     779             : !
     780             : ! !REVISION HISTORY: 
     781             : !    02.02.15   Putman
     782             : !    02.04.09   Putman        Added ProTeX documentation
     783             : !
     784             : !EOP
     785             : !------------------------------------------------------------------------------
     786             : !BOC
     787             : !
     788             : ! !LOCAL VARIABLES:
     789             : 
     790        3072 :         win_count = win_count + 1
     791        3072 :         win%id = win_count
     792        3072 :         win%size = isize
     793        3072 :         win%ncall_s = 0
     794        3072 :         win%ncall_r = 0
     795        3072 :         win%nsend = 0
     796        3072 :         win%nrecv = 0
     797        3072 :         win%nread = 0
     798        9216 :         allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     799        6144 :         allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     800             : !EOC
     801        3072 :       end subroutine win_init_r8
     802             : !------------------------------------------------------------------------------
     803             : !------------------------------------------------------------------------------
     804             : !BOP
     805             : ! !ROUTINE: win_init_r4 --- Initialize real*4 communication window
     806             : !
     807             : ! !INTERFACE:
     808        1536 :       subroutine win_init_r4(comm, win, ga, isize)
     809             : ! !INPUT PARAMETERS:
     810             :         integer, intent(in) :: comm      !  communicator
     811             :         integer, intent(in) :: isize
     812             :         real(r4), intent(in) :: ga(isize)
     813             : ! !OUTPUT PARAMETERS:
     814             :         type (window), intent(inout) :: win
     815             : ! !DESCRIPTION:
     816             : !
     817             : !     Initialize real*4 communication window
     818             : !
     819             : ! !REVISION HISTORY: 
     820             : !    02.02.15   Putman
     821             : !    02.04.09   Putman        Added ProTeX documentation
     822             : !
     823             : !EOP
     824             : !------------------------------------------------------------------------------
     825             : !BOC
     826             : !
     827             : ! !LOCAL VARIABLES:
     828             : 
     829        1536 :         win_count = win_count + 1
     830        1536 :         win%id = win_count
     831        1536 :         win%size = isize
     832        1536 :         win%ncall_s = 0
     833        1536 :         win%ncall_r = 0
     834        1536 :         win%nsend = 0
     835        1536 :         win%nrecv = 0
     836        1536 :         win%nread = 0
     837        4608 :         allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     838        3072 :         allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     839             : !EOC
     840        1536 :       end subroutine win_init_r4
     841             : !------------------------------------------------------------------------------
     842             : !------------------------------------------------------------------------------
     843             : !BOP
     844             : ! !ROUTINE: win_init_i4 --- Initialize integer*4 communication window
     845             : !
     846             : ! !INTERFACE:
     847        1536 :       subroutine win_init_i4(comm, win, ga, isize)
     848             : ! !INPUT PARAMETERS:
     849             :         integer, intent(in) :: comm      !  communicator
     850             :         integer, intent(in) :: isize
     851             :         integer(i4), intent(in) :: ga(isize)
     852             : ! !OUTPUT PARAMETERS:
     853             :         type (window), intent(inout) :: win
     854             : ! !DESCRIPTION:
     855             : !
     856             : !     Initialize integer*4 communication window
     857             : !
     858             : ! !REVISION HISTORY: 
     859             : !    02.02.15   Putman
     860             : !    02.04.09   Putman        Added ProTeX documentation
     861             : !
     862             : !EOP
     863             : !------------------------------------------------------------------------------
     864             : !BOC
     865             : !
     866             : ! !LOCAL VARIABLES:
     867             : 
     868        1536 :         win_count = win_count + 1
     869        1536 :         win%id = win_count
     870        1536 :         win%size = isize
     871        1536 :         win%ncall_s = 0
     872        1536 :         win%ncall_r = 0
     873        1536 :         win%nsend = 0
     874        1536 :         win%nrecv = 0
     875        1536 :         win%nread = 0
     876        4608 :         allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     877        3072 :         allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) )
     878             : !EOC
     879        1536 :       end subroutine win_init_i4
     880             : !------------------------------------------------------------------------------
     881             : !------------------------------------------------------------------------------
     882             : !BOP
     883             : ! !ROUTINE: mp_send4d_ns --- Send 4d north/south ghost latitudes (real*8)
     884             : !
     885             : ! !INTERFACE:
     886    16450560 :       subroutine mp_send4d_ns(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &
     887    16450560 :                               ng_s, ng_n, q)
     888             : !
     889             : ! !INPUT PARAMETERS:
     890             :       integer, intent(in):: comm      !  communicator
     891             :       integer, intent(in):: im, jm, km, nq
     892             :       integer, intent(in):: jfirst, jlast
     893             :       integer, intent(in):: kfirst, klast
     894             :       integer, intent(in):: ng_s      ! southern zones to ghost 
     895             :       integer, intent(in):: ng_n      ! northern zones to ghost 
     896             :       real(r8), intent(in):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq)
     897             : !
     898             : ! !DESCRIPTION:
     899             : !
     900             : !     Send 4d north/south ghost latitudes
     901             : !
     902             : ! !REVISION HISTORY: 
     903             : !    2001.09.01   Lin    
     904             : !    2002.02.15   Putman        Modified for Global Arrays code   
     905             : !    2002.04.09   Putman        Added ProTeX documentation
     906             : !
     907             : !EOP
     908             : !------------------------------------------------------------------------------
     909             : !BOC
     910             : !
     911             : ! !LOCAL VARIABLES:
     912             :       integer :: gidu
     913             : 
     914             : #if defined( MODCM_TIMING )
     915             :       call t_startf('mod_comm communication')
     916             : #endif
     917             : 
     918    16450560 :       call MPI_COMM_RANK (comm, gidu, ierror)
     919             : 
     920    16450560 :       call Win_Open(comm, t1_win)
     921             : 
     922             : ! Send to south
     923    16450560 :       if ( jfirst > 1 ) then
     924    16193520 :         t1_win%src = gidu - 1
     925    16193520 :         t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
     926    16193520 :         t1_win%size_r = im*ng_s*(klast-kfirst+1)*nq
     927    16193520 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
     928    16193520 :         t1_win%dest = gidu - 1
     929    16193520 :         t1_win%offset_s = igosouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
     930             :         call Ga_Put4d_r8(comm, q, t1_win, im, jm, km, nq, &
     931             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
     932             :                          1, im, jfirst, jfirst+ng_n-1, kfirst, klast, 1, nq,   &
     933    16193520 :                          ga_t1_s, ga_t1_r )
     934             :       endif
     935             : ! Send to north
     936    16450560 :       if ( jlast < jm ) then
     937    16193520 :         t1_win%src = gidu + 1
     938    16193520 :         t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
     939    16193520 :         t1_win%size_r = im*ng_n*(klast-kfirst+1)*nq
     940    16193520 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
     941    16193520 :         t1_win%dest = gidu + 1
     942    16193520 :         t1_win%offset_s = igonorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
     943             :         call Ga_Put4d_r8(comm, q, t1_win, im, jm, km, nq, &
     944             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
     945             :                          1, im, jlast-ng_s+1, jlast, kfirst, klast, 1, nq,     &
     946    16193520 :                          ga_t1_s, ga_t1_r )
     947             :       endif
     948             : 
     949             : #if defined( MODCM_TIMING )
     950             :       call t_stopf('mod_comm communication')
     951             : #endif
     952             : 
     953             : !EOC
     954    16450560 :       end subroutine mp_send4d_ns
     955             : !------------------------------------------------------------------------------
     956             : !------------------------------------------------------------------------------
     957             : !BOP
     958             : ! !ROUTINE: mp_recv4d_ns --- Receive 4d north/south ghost latitudes (real*8)
     959             : !
     960             : ! !INTERFACE:
     961    16450560 :       subroutine mp_recv4d_ns(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &
     962    16450560 :                               ng_s, ng_n, q)
     963             : !
     964             : ! !INPUT PARAMETERS:
     965             :       integer, intent(in):: comm      !  communicator
     966             :       integer, intent(in):: im, jm, km, nq
     967             :       integer, intent(in):: jfirst, jlast
     968             :       integer, intent(in):: kfirst, klast
     969             :       integer, intent(in):: ng_s      ! southern zones to ghost 
     970             :       integer, intent(in):: ng_n      ! northern zones to ghost 
     971             : ! !OUTPUT PARAMETERS:
     972             :       real(r8), intent(inout):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq)
     973             : !
     974             : ! !DESCRIPTION:
     975             : !
     976             : !     Receive 4d north/south ghost latitudes
     977             : !
     978             : ! !REVISION HISTORY: 
     979             : !    2001.09.01   Lin    
     980             : !    2002.02.15   Putman        Modified for Global Arrays code   
     981             : !    2002.04.09   Putman        Added ProTeX documentation
     982             : !
     983             : !EOP
     984             : !------------------------------------------------------------------------------
     985             : !BOC
     986             : !
     987             : ! !LOCAL VARIABLES:
     988             :       integer :: gidu
     989             : 
     990             : #if defined( MODCM_TIMING )
     991             :       call t_startf('mod_comm communication')
     992             : #endif
     993             : 
     994    16450560 :       call MPI_COMM_RANK (comm, gidu, ierror)
     995             : 
     996    16450560 :       call Win_Close(comm, t1_win)
     997             : 
     998             : ! Recv from south
     999    16450560 :       if ( jfirst > 1 ) then
    1000    16193520 :         t1_win%src  = gidu-1
    1001    16193520 :         t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf
    1002             :         call Ga_Get4d_r8(comm, q, t1_win, im, jm, km, nq, &
    1003             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1004             :                          1, im, jfirst-ng_s, jfirst-1,   kfirst, klast, 1, nq, &
    1005    16193520 :                          ga_t1_r  )
    1006             :       endif
    1007             : ! Recv from north
    1008    16450560 :       if ( jlast < jm ) then
    1009    16193520 :         t1_win%src  = gidu+1
    1010    16193520 :         t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf
    1011             :         call Ga_Get4d_r8(comm, q, t1_win, im, jm, km, nq, &
    1012             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1013             :                          1, im, jlast+1,     jlast+ng_n, kfirst, klast, 1, nq, &
    1014    16193520 :                          ga_t1_r  )
    1015             :       endif
    1016             : 
    1017    16450560 :       call Win_Finalize(comm, t1_win)
    1018             : 
    1019             : #if defined( MODCM_TIMING )
    1020             :       call t_stopf('mod_comm communication')
    1021             : #endif
    1022             : 
    1023             : !EOC
    1024    16450560 :       end subroutine mp_recv4d_ns
    1025             : !------------------------------------------------------------------------------
    1026             : !------------------------------------------------------------------------------
    1027             : !BOP
    1028             : ! !ROUTINE: mp_send4d_ns_r4 --- Send 4d north/south ghost latitudes (real*4)
    1029             : !
    1030             : ! !INTERFACE:
    1031           0 :       subroutine mp_send4d_ns_r4(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &
    1032           0 :                                  ng_s, ng_n, q)
    1033             : !
    1034             : ! !INPUT PARAMETERS:
    1035             :       integer, intent(in):: comm      !  communicator
    1036             :       integer, intent(in):: im, jm, km, nq
    1037             :       integer, intent(in):: jfirst, jlast
    1038             :       integer, intent(in):: kfirst, klast
    1039             :       integer, intent(in):: ng_s      ! southern zones to ghost 
    1040             :       integer, intent(in):: ng_n      ! northern zones to ghost 
    1041             :       real(r4), intent(in):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq)
    1042             : !
    1043             : ! !DESCRIPTION:
    1044             : !
    1045             : !     Send 4d north/south ghost latitudes
    1046             : !
    1047             : ! !REVISION HISTORY: 
    1048             : !    2005.03.20   Sawyer        Creation from mp_send4d_ns
    1049             : !
    1050             : !EOP
    1051             : !------------------------------------------------------------------------------
    1052             : !BOC
    1053             : !
    1054             : ! !LOCAL VARIABLES:
    1055             :       integer :: gidu
    1056             : 
    1057             : #if defined ( NOR4 )
    1058           0 :         write(iulog,*) 'Mod_comm: mp_send4d_ns_r4 - r4 windows disabled - exiting'
    1059           0 :         stop
    1060             : #endif
    1061             : 
    1062             : #if defined( MODCM_TIMING )
    1063             :       call t_startf('mod_comm communication')
    1064             : #endif
    1065             : 
    1066             :       call MPI_COMM_RANK (comm, gidu, ierror)
    1067             : 
    1068             :       call Win_Open(comm, r4_win)
    1069             : 
    1070             : ! Send to south
    1071             :       if ( jfirst > 1 ) then
    1072             :         r4_win%src = gidu - 1
    1073             :         r4_win%offset_r = ifromsouth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf
    1074             :         r4_win%size_r = im*ng_s*(klast-kfirst+1)*nq
    1075             :         call Ga_RecvInit_r4(comm, r4_win, ga_r4_r)
    1076             :         r4_win%dest = gidu - 1
    1077             :         r4_win%offset_s = igosouth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf
    1078             :         call Ga_Put4d_r4(comm, q, r4_win, im, jm, km, nq, &
    1079             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1080             :                          1, im, jfirst, jfirst+ng_n-1, kfirst, klast, 1, nq,   &
    1081             :                          ga_r4_s, ga_r4_r )
    1082             :       endif
    1083             : ! Send to north
    1084             :       if ( jlast < jm ) then
    1085             :         r4_win%src = gidu + 1
    1086             :         r4_win%offset_r = ifromnorth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf
    1087             :         r4_win%size_r = im*ng_n*(klast-kfirst+1)*nq
    1088             :         call Ga_RecvInit_r4(comm, r4_win, ga_r4_r)
    1089             :         r4_win%dest = gidu + 1
    1090             :         r4_win%offset_s = igonorth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf
    1091             :         call Ga_Put4d_r4(comm, q, r4_win, im, jm, km, nq, &
    1092             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1093             :                          1, im, jlast-ng_s+1, jlast, kfirst, klast, 1, nq,     &
    1094             :                          ga_r4_s, ga_r4_r )
    1095             :       endif
    1096             : 
    1097             : #if defined( MODCM_TIMING )
    1098             :       call t_stopf('mod_comm communication')
    1099             : #endif
    1100             : 
    1101             : !EOC
    1102             :       end subroutine mp_send4d_ns_r4
    1103             : !------------------------------------------------------------------------------
    1104             : !------------------------------------------------------------------------------
    1105             : !BOP
    1106             : ! !ROUTINE: mp_recv4d_ns_r4 --- Receive 4d north/south ghost latitudes (real*4)
    1107             : !
    1108             : ! !INTERFACE:
    1109           0 :       subroutine mp_recv4d_ns_r4(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &
    1110           0 :                               ng_s, ng_n, q)
    1111             : !
    1112             : ! !INPUT PARAMETERS:
    1113             :       integer, intent(in):: comm      !  communicator
    1114             :       integer, intent(in):: im, jm, km, nq
    1115             :       integer, intent(in):: jfirst, jlast
    1116             :       integer, intent(in):: kfirst, klast
    1117             :       integer, intent(in):: ng_s      ! southern zones to ghost 
    1118             :       integer, intent(in):: ng_n      ! northern zones to ghost 
    1119             : ! !OUTPUT PARAMETERS:
    1120             :       real(r4), intent(inout):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq)
    1121             : !
    1122             : ! !DESCRIPTION:
    1123             : !
    1124             : !     Receive 4d north/south ghost latitudes (real*4)
    1125             : !
    1126             : ! !REVISION HISTORY: 
    1127             : !    2005.03.20   Sawyer        Creation from mp_recv4d_ns
    1128             : !
    1129             : !EOP
    1130             : !------------------------------------------------------------------------------
    1131             : !BOC
    1132             : !
    1133             : ! !LOCAL VARIABLES:
    1134             :       integer :: gidu
    1135             : 
    1136             : #if defined ( NOR4 )
    1137           0 :         write(iulog,*) 'Mod_comm: mp_recv4d_ns_r4 - r4 windows disabled - exiting'
    1138           0 :         stop
    1139             : #endif
    1140             : 
    1141             : #if defined( MODCM_TIMING )
    1142             :       call t_startf('mod_comm communication')
    1143             : #endif
    1144             : 
    1145             :       call MPI_COMM_RANK (comm, gidu, ierror)
    1146             : 
    1147             :       call Win_Close(comm, r4_win)
    1148             : 
    1149             : ! Recv from south
    1150             :       if ( jfirst > 1 ) then
    1151             :         r4_win%src  = gidu-1
    1152             :         r4_win%offset_r = ifromsouth*idimsizz + (r4_win%ncall_r-1)*idimsizz*nbuf
    1153             :         call Ga_Get4d_r4(comm, q, r4_win, im, jm, km, nq, &
    1154             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1155             :                          1, im, jfirst-ng_s, jfirst-1,   kfirst, klast, 1, nq, &
    1156             :                          ga_r4_r  )
    1157             :       endif
    1158             : ! Recv from north
    1159             :       if ( jlast < jm ) then
    1160             :         r4_win%src  = gidu+1
    1161             :         r4_win%offset_r = ifromnorth*idimsizz + (r4_win%ncall_r-1)*idimsizz*nbuf
    1162             :         call Ga_Get4d_r4(comm, q, r4_win, im, jm, km, nq, &
    1163             :                          1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, &
    1164             :                          1, im, jlast+1,     jlast+ng_n, kfirst, klast, 1, nq, &
    1165             :                          ga_r4_r  )
    1166             :       endif
    1167             : 
    1168             :       call Win_Finalize(comm, r4_win)
    1169             : 
    1170             : #if defined( MODCM_TIMING )
    1171             :       call t_stopf('mod_comm communication')
    1172             : #endif
    1173             : 
    1174             : !EOC
    1175             :       end subroutine mp_recv4d_ns_r4
    1176             : !------------------------------------------------------------------------------
    1177             : !------------------------------------------------------------------------------
    1178             : !BOP
    1179             : ! !ROUTINE: mp_send2_ns --- Send 2 variables north/south ghost latitudes
    1180             : !
    1181             : ! !INTERFACE:
    1182      129024 :       subroutine mp_send2_ns(comm, im, jm, km, jfirst, jlast, kfirst, klast, &
    1183      129024 :                              nd, q1, q2)
    1184             : !
    1185             : ! !INPUT PARAMETERS:
    1186             :       integer, intent(in):: comm      !  communicator
    1187             :       integer, intent(in):: im, jm, km
    1188             :       integer, intent(in):: jfirst, jlast
    1189             :       integer, intent(in):: kfirst, klast !careful: klast might be klast+1
    1190             :       integer, intent(in):: nd
    1191             :       real(r8), intent(in):: q1(im,jfirst-nd:jlast+nd,kfirst:klast) 
    1192             :       real(r8), intent(in):: q2(im,jfirst-nd:jlast+nd,kfirst:klast) 
    1193             : !
    1194             : ! !DESCRIPTION:
    1195             : !
    1196             : !     Send 2 variables north/south ghost latitudes
    1197             : !
    1198             : ! !REVISION HISTORY: 
    1199             : !    2001.09.01   Lin    
    1200             : !    2002.02.15   Putman        Modified for Global Arrays code   
    1201             : !    2002.04.09   Putman        Added ProTeX documentation
    1202             : !
    1203             : !EOP
    1204             : !------------------------------------------------------------------------------
    1205             : !BOC
    1206             : !
    1207             : ! !LOCAL VARIABLES:
    1208             :       integer :: gidu
    1209             : 
    1210             : #if defined( MODCM_TIMING )
    1211             :       call t_startf('mod_comm communication')
    1212             : #endif
    1213             : 
    1214      129024 :       call MPI_COMM_RANK (comm, gidu, ierror)
    1215             : 
    1216      129024 :       call Win_Open(comm, t1_win)
    1217             : 
    1218             : ! Send to south
    1219      129024 :       if ( jfirst > 1 ) then
    1220      127008 :         t1_win%src  = gidu - 1
    1221      127008 :         t1_win%size_r = im*(klast-kfirst+1)
    1222      127008 :         t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
    1223      127008 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1224      127008 :         t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1)
    1225      127008 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1226      127008 :         t1_win%dest = gidu - 1
    1227      127008 :         t1_win%offset_s = igosouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
    1228             :         call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, &
    1229             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, &
    1230             :                           1, im, jfirst,    jfirst,   kfirst, klast, 1, 1, &
    1231      127008 :                           ga_t1_s, ga_t1_r  )
    1232      127008 :         t1_win%offset_s = t1_win%offset_s + im*(klast-kfirst+1)
    1233             :         call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, &
    1234             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, &
    1235             :                           1, im, jfirst,    jfirst,   kfirst, klast, 2, 2, &
    1236      127008 :                           ga_t1_s, ga_t1_r  )
    1237             :       endif
    1238             : ! Send to north
    1239      129024 :       if ( jlast < jm ) then
    1240      127008 :         t1_win%src  = gidu + 1
    1241      127008 :         t1_win%size_r = im*(klast-kfirst+1)
    1242      127008 :         t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
    1243      127008 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1244      127008 :         t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1)
    1245      127008 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1246      127008 :         t1_win%dest = gidu + 1
    1247      127008 :         t1_win%offset_s = igonorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf
    1248             :         call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, &
    1249             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, &
    1250             :                           1, im, jlast,     jlast,    kfirst, klast, 1, 1, &
    1251      127008 :                           ga_t1_s, ga_t1_r  )
    1252      127008 :         t1_win%offset_s = t1_win%offset_s + im*(klast-kfirst+1)
    1253             :         call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, &
    1254             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, &
    1255             :                           1, im, jlast,     jlast,    kfirst, klast, 2, 2, &
    1256      127008 :                           ga_t1_s, ga_t1_r  )
    1257             :       endif
    1258             : 
    1259             : #if defined( MODCM_TIMING )
    1260             :       call t_stopf('mod_comm communication')
    1261             : #endif
    1262             : 
    1263             : !EOC
    1264      129024 :       end subroutine mp_send2_ns
    1265             : !------------------------------------------------------------------------------
    1266             : !------------------------------------------------------------------------------
    1267             : !BOP
    1268             : ! !ROUTINE: mp_recv2_ns --- Receive 2 variables north/south ghost latitudes
    1269             : !
    1270             : ! !INTERFACE:
    1271      129024 :       subroutine mp_recv2_ns(comm, im, jm, km, jfirst, jlast, kfirst, klast, &
    1272      129024 :                              nd, q1, q2)
    1273             : !
    1274             : ! !INPUT PARAMETERS:
    1275             :       integer, intent(in):: comm      !  communicator
    1276             :       integer, intent(in):: im, jm, km
    1277             :       integer, intent(in):: jfirst, jlast
    1278             :       integer, intent(in):: kfirst, klast !careful: klast might be klast+1
    1279             :       integer, intent(in):: nd
    1280             : ! !OUTPUT PARAMETERS:
    1281             :       real(r8), intent(inout):: q1(im,jfirst-nd:jlast+nd,kfirst:klast) 
    1282             :       real(r8), intent(inout):: q2(im,jfirst-nd:jlast+nd,kfirst:klast)
    1283             : !
    1284             : ! !DESCRIPTION:
    1285             : !
    1286             : !     Receive 2 variables north/south ghost latitudes
    1287             : !
    1288             : ! !REVISION HISTORY: 
    1289             : !    2001.09.01   Lin    
    1290             : !    2002.02.15   Putman        Modified for Global Arrays code   
    1291             : !    2002.04.09   Putman        Added ProTeX documentation
    1292             : !
    1293             : !EOP
    1294             : !------------------------------------------------------------------------------
    1295             : !BOC
    1296             : ! !LOCAL VARIABLES:
    1297             :       integer j
    1298             :       integer :: gidu
    1299             : 
    1300             : #if defined( MODCM_TIMING )
    1301             :       call t_startf('mod_comm communication')
    1302             : #endif
    1303             : 
    1304      129024 :       call MPI_COMM_RANK (comm, gidu, ierror)
    1305             : 
    1306      129024 :       call Win_Close(comm, t1_win)
    1307             : 
    1308             : ! Recv from south
    1309      129024 :       if ( jfirst > 1 ) then
    1310      127008 :         j = jfirst - 1
    1311      127008 :         t1_win%src  = gidu - 1
    1312      127008 :         t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf
    1313             :         call Ga_Get4d_r8( comm, q1, t1_win, im, jm, km, 2, &
    1314             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, &
    1315             :                           1, im, j,         j,        kfirst, klast, 1, 1, &
    1316      127008 :                           ga_t1_r  )
    1317      127008 :         t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1)
    1318             :         call Ga_Get4d_r8( comm, q2, t1_win, im, jm, km, 2, &
    1319             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, &
    1320             :                           1, im, j,         j,        kfirst, klast, 2, 2, &
    1321      127008 :                           ga_t1_r  )
    1322             :       endif
    1323             : ! Recv from north
    1324      129024 :       if ( jlast < jm ) then
    1325      127008 :         j = jlast + 1
    1326      127008 :         t1_win%src  = gidu + 1
    1327      127008 :         t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf
    1328             :         call Ga_Get4d_r8( comm, q1, t1_win, im, jm, km, 2, & 
    1329             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, &
    1330             :                           1, im, j,         j,        kfirst, klast, 1, 1, &
    1331      127008 :                           ga_t1_r  )
    1332      127008 :         t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1)
    1333             :         call Ga_Get4d_r8( comm, q2, t1_win, im, jm, km, 2, &
    1334             :                           1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, &
    1335             :                           1, im, j,         j,        kfirst, klast, 2, 2, &
    1336      127008 :                           ga_t1_r  )
    1337             :       endif
    1338             : 
    1339      129024 :       call Win_Finalize(comm, t1_win)
    1340             : 
    1341             : #if defined( MODCM_TIMING )
    1342             :       call t_stopf('mod_comm communication')
    1343             : #endif
    1344             : 
    1345             : !EOC
    1346      129024 :       end subroutine mp_recv2_ns
    1347             : !------------------------------------------------------------------------------
    1348             : !------------------------------------------------------------------------------
    1349             : !BOP
    1350             : ! !ROUTINE: mp_send3d --- Send ghost region
    1351             : !
    1352             : ! !INTERFACE:
    1353      738816 :       subroutine mp_send3d(comm, dest, src, im, jm, km, if, il, jf, jl, kf, kl, &
    1354      738816 :                                                   i1, i2, j1, j2, k1, k2, q)
    1355             : !
    1356             : ! !INPUT PARAMETERS:
    1357             :       integer, intent(in):: comm      !  communicator
    1358             :       integer, intent(in):: dest, src
    1359             :       integer, intent(in):: im, jm, km
    1360             :       integer, intent(in):: if, il, jf, jl, kf, kl 
    1361             :       integer, intent(in):: i1, i2, j1, j2, k1, k2
    1362             :       real(r8), intent(in):: q(if:il, jf:jl, kf:kl)
    1363             : !
    1364             : ! !DESCRIPTION:
    1365             : !
    1366             : !     Send a general 3d real*8 ghost region
    1367             : !
    1368             : ! !REVISION HISTORY:
    1369             : !    02.04.15   Putman  
    1370             : !
    1371             : !EOP
    1372             : !------------------------------------------------------------------------------
    1373             : !BOC
    1374             : 
    1375             : #if defined( MODCM_TIMING )
    1376             :       call t_startf('mod_comm communication')
    1377             : #endif
    1378             : 
    1379      738816 :       call MPI_COMM_SIZE (comm, numcomm, ierror)
    1380             : 
    1381      738816 :       call Win_Open(comm, t1_win)
    1382             : 
    1383             : ! Init Recv src
    1384      738816 :       if ( src >= 0 .and. src < numcomm ) then     ! is PE in valid range?
    1385      730776 :         t1_win%src = src
    1386      730776 :         t1_win%size_r = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)  ! chunk size
    1387      730776 :         t1_win%offset_r = (t1_win%ncall_s-1)*idimsizz*nbuf
    1388      730776 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1389             :       endif
    1390             : ! Send ghost region
    1391      738816 :       if ( dest >= 0 .and. dest < numcomm ) then
    1392      730776 :         t1_win%dest = dest
    1393      730776 :         t1_win%offset_s = (t1_win%ncall_s-1)*idimsizz*nbuf
    1394             :         call Ga_Put4d_r8( comm, q, t1_win, im, jm, km, 1, &
    1395             :                           if, il, jf, jl, kf, kl, 1, 1,  &
    1396      730776 :                           i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_s, ga_t1_r  )
    1397             :       endif
    1398             : 
    1399             : #if defined( MODCM_TIMING )
    1400             :       call t_stopf('mod_comm communication')
    1401             : #endif
    1402             : 
    1403             : !EOC
    1404      738816 :       end subroutine mp_send3d
    1405             : !------------------------------------------------------------------------------
    1406             : !------------------------------------------------------------------------------
    1407             : !BOP
    1408             : ! !ROUTINE: mp_recv3d --- Recv ghost region
    1409             : !
    1410             : ! !INTERFACE:
    1411      738816 :       subroutine mp_recv3d(comm, src, im, jm, km, if, il, jf, jl, kf, kl, &
    1412      738816 :                                                 i1, i2, j1, j2, k1, k2, qout)
    1413             : !
    1414             : ! !INPUT PARAMETERS:
    1415             :       integer, intent(in):: comm      !  communicator
    1416             :       integer, intent(in):: src
    1417             :       integer, intent(in):: im, jm, km
    1418             :       integer, intent(in):: if, il, jf, jl, kf, kl 
    1419             :       integer, intent(in):: i1, i2, j1, j2, k1, k2
    1420             : ! !OUTPUT PARAMETERS:
    1421             :       real(r8), intent(inout):: qout(if:il, jf:jl, kf:kl)
    1422             : !
    1423             : ! !DESCRIPTION:
    1424             : !
    1425             : !     Recv a general 3d real*8 ghost region
    1426             : !
    1427             : ! !REVISION HISTORY:
    1428             : !    02.04.15   Putman  
    1429             : !
    1430             : !EOP
    1431             : !------------------------------------------------------------------------------
    1432             : !BOC
    1433             : 
    1434             : #if defined( MODCM_TIMING )
    1435             :       call t_startf('mod_comm communication')
    1436             : #endif
    1437             : 
    1438      738816 :       call MPI_COMM_SIZE (comm, numcomm, ierror)
    1439             : 
    1440      738816 :       call Win_Close(comm, t1_win)
    1441             : 
    1442             : ! Recv from src
    1443      738816 :       if ( src >= 0 .and. src < numcomm ) then     ! is PE in valid range?
    1444      730776 :         t1_win%src  = src
    1445      730776 :         t1_win%offset_r = (t1_win%ncall_r-1)*idimsizz*nbuf
    1446             :         call Ga_Get4d_r8( comm, qout, t1_win, im, jm, km, 1, &
    1447             :                           if, il, jf, jl, kf, kl, 1, 1,   &
    1448      730776 :                           i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_r  )
    1449             :       endif
    1450             : 
    1451      738816 :       call Win_Finalize(comm, t1_win)
    1452             : 
    1453             : #if defined( MODCM_TIMING )
    1454             :       call t_stopf('mod_comm communication')
    1455             : #endif
    1456             : 
    1457             : !EOC
    1458      738816 :       end subroutine mp_recv3d
    1459             : !------------------------------------------------------------------------------
    1460             : !------------------------------------------------------------------------------
    1461             : !BOP
    1462             : ! !ROUTINE: mp_send3d_2 --- Send 2 ghost regions
    1463             : !
    1464             : ! !INTERFACE:
    1465      161280 :       subroutine mp_send3d_2(comm, dest, src, im, jm, km, if, il, jf, jl, kf, kl, &
    1466      161280 :                                         i1, i2, j1, j2, k1, k2, q1, q2)
    1467             : !
    1468             : ! !INPUT PARAMETERS:
    1469             :       integer, intent(in):: comm      !  communicator
    1470             :       integer, intent(in):: dest, src
    1471             :       integer, intent(in):: im, jm, km
    1472             :       integer, intent(in):: if, il, jf, jl, kf, kl
    1473             :       integer, intent(in):: i1, i2, j1, j2, k1, k2
    1474             :       real(r8), intent(in):: q1(if:il, jf:jl, kf:kl)
    1475             :       real(r8), intent(in):: q2(if:il, jf:jl, kf:kl)
    1476             : !
    1477             : ! !DESCRIPTION:
    1478             : !
    1479             : !     Send two general 3d real*8 ghost region
    1480             : !
    1481             : ! !REVISION HISTORY:
    1482             : !    02.04.15   Putman
    1483             : !
    1484             : !EOP
    1485             : !------------------------------------------------------------------------------
    1486             : !BOC
    1487             : 
    1488             : #if defined( MODCM_TIMING )
    1489             :       call t_startf('mod_comm communication')
    1490             : #endif
    1491             : 
    1492      161280 :       call MPI_COMM_SIZE (comm, numcomm, ierror)
    1493             : 
    1494      161280 :       call Win_Open(comm, t1_win)
    1495             : 
    1496             : ! Init Recv src
    1497      161280 :       if ( src >= 0 .and. src < numcomm ) then     ! is PE in valid range?
    1498      158760 :         t1_win%src = src
    1499      158760 :         t1_win%size_r = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)  ! chunk size
    1500      158760 :         t1_win%offset_r = (t1_win%ncall_s-1)*idimsizz*nbuf
    1501      158760 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1502      158760 :         t1_win%offset_r = t1_win%offset_r + t1_win%size_r 
    1503      158760 :         call Ga_RecvInit_r8(comm, t1_win, ga_t1_r)
    1504             :       endif
    1505             : ! Send ghost region
    1506      161280 :       if ( dest >= 0 .and. dest < numcomm ) then
    1507      158760 :         t1_win%dest = dest
    1508      158760 :         t1_win%offset_s = (t1_win%ncall_s-1)*idimsizz*nbuf
    1509             :         call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, &
    1510             :                           if, il, jf, jl, kf, kl, 1, 1,  &
    1511      158760 :                           i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_s, ga_t1_r  )
    1512      158760 :         t1_win%offset_s = t1_win%offset_s + (i2-i1+1)*(j2-j1+1)*(k2-k1+1)
    1513             :         call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, &
    1514             :                           if, il, jf, jl, kf, kl, 2, 2,  &
    1515      158760 :                           i1, i2, j1, j2, k1, k2, 2, 2, ga_t1_s, ga_t1_r  )
    1516             :       endif
    1517             : 
    1518             : #if defined( MODCM_TIMING )
    1519             :       call t_stopf('mod_comm communication')
    1520             : #endif
    1521             : 
    1522             : !EOC
    1523      161280 :       end subroutine mp_send3d_2
    1524             : !------------------------------------------------------------------------------
    1525             : !------------------------------------------------------------------------------
    1526             : !BOP
    1527             : ! !ROUTINE: mp_recv3d_2 --- Recv 2 ghost regions
    1528             : !
    1529             : ! !INTERFACE:
    1530      161280 :       subroutine mp_recv3d_2(comm, src, im, jm, km, if, il, jf, jl, kf, kl, &
    1531      161280 :                                   i1, i2, j1, j2, k1, k2, qout1, qout2)
    1532             : !
    1533             : ! !INPUT PARAMETERS:
    1534             :       integer, intent(in):: comm      !  communicator
    1535             :       integer, intent(in):: src
    1536             :       integer, intent(in):: im, jm, km
    1537             :       integer, intent(in):: if, il, jf, jl, kf, kl
    1538             :       integer, intent(in):: i1, i2, j1, j2, k1, k2
    1539             : ! !OUTPUT PARAMETERS:
    1540             :       real(r8), intent(inout):: qout1(if:il, jf:jl, kf:kl)
    1541             :       real(r8), intent(inout):: qout2(if:il, jf:jl, kf:kl)
    1542             : !
    1543             : ! !DESCRIPTION:
    1544             : !
    1545             : !     Recv two general 3d real*8 ghost regions
    1546             : !
    1547             : ! !REVISION HISTORY:
    1548             : !    02.04.15   Putman
    1549             : !
    1550             : !EOP
    1551             : !------------------------------------------------------------------------------
    1552             : !BOC
    1553             : 
    1554             : #if defined( MODCM_TIMING )
    1555             :       call t_startf('mod_comm communication')
    1556             : #endif
    1557             : 
    1558      161280 :       call MPI_COMM_SIZE (comm, numcomm, ierror)
    1559             : 
    1560      161280 :       call Win_Close(comm, t1_win)
    1561             : 
    1562             : ! Recv from src
    1563      161280 :       if ( src >= 0 .and. src < numcomm ) then     ! is PE in valid range?
    1564      158760 :         t1_win%src  = src
    1565      158760 :         t1_win%offset_r = (t1_win%ncall_r-1)*idimsizz*nbuf
    1566             :         call Ga_Get4d_r8( comm, qout1, t1_win, im, jm, km, 2, &
    1567             :                           if, il, jf, jl, kf, kl, 1, 1,   &
    1568      158760 :                           i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_r  )
    1569      158760 :         t1_win%offset_r = t1_win%offset_r + (i2-i1+1)*(j2-j1+1)*(k2-k1+1)
    1570             :         call Ga_Get4d_r8( comm, qout2, t1_win, im, jm, km, 2, &
    1571             :                           if, il, jf, jl, kf, kl, 2, 2,   &
    1572      158760 :                           i1, i2, j1, j2, k1, k2, 2, 2, ga_t1_r  )
    1573             :       endif
    1574             : 
    1575      161280 :       call Win_Finalize(comm, t1_win)
    1576             : 
    1577             : #if defined( MODCM_TIMING )
    1578             :       call t_stopf('mod_comm communication')
    1579             : #endif
    1580             : 
    1581             : !EOC
    1582      161280 :       end subroutine mp_recv3d_2
    1583             : !------------------------------------------------------------------------------
    1584             : !------------------------------------------------------------------------------
    1585             : !BOP
    1586             : ! !ROUTINE: mp_barrier --- Synchronize all SPMD processes
    1587             : !
    1588             : ! !INTERFACE:
    1589           0 :       subroutine mp_barrier (comm)
    1590             : !
    1591             : ! !INPUT PARAMETERS:
    1592             :       integer, intent(in) :: comm      !  communicator
    1593             : ! !DESCRIPTION:
    1594             : !
    1595             : !     Synchronize all SPMD processes
    1596             : !
    1597             : ! !REVISION HISTORY: 
    1598             : !    2001.09.01   Lin    
    1599             : !    2002.04.09   Putman        Added ProTeX documentation
    1600             : !
    1601             : !EOP
    1602             : !------------------------------------------------------------------------------
    1603             : !BOC
    1604             : 
    1605           0 :         call MPI_BARRIER(comm, ierror)
    1606             : 
    1607             : !EOC
    1608           0 :       end subroutine mp_barrier
    1609             : !------------------------------------------------------------------------------
    1610             : !------------------------------------------------------------------------------
    1611             : !BOP
    1612             : ! !ROUTINE: Win_Open --- Open a communication window
    1613             : !
    1614             : ! !INTERFACE:
    1615    17479680 :       subroutine Win_Open(comm, win)
    1616             : ! !INPUT PARAMETERS:
    1617             :       integer, intent(in) :: comm      !  communicator
    1618             : ! !OUTPUT PARAMETERS:
    1619             :       type(window), intent(inout):: win
    1620             : !
    1621             : ! !DESCRIPTION:
    1622             : !
    1623             : !     Begin a communication epoch, by opening a comm window.
    1624             : !     Update number of send calls on the window (win%ncall_s).
    1625             : !     Barrier synchronzize if necessary.
    1626             : !
    1627             : ! !REVISION HISTORY: 
    1628             : !    02.02.15   Putman
    1629             : !    02.04.09   Putman        Added ProTeX documentation
    1630             : !
    1631             : !EOP
    1632             : !------------------------------------------------------------------------------
    1633             : !BOC
    1634             : 
    1635    17479680 :       win%ncall_s = win%ncall_s + 1
    1636             : 
    1637             : !EOC
    1638    17479680 :       end subroutine Win_Open
    1639             : !------------------------------------------------------------------------------
    1640             : !------------------------------------------------------------------------------
    1641             : !BOP
    1642             : ! !ROUTINE: Win_Close --- Close a communication window
    1643             : !
    1644             : ! !INTERFACE:
    1645    17479680 :       subroutine Win_Close(comm, win)
    1646             : ! !INPUT PARAMETERS:
    1647             :       integer, intent(in) :: comm      !  communicator
    1648             : ! !OUTPUT PARAMETERS:
    1649             :       type(window), intent(inout):: win
    1650             : !
    1651             : ! !DESCRIPTION:
    1652             : !
    1653             : !     End a communication epoch, by closing a comm window.
    1654             : !     Update number of receive calls on the window (win%ncall_r).
    1655             : !     Barrier synchronzize if necessary.
    1656             : !
    1657             : ! !REVISION HISTORY: 
    1658             : !    02.02.15   Putman
    1659             : !    02.04.09   Putman        Added ProTeX documentation
    1660             : !
    1661             : !EOP
    1662             : !------------------------------------------------------------------------------
    1663             : !BOC
    1664             : 
    1665    17479680 :       win%ncall_r = win%ncall_r + 1
    1666             : 
    1667             : !EOC
    1668    17479680 :       end subroutine Win_Close
    1669             : !------------------------------------------------------------------------------
    1670             : !------------------------------------------------------------------------------
    1671             : !BOP
    1672             : ! !ROUTINE: Win_Finalize --- Reset a communication window after a comm epoch.
    1673             : !
    1674             : ! !INTERFACE:
    1675    17479680 :       subroutine Win_Finalize(comm, win)
    1676             : ! !INPUT PARAMETERS:
    1677             :       integer, intent(in) :: comm      !  communicator
    1678             : ! !OUTPUT PARAMETERS:
    1679             :       type(window), intent(inout):: win
    1680             : !
    1681             : ! !DESCRIPTION:
    1682             : !
    1683             : !     Complete a communication epoch and reset a comm window.
    1684             : !     Barrier synchronzize if necessary.
    1685             : !
    1686             : ! !REVISION HISTORY:
    1687             : !    02.02.15   Putman
    1688             : !    02.04.09   Putman        Added ProTeX documentation
    1689             : !
    1690             : !EOP
    1691             : !------------------------------------------------------------------------------
    1692             : !BOC
    1693             : 
    1694    17479680 :       if (win%ncall_s == win%ncall_r) then
    1695    17060352 :         call MPI_WAITALL(win%nsend, win%sqest, Stats, ierror)
    1696    17060352 :         win%nsend = 0
    1697    17060352 :         win%nrecv = 0
    1698    17060352 :         win%nread = 0
    1699    17060352 :         win%ncall_s = 0
    1700    17060352 :         win%ncall_r = 0
    1701             :       endif
    1702             : 
    1703             : !EOC
    1704    17479680 :       end subroutine Win_Finalize
    1705             : !------------------------------------------------------------------------------
    1706             : !------------------------------------------------------------------------------
    1707             : !BOP
    1708             : ! !ROUTINE: Ga_Put4d_r8 --- Write to real*8 4d global array
    1709             : !
    1710             : ! !INTERFACE:
    1711    33943368 :       subroutine Ga_Put4d_r8 ( comm, q, win, im, jm, km, nq, &
    1712             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    1713             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    1714    33943368 :                                   nq1, nq2, ga_s, ga_r )
    1715             : ! !INPUT PARAMETERS:
    1716             :       integer, intent(in) :: comm      !  communicator
    1717             :       type(window), intent(inout)  :: win           ! Global Array Window
    1718             :       integer, intent(in)  :: im, jm, km, nq
    1719             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    1720             :       real(r8), intent(in)  :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto)
    1721             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    1722             : ! !OUTPUT PARAMETERS:
    1723             :       real(r8), intent(inout):: ga_s(win%size)
    1724             :       real(r8), intent(inout):: ga_r(win%size)
    1725             : !
    1726             : ! !DESCRIPTION:
    1727             : !
    1728             : !     Write to real*8 4d global array.
    1729             : !
    1730             : ! !REVISION HISTORY: 
    1731             : !    02.02.15   Putman
    1732             : !    02.04.09   Putman        Added ProTeX documentation
    1733             : !
    1734             : !EOP
    1735             : !------------------------------------------------------------------------------
    1736             : !BOC
    1737             : !
    1738             : ! !LOCAL VARIABLES:
    1739             :       integer i, j, k, iq, inc, inc1
    1740             :       integer i_length, j_length, k_length, ij_length, ijk_length
    1741             : 
    1742             :       integer send_tag, qsize
    1743             :       integer :: gidu
    1744             : 
    1745    33943368 :       call MPI_COMM_RANK (comm, gidu, ierror)
    1746             : 
    1747    33943368 :       i_length   = i2-i1+1
    1748    33943368 :       j_length   = j2-j1+1
    1749    33943368 :       k_length   = k2-k1+1
    1750             : 
    1751    33943368 :       ij_length  = i_length*j_length 
    1752    33943368 :       ijk_length = i_length*j_length*k_length
    1753             : 
    1754             : ! Begin Non-Blocking Sends
    1755    67886736 :         do iq = nq1, nq2
    1756             : !$omp parallel do private(i,j,k,inc,inc1)
    1757   163617684 :           do k = k1, k2
    1758             :                 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) &
    1759    95730948 :                        + ((k-k1)*ij_length) -i1+1
    1760   420148380 :             do j = j1, j2
    1761   290474064 :                 inc = inc1 + (j-j1)*i_length
    1762 75962966292 :               do i = i1, i2
    1763 75867235344 :                 ga_s(inc+i) = q(i,j,k,iq)
    1764             :               enddo
    1765             :             enddo
    1766             :           enddo
    1767             :         enddo
    1768             : 
    1769    33943368 :       qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1)
    1770    33943368 :       send_tag = gidu
    1771    33943368 :       win%nsend = win%nsend + 1
    1772    33943368 :       call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_r8, win%dest, &
    1773    67886736 :                      send_tag, comm, win%sqest(win%nsend), ierror)
    1774             : 
    1775             : !EOC
    1776    33943368 :       end subroutine Ga_Put4d_r8
    1777             : !------------------------------------------------------------------------------
    1778             : !------------------------------------------------------------------------------
    1779             : !BOP
    1780             : ! !ROUTINE: Ga_RecvInit_r8 --- Initiate real*8 Non-Blocking receive
    1781             : !
    1782             : ! !INTERFACE:
    1783    33943368 :       subroutine Ga_RecvInit_r8( comm, win, ga )
    1784             : ! !INPUT PARAMETERS:
    1785             :       integer, intent(in) :: comm      !  communicator
    1786             :       type(window), intent(inout)    :: win           ! Global Array Window
    1787             : ! !OUTPUT PARAMETERS:
    1788             :       real(r8), intent(inout):: ga(win%size)
    1789             : !
    1790             : ! !DESCRIPTION:
    1791             : !
    1792             : !     Initiate real*8 Non-Blocking receive
    1793             : !
    1794             : ! !REVISION HISTORY: 
    1795             : !    02.02.15   Putman
    1796             : !    02.04.09   Putman        Added ProTeX documentation
    1797             : !    03.06.06   Sawyer        Added else clause
    1798             : !
    1799             : !EOP
    1800             : !------------------------------------------------------------------------------
    1801             : !BOC
    1802             : !
    1803             : ! !LOCAL VARIABLES:
    1804             :       integer qsize, recv_tag
    1805             : 
    1806    33943368 :       if (win%size >= win%offset_r + win%size_r) then
    1807    33943368 :         recv_tag = win%src
    1808    33943368 :         qsize    = win%size_r
    1809    33943368 :         win%nrecv    = win%nrecv + 1
    1810    33943368 :         call MPI_IRECV(ga(win%offset_r+1), qsize, mp_r8, win%src, &
    1811    67886736 :                        recv_tag, comm, win%rqest(win%nrecv), ierror)
    1812             :       else
    1813           0 :         write(iulog,*) "Fatal ga_recvinit_r8: receive window out of space - exiting"
    1814           0 :         write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid,  &
    1815           0 :                   win%size, win%offset_r, win%size_r
    1816           0 :         stop
    1817             :       endif
    1818             : 
    1819             : !EOC
    1820    33943368 :       end subroutine Ga_RecvInit_r8
    1821             : !------------------------------------------------------------------------------
    1822             : !------------------------------------------------------------------------------
    1823             : !BOP
    1824             : ! !ROUTINE: Ga_Get4d_r8 --- Read from real*8 4d global array
    1825             : !
    1826             : ! !INTERFACE:
    1827    67886736 :       subroutine Ga_Get4d_r8 ( comm, q, win, im, jm, km, nq, &
    1828             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    1829             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    1830    33943368 :                                   nq1, nq2, ga )
    1831             : ! !INPUT PARAMETERS:
    1832             :       integer, intent(in) :: comm      !  communicator
    1833             :       type(window), intent(inout)    :: win           ! Global Array Window
    1834             :       integer, intent(in)  :: im, jm, km, nq
    1835             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    1836             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    1837             :       real(r8), intent(in)  :: ga(win%size)
    1838             : ! !OUTPUT PARAMETERS:
    1839             :       real(r8), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto)
    1840             : !
    1841             : ! !DESCRIPTION:
    1842             : !
    1843             : !     Read from real*8 4d global array.
    1844             : !
    1845             : ! !REVISION HISTORY: 
    1846             : !    02.02.15   Putman
    1847             : !    02.04.09   Putman        Added ProTeX documentation
    1848             : !
    1849             : !EOP
    1850             : !------------------------------------------------------------------------------
    1851             : !BOC
    1852             : !
    1853             : ! !LOCAL VARIABLES:
    1854             :       integer i, j, k, iq, inc, inc1
    1855             :       integer i_length, j_length, k_length, ij_length, ijk_length
    1856    33943368 :       win%nread = win%nread + 1
    1857    33943368 :       call MPI_WAIT(win%rqest(win%nread), Status, ierror)
    1858             : 
    1859    33943368 :       i_length   = i2-i1+1
    1860    33943368 :       j_length   = j2-j1+1
    1861    33943368 :       k_length   = k2-k1+1
    1862             : 
    1863    33943368 :       ij_length  = i_length*j_length
    1864    33943368 :       ijk_length = i_length*j_length*k_length
    1865             : 
    1866    67886736 :         do iq = nq1, nq2
    1867             : !$omp parallel do private(i,j,k,inc,inc1)
    1868   163617684 :           do k = k1, k2
    1869             :                 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) &
    1870    95730948 :                        + ((k-k1)*ij_length) -i1+1
    1871   420148380 :             do j = j1, j2
    1872   290474064 :                 inc = inc1 + (j-j1)*i_length
    1873 75962966292 :               do i = i1, i2
    1874 75867235344 :                 q(i,j,k,iq) = ga(inc+i)
    1875             :               enddo
    1876             :             enddo
    1877             :           enddo
    1878             :         enddo
    1879             : 
    1880             : !EOC
    1881    33943368 :       end subroutine Ga_Get4d_r8
    1882             : !------------------------------------------------------------------------------
    1883             : !------------------------------------------------------------------------------
    1884             : !BOP
    1885             : ! !ROUTINE: Ga_Put4d_r4 --- Write to real*4 4d global array
    1886             : !
    1887             : ! !INTERFACE:
    1888           0 :       subroutine Ga_Put4d_r4 ( comm, q, win, im, jm, km, nq, &
    1889             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    1890             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    1891           0 :                                   nq1, nq2, ga_s, ga_r )
    1892             : ! !INPUT PARAMETERS:
    1893             :       integer, intent(in) :: comm      !  communicator
    1894             :       type(window), intent(inout)  :: win           ! Global Array Window
    1895             :       integer, intent(in)  :: im, jm, km, nq
    1896             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    1897             :       real(r4), intent(in)  :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto)
    1898             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    1899             : ! !OUTPUT PARAMETERS:
    1900             :       real(r4), intent(inout):: ga_s(win%size)
    1901             :       real(r4), intent(inout):: ga_r(win%size)
    1902             : !
    1903             : ! !DESCRIPTION:
    1904             : !
    1905             : !     Write to real*4 4d global array.
    1906             : !
    1907             : ! !REVISION HISTORY: 
    1908             : !    02.02.15   Putman
    1909             : !    02.04.09   Putman        Added ProTeX documentation
    1910             : !
    1911             : !EOP
    1912             : !------------------------------------------------------------------------------
    1913             : !BOC
    1914             : !
    1915             : ! !LOCAL VARIABLES:
    1916             :       integer i, j, k, iq, inc, inc1
    1917             :       integer i_length, j_length, k_length, ij_length, ijk_length
    1918             : 
    1919             :       integer send_tag, qsize
    1920             :       integer :: gidu
    1921             : 
    1922           0 :       call MPI_COMM_RANK (comm, gidu, ierror)
    1923             : 
    1924             : #if defined ( NOR4 )
    1925           0 :         write(iulog,*) 'Mod_comm: Ga_Put4d_r4 - r4 windows disabled - exiting'
    1926           0 :         stop
    1927             : #endif
    1928             : 
    1929             :       i_length   = i2-i1+1
    1930             :       j_length   = j2-j1+1
    1931             :       k_length   = k2-k1+1
    1932             : 
    1933             :       ij_length  = i_length*j_length 
    1934             :       ijk_length = i_length*j_length*k_length
    1935             : 
    1936             : ! Begin Non-Blocking Sends
    1937             :         do iq = nq1, nq2
    1938             : !$omp parallel do private(i,j,k,inc,inc1)
    1939             :           do k = k1, k2
    1940             :                 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) &
    1941             :                        + ((k-k1)*ij_length) -i1+1
    1942             :             do j = j1, j2
    1943             :                 inc = inc1 + (j-j1)*i_length
    1944             :               do i = i1, i2
    1945             :                 ga_s(inc+i) = q(i,j,k,iq)
    1946             :               enddo
    1947             :             enddo
    1948             :           enddo
    1949             :         enddo
    1950             : 
    1951             :       qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1)
    1952             :       send_tag = gidu
    1953             :       win%nsend = win%nsend + 1
    1954             :       call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_r4, win%dest, &
    1955             :                      send_tag, comm, win%sqest(win%nsend), ierror)
    1956             : 
    1957             : !EOC
    1958             :       end subroutine Ga_Put4d_r4
    1959             : !------------------------------------------------------------------------------
    1960             : !------------------------------------------------------------------------------
    1961             : !BOP
    1962             : ! !ROUTINE: Ga_RecvInit_r4 --- Initiate real*4 Non-Blocking receive
    1963             : !
    1964             : ! !INTERFACE:
    1965           0 :       subroutine Ga_RecvInit_r4( comm, win, ga )
    1966             : ! !INPUT PARAMETERS:
    1967             :       integer, intent(in) :: comm      !  communicator
    1968             :       type(window), intent(inout)    :: win           ! Global Array Window
    1969             : ! !OUTPUT PARAMETERS:
    1970             :       real(r4), intent(inout):: ga(win%size)
    1971             : !
    1972             : ! !DESCRIPTION:
    1973             : !
    1974             : !     Initiate real*8 Non-Blocking receive
    1975             : !
    1976             : ! !REVISION HISTORY: 
    1977             : !    02.02.15   Putman
    1978             : !    02.04.09   Putman        Added ProTeX documentation
    1979             : !    03.06.06   Sawyer        Added else clause
    1980             : !
    1981             : !EOP
    1982             : !------------------------------------------------------------------------------
    1983             : !BOC
    1984             : !
    1985             : ! !LOCAL VARIABLES:
    1986             :       integer qsize, recv_tag
    1987             : 
    1988             : #if defined ( NOR4 )
    1989           0 :         write(iulog,*) 'Mod_comm: Ga_RecvInit_r4 - r4 windows disabled - exiting'
    1990           0 :         stop
    1991             : #endif
    1992             : 
    1993             :       if (win%size >= win%offset_r + win%size_r) then
    1994             :         recv_tag = win%src
    1995             :         qsize    = win%size_r
    1996             :         win%nrecv    = win%nrecv + 1
    1997             :         call MPI_IRECV(ga(win%offset_r+1), qsize, mp_r4, win%src, &
    1998             :                        recv_tag, comm, win%rqest(win%nrecv), ierror)
    1999             :       else
    2000             :         write(iulog,*) "Fatal ga_recvinit_r4: receive window out of space - exiting"
    2001             :         write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid,  &
    2002             :                   win%size, win%offset_r, win%size_r
    2003             :         stop
    2004             :       endif
    2005             : 
    2006             : !EOC
    2007             :       end subroutine Ga_RecvInit_r4
    2008             : !------------------------------------------------------------------------------
    2009             : !------------------------------------------------------------------------------
    2010             : !BOP
    2011             : ! !ROUTINE: Ga_Get4d_r4 --- Read from real*4 4d global array
    2012             : !
    2013             : ! !INTERFACE:
    2014           0 :       subroutine Ga_Get4d_r4 ( comm, q, win, im, jm, km, nq, &
    2015             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    2016             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    2017           0 :                                   nq1, nq2, ga )
    2018             : ! !INPUT PARAMETERS:
    2019             :       integer, intent(in) :: comm      !  communicator
    2020             :       type(window), intent(inout)    :: win           ! Global Array Window
    2021             :       integer, intent(in)  :: im, jm, km, nq
    2022             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    2023             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    2024             :       real(r4), intent(in)  :: ga(win%size)
    2025             : ! !OUTPUT PARAMETERS:
    2026             :       real(r4), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto)
    2027             : !
    2028             : ! !DESCRIPTION:
    2029             : !
    2030             : !     Read from real*8 4d global array.
    2031             : !
    2032             : ! !REVISION HISTORY: 
    2033             : !    02.02.15   Putman
    2034             : !    02.04.09   Putman        Added ProTeX documentation
    2035             : !
    2036             : !EOP
    2037             : !------------------------------------------------------------------------------
    2038             : !BOC
    2039             : !
    2040             : ! !LOCAL VARIABLES:
    2041             :       integer i, j, k, iq, inc, inc1
    2042             :       integer i_length, j_length, k_length, ij_length, ijk_length
    2043             : 
    2044             : #if defined ( NOR4 )
    2045           0 :         write(iulog,*) 'Mod_comm: Ga_Get4d_r4 - r4 windows disabled - exiting'
    2046           0 :         stop
    2047             : #endif
    2048             : 
    2049             :       win%nread = win%nread + 1
    2050             :       call MPI_WAIT(win%rqest(win%nread), Status, ierror)
    2051             : 
    2052             :       i_length   = i2-i1+1
    2053             :       j_length   = j2-j1+1
    2054             :       k_length   = k2-k1+1
    2055             : 
    2056             :       ij_length  = i_length*j_length
    2057             :       ijk_length = i_length*j_length*k_length
    2058             : 
    2059             :         do iq = nq1, nq2
    2060             : !$omp parallel do private(i,j,k,inc,inc1)
    2061             :           do k = k1, k2
    2062             :                 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) &
    2063             :                        + ((k-k1)*ij_length) -i1+1
    2064             :             do j = j1, j2
    2065             :                 inc = inc1 + (j-j1)*i_length
    2066             :               do i = i1, i2
    2067             :                 q(i,j,k,iq) = ga(inc+i)
    2068             :               enddo
    2069             :             enddo
    2070             :           enddo
    2071             :         enddo
    2072             : 
    2073             : !EOC
    2074             :       end subroutine Ga_Get4d_r4
    2075             : !------------------------------------------------------------------------------
    2076             : !------------------------------------------------------------------------------
    2077             : !BOP
    2078             : ! !ROUTINE: Ga_Put4d_i4 --- Write to integer*4 4d global array
    2079             : !
    2080             : ! !INTERFACE:
    2081           0 :       subroutine Ga_Put4d_i4 ( comm, q, win, im, jm, km, nq, &
    2082             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    2083             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    2084           0 :                                   nq1, nq2, ga_s, ga_r )
    2085             : ! !INPUT PARAMETERS:
    2086             :       integer, intent(in) :: comm      !  communicator
    2087             :       type(window), intent(inout)  :: win           ! Global Array Window
    2088             :       integer, intent(in)  :: im, jm, km, nq
    2089             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    2090             :       integer(i4), intent(in)  :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto)
    2091             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    2092             : ! !OUTPUT PARAMETERS:
    2093             :       integer(i4), intent(inout):: ga_s(win%size)
    2094             :       integer(i4), intent(inout):: ga_r(win%size)
    2095             : !
    2096             : ! !DESCRIPTION:
    2097             : !
    2098             : !     Write to integer*4 4d global array.
    2099             : !
    2100             : ! !REVISION HISTORY: 
    2101             : !    02.02.15   Putman
    2102             : !    02.04.09   Putman        Added ProTeX documentation
    2103             : !
    2104             : !EOP
    2105             : !------------------------------------------------------------------------------
    2106             : !BOC
    2107             : !
    2108             : ! !LOCAL VARIABLES:
    2109             :       integer i, j, k, iq, inc, inc1
    2110             :       integer i_length, j_length, k_length, ij_length, ijk_length
    2111             : 
    2112             :       integer send_tag, qsize
    2113             :       integer :: gidu
    2114             : 
    2115           0 :       call MPI_COMM_RANK (comm, gidu, ierror)
    2116             : 
    2117           0 :       i_length   = i2-i1+1
    2118           0 :       j_length   = j2-j1+1
    2119           0 :       k_length   = k2-k1+1
    2120             : 
    2121           0 :       ij_length  = i_length*j_length
    2122           0 :       ijk_length = i_length*j_length*k_length
    2123             : 
    2124             : ! Begin Non-Blocking Sends
    2125           0 :         do iq = nq1, nq2
    2126             : !$omp parallel do private(i,j,k,inc,inc1)
    2127           0 :           do k = k1, k2
    2128             :                 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) &
    2129           0 :                        + ((k-k1)*ij_length) -i1+1
    2130           0 :             do j = j1, j2
    2131           0 :                 inc = inc1 + (j-j1)*i_length
    2132           0 :               do i = i1, i2
    2133           0 :                 ga_s(inc+i) = q(i,j,k,iq)
    2134             :               enddo
    2135             :             enddo
    2136             :           enddo
    2137             :         enddo
    2138             : 
    2139           0 :       qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1)
    2140           0 :       send_tag = gidu
    2141           0 :       win%nsend = win%nsend + 1
    2142           0 :       call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_i4, win%dest, &
    2143           0 :                      send_tag, comm, win%sqest(win%nsend), ierror)
    2144             : 
    2145             : !EOC
    2146           0 :       end subroutine Ga_Put4d_i4
    2147             : !------------------------------------------------------------------------------
    2148             : !------------------------------------------------------------------------------
    2149             : !BOP
    2150             : ! !ROUTINE: Ga_RecvInit_i4 --- Initiate integer*4 Non-Blocking receive
    2151             : !
    2152             : ! !INTERFACE:
    2153           0 :       subroutine Ga_RecvInit_i4( comm, win, ga )
    2154             : ! !INPUT PARAMETERS:
    2155             :       integer, intent(in) :: comm      !  communicator
    2156             :       type(window), intent(inout)    :: win           ! Global Array Window
    2157             : ! !OUTPUT PARAMETERS:
    2158             :       integer(i4), intent(inout):: ga(win%size)
    2159             : !
    2160             : ! !DESCRIPTION:
    2161             : !
    2162             : !     Initiate integer*4 Non-Blocking receive
    2163             : !
    2164             : ! !REVISION HISTORY: 
    2165             : !    02.02.15   Putman
    2166             : !    02.04.09   Putman        Added ProTeX documentation
    2167             : !    06.05.21   Mirin         Added else clause
    2168             : !
    2169             : !EOP
    2170             : !------------------------------------------------------------------------------
    2171             : !BOC
    2172             : !
    2173             : ! !LOCAL VARIABLES:
    2174             :       integer qsize, recv_tag
    2175             : 
    2176           0 :       if (win%size >= win%offset_r + win%size_r) then
    2177           0 :         recv_tag = win%src
    2178           0 :         qsize    = win%size_r
    2179           0 :         win%nrecv    = win%nrecv + 1
    2180           0 :         call MPI_IRECV(ga(win%offset_r+1), qsize, mp_i4, win%src, &
    2181           0 :                        recv_tag, comm, win%rqest(win%nrecv), ierror)
    2182             :       else
    2183           0 :         write(iulog,*) "Fatal ga_recvinit_i4: receive window out of space - exiting"
    2184           0 :         write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid,  &
    2185           0 :                   win%size, win%offset_r, win%size_r
    2186           0 :         stop
    2187             :       endif
    2188             : !EOC
    2189           0 :       end subroutine Ga_RecvInit_i4
    2190             : !------------------------------------------------------------------------------
    2191             : !------------------------------------------------------------------------------
    2192             : !BOP
    2193             : ! !ROUTINE: Ga_Get4d_i4 --- Read from integer*4 4d global array
    2194             : !
    2195             : ! !INTERFACE:
    2196           0 :       subroutine Ga_Get4d_i4 ( comm, q, win, im, jm, km, nq, &
    2197             :                                   ifrom, ito, jfrom, jto, kfrom, kto, &
    2198             :                                   nqfrom, nqto, i1, i2, j1, j2, k1, k2, &
    2199           0 :                                   nq1, nq2, ga )
    2200             : ! !INPUT PARAMETERS:
    2201             :       integer, intent(in) :: comm      !  communicator
    2202             :       type(window), intent(inout)    :: win           ! Global Array Window
    2203             :       integer, intent(in)  :: im, jm, km, nq
    2204             :       integer, intent(in)  :: i1, i2, j1, j2, k1, k2, nq1, nq2
    2205             :       integer, intent(in)  :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto
    2206             :       integer(i4), intent(in)  :: ga(win%size)
    2207             : ! !OUTPUT PARAMETERS:
    2208             :       integer(i4), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto)
    2209             : !
    2210             : ! !DESCRIPTION:
    2211             : !
    2212             : !     Read from integer*4 4d global array.
    2213             : !
    2214             : ! !REVISION HISTORY: 
    2215             : !    02.02.15   Putman
    2216             : !    02.04.09   Putman        Added ProTeX documentation
    2217             : !
    2218             : !EOP
    2219             : !------------------------------------------------------------------------------
    2220             : !BOC
    2221             : !
    2222             : ! !LOCAL VARIABLES:
    2223             :       integer i, j, k, iq, inc, inc1
    2224             :       integer i_length, j_length, k_length, ij_length, ijk_length
    2225             : 
    2226           0 :       win%nread = win%nread + 1
    2227           0 :       call MPI_WAIT(win%rqest(win%nread), Status, ierror)
    2228             : 
    2229           0 :       i_length   = i2-i1+1
    2230           0 :       j_length   = j2-j1+1
    2231           0 :       k_length   = k2-k1+1
    2232           0 :       ij_length  = i_length*j_length
    2233           0 :       ijk_length = i_length*j_length*k_length
    2234             : 
    2235           0 :         do iq = nq1, nq2
    2236             : !$omp parallel do private(i,j,k,inc,inc1)
    2237           0 :           do k = k1, k2
    2238             :                 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) &
    2239           0 :                        + ((k-k1)*ij_length) -i1+1
    2240           0 :             do j = j1, j2
    2241           0 :                 inc = inc1 + (j-j1)*i_length
    2242           0 :               do i = i1, i2
    2243           0 :                 q(i,j,k,iq) = ga(inc+i)
    2244             :               enddo
    2245             :             enddo
    2246             :           enddo
    2247             :         enddo
    2248             : 
    2249             : !EOC
    2250           0 :       end subroutine Ga_Get4d_i4
    2251             : !------------------------------------------------------------------------------
    2252             : !------------------------------------------------------------------------------
    2253             : !BOP
    2254             : ! !ROUTINE: Ga_Broadcast_r8 --- Broadcast an real*8 1d global array
    2255             : !
    2256             : ! !INTERFACE:
    2257           0 :       subroutine Ga_Broadcast_r8 ( comm, q, isize )
    2258             : ! !INPUT PARAMETERS:
    2259             :       integer, intent(in) :: comm      !  communicator
    2260             :       integer, intent(in)  :: isize
    2261             : ! !OUTPUT PARAMETERS:
    2262             :       real(r8), intent(inout) :: q(isize)
    2263             : !
    2264             : ! !DESCRIPTION:
    2265             : !
    2266             : !     Broadcast an real*8 1d global array.
    2267             : !
    2268             : ! !REVISION HISTORY:
    2269             : !    03.04.02        Putman
    2270             : !
    2271             : !EOP
    2272             : !------------------------------------------------------------------------------
    2273             : !BOC
    2274             : ! !LOCAL VARIABLES:
    2275             : 
    2276           0 :       call MPI_BCAST(q, isize, mp_r8, 0, comm, ierror)
    2277             : 
    2278             : !EOC
    2279           0 :       end subroutine Ga_Broadcast_r8
    2280             : !------------------------------------------------------------------------------
    2281             : !------------------------------------------------------------------------------
    2282             : !BOP
    2283             : ! !ROUTINE: Ga_Broadcast_r4 --- Broadcast an real*4 1d global array
    2284             : !
    2285             : ! !INTERFACE:
    2286           0 :       subroutine Ga_Broadcast_r4 ( comm, q, isize )
    2287             : ! !INPUT PARAMETERS:
    2288             :       integer, intent(in) :: comm      !  communicator
    2289             :       integer, intent(in)  :: isize
    2290             : ! !OUTPUT PARAMETERS:
    2291             :       real(r4), intent(inout) :: q(isize)
    2292             : !
    2293             : ! !DESCRIPTION:
    2294             : !
    2295             : !     Broadcast an real*4 1d global array.
    2296             : !
    2297             : ! !REVISION HISTORY:
    2298             : !    03.04.02        Putman
    2299             : !
    2300             : !EOP
    2301             : !------------------------------------------------------------------------------
    2302             : !BOC
    2303             : ! !LOCAL VARIABLES:
    2304             : 
    2305             : #if defined ( NOR4 )
    2306           0 :         write(iulog,*) 'Mod_comm: Ga_Broadcast_r4 - r4 windows disabled - exiting'
    2307           0 :         stop
    2308             : #endif
    2309             : 
    2310             :       call MPI_BCAST(q, isize, mp_r4, 0, comm, ierror)
    2311             : 
    2312             : !EOC
    2313             :       end subroutine Ga_Broadcast_r4
    2314             : !------------------------------------------------------------------------------
    2315             : !------------------------------------------------------------------------------
    2316             : !BOP
    2317             : ! !ROUTINE: Ga_Broadcast_i4 --- Broadcast an integer*4 1d global array
    2318             : !
    2319             : ! !INTERFACE:
    2320           0 :       subroutine Ga_Broadcast_i4 ( comm, q, isize )
    2321             : ! !INPUT PARAMETERS:
    2322             :       integer, intent(in) :: comm      !  communicator
    2323             :       integer, intent(in)  :: isize
    2324             : ! !OUTPUT PARAMETERS:
    2325             :       integer(i4), intent(inout) :: q(isize)
    2326             : !
    2327             : ! !DESCRIPTION:
    2328             : !
    2329             : !     Broadcast an integer*4 1d global array.
    2330             : !
    2331             : ! !REVISION HISTORY:
    2332             : !    03.04.02        Putman
    2333             : !
    2334             : !EOP
    2335             : !------------------------------------------------------------------------------
    2336             : !BOC
    2337             : ! !LOCAL VARIABLES:
    2338             : 
    2339           0 :       call MPI_BCAST(q, isize, mp_i4, 0, comm, ierror)
    2340             : 
    2341             : !EOC
    2342           0 :       end subroutine Ga_Broadcast_i4
    2343             : !------------------------------------------------------------------------------
    2344             : !------------------------------------------------------------------------------
    2345             : !BOP
    2346             : ! !ROUTINE: Ga_AllToAll_r8 --- All to All of an real*8 1d global array
    2347             : !
    2348             : ! !INTERFACE:
    2349           0 :       subroutine Ga_AllToAll_r8 ( comm, q, Gsize, Lsize, istart )
    2350             : ! !INPUT PARAMETERS:
    2351             :       integer, intent(in) :: comm      !  communicator
    2352             :       integer, intent(in)  :: Gsize    ! Global size of array
    2353             :       integer, intent(in)  :: Lsize    ! size of Local portion
    2354             :       integer, intent(in)  :: istart   ! starting point
    2355             : ! !OUTPUT PARAMETERS:
    2356             :       real(r8), intent(inout) :: q(Gsize)
    2357             : !
    2358             : ! !DESCRIPTION:
    2359             : !
    2360             : !     All to All of a real*8 1d global array.
    2361             : !
    2362             : ! !REVISION HISTORY:
    2363             : !    03.04.02        Putman
    2364             : !
    2365             : !EOP
    2366             : !------------------------------------------------------------------------------
    2367             : !BOC
    2368             : ! !LOCAL VARIABLES:
    2369             : 
    2370           0 :       call MPI_ALLGATHER(q(istart), Lsize, mp_r8, q, Lsize, mp_r8, comm, ierror)
    2371             : 
    2372             : !EOC
    2373           0 :       end subroutine Ga_AllToAll_r8
    2374             : !------------------------------------------------------------------------------
    2375             : !------------------------------------------------------------------------------
    2376             : !BOP
    2377             : ! !ROUTINE: Ga_AllToAll_r4 --- All to All of an real*4 1d global array
    2378             : !
    2379             : ! !INTERFACE:
    2380           0 :       subroutine Ga_AllToAll_r4 ( comm, q, Gsize, Lsize, istart )
    2381             : ! !INPUT PARAMETERS:
    2382             :       integer, intent(in)  :: comm      !  communicator
    2383             :       integer, intent(in)  :: Gsize   ! Global size of array
    2384             :       integer, intent(in)  :: Lsize   ! size of Local portion
    2385             :       integer, intent(in)  :: istart  ! starting point
    2386             : ! !OUTPUT PARAMETERS:
    2387             :       real(r4), intent(inout) :: q(Gsize)
    2388             : !
    2389             : ! !DESCRIPTION:
    2390             : !
    2391             : !     All to All of an real*4 1d global array.
    2392             : !
    2393             : ! !REVISION HISTORY:
    2394             : !    03.04.02        Putman
    2395             : !
    2396             : !EOP
    2397             : !------------------------------------------------------------------------------
    2398             : !BOC
    2399             : ! !LOCAL VARIABLES:
    2400             : 
    2401             : #if defined ( NOR4 )
    2402           0 :         write(iulog,*) 'Mod_comm: Ga_AllToAll_r4 - r4 windows disabled - exiting'
    2403           0 :         stop
    2404             : #endif
    2405             : 
    2406             :       call MPI_ALLGATHER(q(istart), Lsize, mp_r4, q, Lsize, mp_r4, comm, ierror)
    2407             : 
    2408             : !EOC
    2409             :       end subroutine Ga_AllToAll_r4
    2410             : !------------------------------------------------------------------------------
    2411             : !------------------------------------------------------------------------------
    2412             : !BOP
    2413             : ! !ROUTINE: Ga_AllToAll_i4 --- All to All of an integer*4 1d global array
    2414             : !
    2415             : ! !INTERFACE:
    2416           0 :       subroutine Ga_AllToAll_i4 ( comm, q, Gsize, Lsize, istart )
    2417             : ! !INPUT PARAMETERS:
    2418             :       integer, intent(in)  :: comm      !  communicator
    2419             :       integer, intent(in)  :: Gsize   ! Global size of array
    2420             :       integer, intent(in)  :: Lsize   ! size of Local portion
    2421             :       integer, intent(in)  :: istart  ! starting point
    2422             : ! !OUTPUT PARAMETERS:
    2423             :       integer(i4), intent(inout) :: q(Gsize)
    2424             : !
    2425             : ! !DESCRIPTION:
    2426             : !
    2427             : !     All to All of an integer*4 1d global array.
    2428             : !
    2429             : ! !REVISION HISTORY:
    2430             : !    03.04.02        Putman
    2431             : !
    2432             : !EOP
    2433             : !------------------------------------------------------------------------------
    2434             : !BOC
    2435             : ! !LOCAL VARIABLES:
    2436             : 
    2437           0 :       call MPI_ALLGATHER(q(istart), Lsize, mp_i4, q, Lsize, mp_i4, comm, ierror)
    2438             : 
    2439             : !EOC
    2440           0 :       end subroutine Ga_AllToAll_i4
    2441             : !------------------------------------------------------------------------------
    2442             : !BOP
    2443             : ! !ROUTINE: get_partneroffset --- Computes partneroffset/type from descriptor
    2444             : !
    2445             : ! !INTERFACE:
    2446       47616 :       subroutine get_partneroffset ( comm, send_bl, recv_bl )
    2447             : 
    2448             : ! !INPUT PARAMETERS:
    2449             :       integer, intent(in)  :: comm      !  communicator
    2450             : ! !INPUT/OUTPUT PARAMETERS:
    2451             :       type(blockdescriptor), intent(inout)  :: send_bl(:) ! send blocks
    2452             :       type(blockdescriptor), intent(inout)  :: recv_bl(:) ! receive blocks
    2453             : 
    2454             : !
    2455             : ! !DESCRIPTION:
    2456             : !     Compute partneroffsets/types from other blockdescriptor
    2457             : !     information.  Used exclusively for irregular communication 
    2458             : !     in PILGRIM.
    2459             : !
    2460             : ! !REVISION HISTORY: 
    2461             : !    03.10.31   Mirin       Creation
    2462             : !
    2463             : ! !BUGS:
    2464             : !
    2465             : !EOP
    2466             : !------------------------------------------------------------------------------
    2467             : !BOC
    2468             : !
    2469             : ! !LOCAL VARIABLES:
    2470             : 
    2471             :       integer :: i, j, k, ns, pos, por, numpsq, ierror
    2472             :       integer :: ami(numpro,numpro), am(numpro,numpro)
    2473             :       integer mod_method, num_s, num_r
    2474             : 
    2475       47616 :       num_s = size(send_bl)
    2476       47616 :       num_r = size(recv_bl)
    2477             : 
    2478    36616704 :       do j = 1, num_s
    2479    36569088 :          send_bl(j)%partneroffset = 0
    2480    36616704 :          send_bl(j)%partnertype = MPI_DATATYPE_NULL
    2481             :       enddo
    2482    36616704 :       do j = 1, num_r
    2483    36569088 :          recv_bl(j)%partneroffset = 0
    2484    36616704 :          recv_bl(j)%partnertype = MPI_DATATYPE_NULL
    2485             :       enddo
    2486             : 
    2487       47616 :       end subroutine get_partneroffset
    2488             : !------------------------------------------------------------------------------
    2489             : !
    2490             : !------------------------------------------------------------------------------
    2491             : !BOP
    2492             : ! !ROUTINE: mp_sendirr --- Initiate communication of contiguous parcels
    2493             : !
    2494             : ! !INTERFACE:
    2495      919296 :       subroutine mp_sendirr ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    2496             :                               modc )
    2497             :  
    2498             : ! !INPUT PARAMETERS:
    2499             :       integer, intent(in)  :: comm      !  communicator
    2500             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    2501             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    2502             :       real(r8), intent(in) :: q1in(*)                  ! input array
    2503             :       real(r8), optional, intent(in) :: q2in(*)        ! second input array
    2504             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    2505             :                                                        ! 2: handshake
    2506             :                                                        ! 3: send vs isend
    2507             :                                                        ! 4: max number of outstanding requests
    2508             : 
    2509             : ! !OUTPUT PARAMETERS:
    2510             :       real(r8), intent(out) :: q1out(*)                ! output array
    2511             :       real(r8), optional, intent(out) :: q2out(*)      ! second output array
    2512             : !
    2513             : ! !DESCRIPTION:
    2514             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    2515             : !     Modc(1): if 0, use original approach of posting all communications here and placing
    2516             : !     wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap
    2517             : !     routine with a2a messages. 
    2518             : !     Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted)
    2519             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    2520             : !     Modc(4): maximum number of outstanding requests (applies to swap routines only)
    2521             : !
    2522             : ! !REVISION HISTORY: 
    2523             : !    02.08.13   Sawyer      Creation
    2524             : !    02.11.06   Mirin       Optimizations
    2525             : !    03.03.03   Sawyer      Use partneroffset
    2526             : !    03.06.24   Sawyer      Integrated Use_Mpi_Types; added qout
    2527             : !    04.02.24   Mirin       Various mpi2 options
    2528             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    2529             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    2530             : !
    2531             : ! !BUGS:
    2532             : !
    2533             : !EOP
    2534             : !------------------------------------------------------------------------------
    2535             : !BOC
    2536             : !
    2537             : ! !LOCAL VARIABLES:
    2538             :       integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method
    2539             :       integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0
    2540             :       integer i, j, send_tag, recv_tag, num_s, num_r
    2541     1838592 :       integer :: offset_v (Max_Nparcels)
    2542     1838592 :       integer :: hs_snd, hs_rcv(numpro), hs_rcvids(numpro)
    2543             :       integer ipe2, ceil2num
    2544             :       integer onetwo
    2545             :       logical twovar
    2546             :       integer sw_local, maxreq_local
    2547             :       logical hs_local, send_local
    2548             :       logical sw_alltoall
    2549             :       integer comm_pid
    2550             : 
    2551             : 
    2552             : #if defined( MODCM_TIMING )
    2553             :       call t_startf('mod_comm communication')
    2554             : #endif
    2555             : 
    2556      919296 :       if (present(modc)) then
    2557      919296 :          sw_local   = modc(1)
    2558      919296 :          hs_local   = (modc(2) .eq. 1)
    2559      919296 :          send_local = (modc(3) .eq. 1)
    2560      919296 :          maxreq_local = modc(4)
    2561             :       else
    2562           0 :          sw_local = 0
    2563           0 :          hs_local = .true.
    2564           0 :          send_local = .true.
    2565           0 :          maxreq_local = -1
    2566             :       endif
    2567             : 
    2568             : ! Do not call mp_swapirr unless mod_method equals 0
    2569      919296 :       mod_method = recv_bl(1)%method
    2570      919296 :       if (mod_method .gt. 0) sw_local = 0
    2571             : 
    2572      919296 :       onetwo = 1
    2573      919296 :       twovar = .false.
    2574      919296 :       if (present(q2in)) then
    2575      580608 :          onetwo = 2
    2576      580608 :          twovar = .true.
    2577             :       endif
    2578             : 
    2579      919296 :     if (sw_local .gt. 0) then
    2580           0 :          sw_alltoall = (sw_local .eq. 2)
    2581           0 :          if (present(q2in)) then
    2582             :             call mp_swapirr(comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,   &
    2583             :                             sw_handshake=hs_local, sw_maxreq=maxreq_local,      &
    2584           0 :                             sw_alltoall=sw_alltoall, sw_send=send_local)
    2585             :          else
    2586             :             call mp_swapirr(comm, send_bl, recv_bl, q1in, q1out,                &
    2587             :                             sw_handshake=hs_local, sw_maxreq=maxreq_local,      &
    2588           0 :                             sw_alltoall=sw_alltoall, sw_send=send_local)
    2589             :          endif
    2590             :     else
    2591             : 
    2592      919296 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    2593             : 
    2594      919296 :       hs_snd = 1
    2595      919296 :       ceil2num = ceil2(numpro)
    2596             : 
    2597             : !     num_s = 0 if this processes is not part of the sending decomposition
    2598      919296 :       num_s = size(send_bl)
    2599      919296 :       if (send_bl(1)%Nparcels == -1) then
    2600           0 :          num_s = 0
    2601             :       endif
    2602             : 
    2603             : !     num_r = 0 if this processes is not part of the receiving decomposition
    2604      919296 :       num_r = size(recv_bl)
    2605      919296 :       if (recv_bl(1)%Nparcels == -1) then
    2606           0 :          num_r = 0
    2607             :       endif
    2608             : 
    2609      919296 :       r8_win%ncall_s = r8_win%ncall_s + 1
    2610      919296 :      if (mod_method .gt. 0) then
    2611             : !
    2612             : ! mpi derived types
    2613           0 :       if (r8_win%ncall_s .gt. MaxTrf-onetwo+1) then
    2614           0 :          write(iulog,*) "mp_sendirr: derived type handle count exceeded - exiting"
    2615           0 :          write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf
    2616           0 :          stop
    2617             :       endif
    2618             : !
    2619             : ! MPI: Irecv over all processes
    2620             : !
    2621           0 :       if (hs_local) then
    2622           0 :          hs_rcvids(:) = MPI_REQUEST_NULL
    2623           0 :          do ipe2=1, ceil2num
    2624           0 :             ipe = ieor(ipe2-1,comm_pid) + 1
    2625           0 :             if (ipe .gt. num_s) cycle
    2626           0 :             if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    2627           0 :                if (ipe-1 /= comm_pid) &
    2628           0 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, ipe-1, comm_pid, comm, &
    2629           0 :                                    hs_rcvids(ipe), ierr )
    2630             :             endif
    2631             :          enddo
    2632             :       endif
    2633             : 
    2634           0 :       OutHandle(:,r8_win%ncall_s) = MPI_REQUEST_NULL
    2635           0 :       if (twovar) OutHandle(:,r8_win%ncall_s+1) = MPI_REQUEST_NULL
    2636           0 :       do ipe2=1, ceil2num
    2637           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    2638           0 :         if (ipe .gt. num_r) cycle
    2639             : !
    2640             : ! Receive the buffers with MPI_Irecv. Non-blocking
    2641             : !
    2642           0 :         if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    2643           0 :           recv_tag = ipe-1 + modcam_tagoffset
    2644             :           call mpi_irecv( q1out, 1, recv_bl(ipe)%type, ipe-1, recv_tag,     &
    2645           0 :                           comm, OutHandle(ipe,r8_win%ncall_s), ierr )
    2646           0 :           if (twovar) then
    2647           0 :              call mpi_irecv( q2out, 1, recv_bl(ipe)%type, ipe-1, recv_tag,     &
    2648           0 :                              comm, OutHandle(ipe,r8_win%ncall_s+1), ierr )
    2649             :           endif
    2650           0 :           if (hs_local) then
    2651           0 :              if (ipe-1 /= comm_pid) &
    2652           0 :                call MPI_SEND ( hs_snd, 1, mp_i4, ipe-1, ipe-1, comm, ierr )
    2653             :           endif
    2654             :         endif
    2655             :       enddo
    2656             : 
    2657             : !
    2658             : ! MPI: Isend/Send over all processes; use risend/rsend with hs
    2659             : !
    2660           0 :       InHandle(:,r8_win%ncall_s) = MPI_REQUEST_NULL
    2661           0 :       if (twovar) InHandle(:,r8_win%ncall_s+1) = MPI_REQUEST_NULL
    2662           0 :       do ipe2=1, ceil2num
    2663           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    2664           0 :         if (ipe .gt. num_s) cycle
    2665             : 
    2666             : !
    2667             : ! Send the individual buffers with non-blocking sends
    2668             : !
    2669           0 :         if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    2670           0 :           send_tag = comm_pid + modcam_tagoffset
    2671           0 :           if (hs_local) then
    2672           0 :              if (ipe-1 /= comm_pid) &
    2673           0 :                 call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    2674           0 :              if (send_local) then
    2675           0 :                 call mpi_rsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2676           0 :                                 comm, ierr )
    2677             :              else
    2678           0 :                 call mpi_irsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2679           0 :                                  comm, InHandle(ipe,r8_win%ncall_s), ierr )
    2680             :              endif
    2681           0 :              if (twovar) then
    2682           0 :                 if (send_local) then
    2683           0 :                    call mpi_rsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2684           0 :                                    comm, ierr )
    2685             :                 else
    2686           0 :                    call mpi_irsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2687           0 :                                     comm, InHandle(ipe,r8_win%ncall_s+1), ierr )
    2688             :                 endif
    2689             :              endif
    2690             :           else
    2691           0 :              if (send_local) then
    2692             :                 call mpi_send( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2693           0 :                                comm, ierr )
    2694             :              else
    2695             :                 call mpi_isend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2696           0 :                                 comm, InHandle(ipe,r8_win%ncall_s), ierr )
    2697             :              endif
    2698           0 :              if (twovar) then
    2699           0 :                 if (send_local) then
    2700           0 :                    call mpi_send( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2701           0 :                                   comm, ierr )
    2702             :                 else
    2703           0 :                    call mpi_isend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    2704           0 :                                    comm, InHandle(ipe,r8_win%ncall_s+1), ierr )
    2705             :                 endif
    2706             :              endif
    2707             :           endif
    2708             :         endif
    2709             :       enddo
    2710             :      else
    2711             : 
    2712             : ! temporary contiguous buffers
    2713             : 
    2714      919296 :       if (r8_win%ncall_s .gt. max_irr-onetwo+1) then
    2715           0 :          write(iulog,*) "mp_sendirr: insufficient window storage - exiting"
    2716           0 :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    2717           0 :          stop
    2718             :       endif
    2719      919296 :       unitsize = r8_win%size/max_irr
    2720             : 
    2721             : ! issue call to receive data in global receive buffer
    2722      919296 :       offset_0 = (r8_win%ncall_s-1)*unitsize
    2723      919296 :       offset_s = offset_0
    2724      919296 :       offset_r = offset_0
    2725             : 
    2726      919296 :       if (hs_local) then
    2727   706938624 :          hs_rcvids(:) = MPI_REQUEST_NULL
    2728   942278400 :          do ipe2=1, ceil2num
    2729   941359104 :             ipe = ieor(ipe2-1,comm_pid) + 1
    2730   941359104 :             if (ipe .gt. num_s) cycle
    2731   706019328 :             qsize = onetwo*send_bl(ipe)%Tot_Size
    2732   706938624 :             if (qsize .ne. 0) then
    2733    13317696 :                r8_win%dest = ipe-1
    2734    13317696 :                send_tag = comm_pid + modcam_tagoffset
    2735    13317696 :                if (r8_win%dest /= comm_pid) &
    2736    13300308 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, r8_win%dest, send_tag, comm, &
    2737    13300308 :                                    hs_rcvids(ipe), ierr )
    2738             :             endif
    2739             :          enddo
    2740             :       endif
    2741             : 
    2742   942278400 :       do ipe2=1, ceil2num
    2743   941359104 :          ipe = ieor(ipe2-1,comm_pid) + 1
    2744   941359104 :          if (ipe .gt. num_r) cycle
    2745   706019328 :          r8_win%size_r = onetwo*recv_bl(ipe)%Tot_Size
    2746   706938624 :          if (r8_win%size_r .ne. 0) then
    2747    13317696 :             r8_win%offset_r = offset_r
    2748    13317696 :             offset_r = offset_r + r8_win%size_r
    2749    13317696 :             r8_win%src = ipe-1
    2750    13317696 :             if (onetwo*unitsize >= offset_r-offset_0) then
    2751    13317696 :               recv_tag = r8_win%src + modcam_tagoffset
    2752    13317696 :               qsize    = r8_win%size_r
    2753    13317696 :               r8_win%nrecv    = r8_win%nrecv + 1
    2754           0 :               call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, &
    2755    13317696 :                              recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror)
    2756    13317696 :               if (hs_local) then
    2757    13317696 :                  if (r8_win%src /= comm_pid) &
    2758    13300308 :                    call MPI_SEND ( hs_snd, 1, mp_i4, r8_win%src, recv_tag, comm, ierror)
    2759             :               endif
    2760             :             else
    2761           0 :               write(iulog,*) "Fatal mp_sendirr: receive window out of space - exiting"
    2762           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    2763           0 :                         ipe, unitsize, offset_r, offset_0
    2764           0 :               stop
    2765             :             endif
    2766             :          endif
    2767             :       enddo
    2768             : 
    2769             : ! gather data into global send buffer
    2770   942278400 :       do ipe2=1, ceil2num
    2771   941359104 :          ipe = ieor(ipe2-1,comm_pid) + 1
    2772   941359104 :          if (ipe .gt. num_s) cycle
    2773   706019328 :          qsize = onetwo*send_bl(ipe)%Tot_Size
    2774   706938624 :          if (qsize .ne. 0) then
    2775    13317696 :             r8_win%dest = ipe-1
    2776    13317696 :             r8_win%offset_s = offset_s
    2777    13317696 :             offset_s = offset_s + qsize
    2778    13317696 :             if (offset_s-offset_0 .gt. onetwo*unitsize) then
    2779           0 :               write(iulog,*) "Fatal mp_sendirr: send window out of space - exiting"
    2780           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    2781           0 :                         ipe, unitsize, offset_s, offset_0
    2782           0 :               stop
    2783             :             endif
    2784             : 
    2785    13317696 :             offset_v(1) = r8_win%offset_s
    2786   114863616 :             do j = 2, send_bl(ipe)%nparcels
    2787   114863616 :                offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1)
    2788             :             enddo
    2789             : 
    2790   128181312 :             do j = 1, send_bl(ipe)%nparcels
    2791  2884908096 :                do i = 1, send_bl(ipe)%blocksizes(j)
    2792  2871590400 :                   ga_r8_s(offset_v(j)+i) = q1in(send_bl(ipe)%displacements(j)+i)
    2793             :                enddo
    2794             :             enddo
    2795    13317696 :             if (twovar) then
    2796    70060032 :                do j = 1, send_bl(ipe)%nparcels
    2797  1584285696 :                   do i = 1, send_bl(ipe)%blocksizes(j)
    2798  1577318400 :                      ga_r8_s(send_bl(ipe)%Tot_Size+offset_v(j)+i) = q2in(send_bl(ipe)%displacements(j)+i)
    2799             :                   enddo
    2800             :                enddo
    2801             :             endif
    2802             : 
    2803             : ! nonblocking send
    2804    13317696 :             send_tag = comm_pid + modcam_tagoffset
    2805    13317696 :             r8_win%nsend = r8_win%nsend + 1
    2806    13317696 :             if (hs_local) then
    2807    13317696 :                if (r8_win%dest /= comm_pid) &
    2808    13300308 :                   call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    2809    13317696 :                if (send_local) then
    2810           0 :                   call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    2811    13317696 :                                  send_tag, comm, ierr)
    2812             :                else
    2813           0 :                   call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    2814           0 :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    2815             :                endif
    2816             :             else
    2817           0 :                if (send_local) then
    2818           0 :                   call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    2819           0 :                                  send_tag, comm, ierr)
    2820             :                else
    2821           0 :                   call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    2822           0 :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    2823             :                endif
    2824             :             endif
    2825             :          endif
    2826             :       enddo
    2827             : 
    2828             :      endif   !  mod_method
    2829             : 
    2830      919296 :       if (twovar) r8_win%ncall_s = r8_win%ncall_s + 1
    2831             : 
    2832             :     endif   !  sw_local
    2833             : 
    2834             : #if defined( MODCM_TIMING )
    2835             :       call t_stopf('mod_comm communication')
    2836             : #endif
    2837             : 
    2838      919296 :       end subroutine mp_sendirr
    2839             : !------------------------------------------------------------------------------
    2840             : !
    2841             : !------------------------------------------------------------------------------
    2842             : !BOP
    2843             : ! !ROUTINE: mp_recvirr --- Finalize communication of contiguous parcels
    2844             : !
    2845             : ! !INTERFACE:
    2846      919296 :       subroutine mp_recvirr ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    2847             :                               modc )
    2848             :  
    2849             : ! !INPUT PARAMETERS:
    2850             :       integer, intent(in)  :: comm      !  communicator
    2851             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    2852             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    2853             :       real(r8), intent(in) :: q1in(*)                  ! input array
    2854             :       real(r8), optional, intent(in) :: q2in(*)        ! second input array
    2855             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    2856             :                                                        ! 2: handshake
    2857             :                                                        ! 3: send vs isend
    2858             :                                                        ! 4: max number of outstanding requests
    2859             : ! !INPUT/OUTPUT PARAMETERS:
    2860             :       real(r8), intent(inout) :: q1out(*)                ! output array
    2861             :       real(r8), optional, intent(inout) :: q2out(*)      ! second output array
    2862             : !
    2863             : ! !DESCRIPTION:
    2864             : !     Complete transfer of a generalized region initiated by {\tt mp\_sendirr}.
    2865             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    2866             : !     Modc(1): if 0, use original approach of posting all communications in mp_sendirr and
    2867             : !     placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr.
    2868             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    2869             : !
    2870             : ! !REVISION HISTORY:
    2871             : !    02.08.15   Sawyer      Creation
    2872             : !    02.11.06   Mirin       Optimizations
    2873             : !    03.03.03   Sawyer      Now using packed arrays for MPI2
    2874             : !    04.02.24   Mirin       Various mpi2 options
    2875             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    2876             : !
    2877             : !EOP
    2878             : !------------------------------------------------------------------------------
    2879             : !BOC
    2880             :       integer :: ipe, blocksize, offset_r, mod_method
    2881             :       integer unitsize, offset_0
    2882             :       integer Ierr
    2883     1838592 :       integer InStats(numpro*MPI_STATUS_SIZE)
    2884     1838592 :       integer OutStats(numpro*MPI_STATUS_SIZE)
    2885             :       integer i, j, num_r, num_s
    2886      919296 :       integer :: offset_v (Max_Nparcels)
    2887             :       integer ipe2, ceil2num
    2888             :       integer onetwo
    2889             :       logical twovar
    2890             :       integer sw_local, maxreq_local
    2891             :       logical hs_local, send_local
    2892             :       logical sw_alltoall
    2893             :       integer comm_size, comm_pid
    2894             : 
    2895      919296 :       if (present(modc)) then
    2896      919296 :          sw_local   = modc(1)
    2897      919296 :          hs_local   = (modc(2) .eq. 1)
    2898      919296 :          send_local = (modc(3) .eq. 1)
    2899      919296 :          maxreq_local = modc(4)
    2900             :       else
    2901             :          sw_local = 0
    2902      919296 :          hs_local = .true.
    2903             :          send_local = .true.
    2904      919296 :          maxreq_local = -1
    2905             :       endif
    2906             : 
    2907             : ! Do not call mp_swapirr (hence return) unless mod_method equals 0
    2908      919296 :       mod_method = recv_bl(1)%method
    2909      919296 :       if (mod_method .gt. 0) sw_local = 0
    2910             : 
    2911             : ! Return if swap_irr
    2912      919296 :       if (sw_local .gt. 0) return
    2913             : 
    2914             : #if defined( MODCM_TIMING )
    2915             :       call t_startf('mod_comm communication')
    2916             : #endif
    2917             : 
    2918      919296 :       onetwo = 1
    2919      919296 :       twovar = .false.
    2920      919296 :       if (present(q2in)) then
    2921      580608 :          onetwo = 2
    2922      580608 :          twovar = .true.
    2923             :       endif
    2924             : 
    2925      919296 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    2926      919296 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    2927             : 
    2928      919296 :       ceil2num = ceil2(numpro)
    2929             : 
    2930             : !     num_s = 0 if this processes is not part of the sending decomposition
    2931      919296 :       num_s = size(send_bl)
    2932      919296 :       if (send_bl(1)%Nparcels == -1) then
    2933           0 :          num_s = 0
    2934             :       endif
    2935             : 
    2936             : !     num_r = 0 if this processes is not part of the receiving decomposition
    2937      919296 :       num_r = size(recv_bl)
    2938      919296 :       if (recv_bl(1)%Nparcels == -1) then
    2939           0 :          num_r = 0
    2940             :       endif
    2941             : 
    2942      919296 :       r8_win%ncall_r = r8_win%ncall_r + 1
    2943             : 
    2944      919296 :     if (mod_method .gt. 0) then
    2945             : 
    2946             : ! mpi derived types
    2947           0 :       if (r8_win%ncall_r .gt. MaxTrf-onetwo+1) then
    2948           0 :          write(iulog,*) "mp_recvirr: derived type handle count exceeded - exiting"
    2949           0 :          write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf
    2950           0 :          stop
    2951             :       endif
    2952             : 
    2953           0 :       if (num_s .gt. 0 .and. (.not. send_local)) then
    2954           0 :          CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r), InStats, Ierr )
    2955           0 :          if (twovar) then
    2956           0 :             CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+1), InStats, Ierr )
    2957             :          endif
    2958             :       endif
    2959           0 :       if (num_r .gt. 0) then
    2960           0 :          CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r), OutStats, Ierr )
    2961           0 :          if (twovar) then
    2962           0 :             CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+1), OutStats, Ierr )
    2963             :          endif
    2964             :       endif
    2965             : 
    2966             :     else
    2967             : 
    2968             : ! temporary contiguous buffer / global window
    2969             : 
    2970      919296 :       if (r8_win%ncall_r .gt. max_irr-onetwo+1) then
    2971           0 :          write(iulog,*) "mp_recvirr: insufficient window storage - exiting"
    2972           0 :          write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr
    2973           0 :          stop
    2974             :       endif
    2975      919296 :       unitsize = r8_win%size/max_irr
    2976             : 
    2977             : ! scatter data from global receive buffer to final destination
    2978      919296 :       offset_0 = (r8_win%ncall_r-1)*unitsize
    2979      919296 :       offset_r = offset_0
    2980             : 
    2981   942278400 :       do ipe2=1, ceil2num
    2982   941359104 :          ipe = ieor(ipe2-1,comm_pid) + 1
    2983   941359104 :          if (ipe .gt. num_r) cycle
    2984   706019328 :          r8_win%size_r = onetwo*recv_bl(ipe)%Tot_Size
    2985   706938624 :          if (r8_win%size_r .ne. 0) then
    2986    13317696 :             r8_win%offset_r = offset_r
    2987    13317696 :             offset_r = offset_r + r8_win%size_r
    2988    13317696 :             if (offset_r-offset_0 .gt. onetwo*unitsize) then
    2989           0 :               write(iulog,*) "Fatal mp_recvirr: receive window out of space - exiting"
    2990           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    2991           0 :                         ipe, unitsize, offset_r, offset_0
    2992           0 :               stop
    2993             :             endif
    2994             : 
    2995    13317696 :             r8_win%nread = r8_win%nread + 1
    2996    13317696 :             call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr)
    2997             : 
    2998    13317696 :             offset_v(1) = r8_win%offset_r
    2999   114863616 :             do j = 2, recv_bl(ipe)%Nparcels
    3000   114863616 :                offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1)
    3001             :             enddo
    3002             : 
    3003   128181312 :             do j = 1, recv_bl(ipe)%Nparcels
    3004  2884908096 :                do i = 1, recv_bl(ipe)%blocksizes(j)
    3005  2871590400 :                   q1out(recv_bl(ipe)%displacements(j)+i) = ga_r8_r(offset_v(j)+i)
    3006             :                enddo
    3007             :             enddo
    3008    13317696 :             if (twovar) then
    3009    70060032 :             do j = 1, recv_bl(ipe)%Nparcels
    3010  1584285696 :                do i = 1, recv_bl(ipe)%blocksizes(j)
    3011  1577318400 :                   q2out(recv_bl(ipe)%displacements(j)+i) = ga_r8_r(recv_bl(ipe)%Tot_Size+offset_v(j)+i)
    3012             :                enddo
    3013             :             enddo
    3014             :             endif
    3015             : 
    3016             :          endif
    3017             :       enddo
    3018             : 
    3019      919296 :       if ((r8_win%ncall_s == r8_win%ncall_r + onetwo - 1) .and. (.not. send_local)) then
    3020           0 :          call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror)
    3021             :       endif
    3022             : 
    3023             :     endif    !    mod_method .gt. 0
    3024             : 
    3025      919296 :     if (twovar) r8_win%ncall_r = r8_win%ncall_r + 1
    3026             : 
    3027      919296 :     if (r8_win%ncall_s == r8_win%ncall_r) then
    3028      919296 :        r8_win%nsend = 0
    3029      919296 :        r8_win%nrecv = 0
    3030      919296 :        r8_win%nread = 0
    3031      919296 :        r8_win%ncall_s = 0
    3032      919296 :        r8_win%ncall_r = 0
    3033             :     endif
    3034             : 
    3035             : #if defined( MODCM_TIMING )
    3036             :       call t_stopf('mod_comm communication')
    3037             : #endif
    3038             : 
    3039             : !EOC
    3040             :       end subroutine mp_recvirr
    3041             : !------------------------------------------------------------------------------
    3042             : !
    3043             : !------------------------------------------------------------------------------
    3044             : !BOP
    3045             : ! !ROUTINE: mp_sendirr_r4 --- Initiate communication of contiguous parcels - r4
    3046             : !
    3047             : ! !INTERFACE:
    3048           0 :       subroutine mp_sendirr_r4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    3049             :                                  modc )
    3050             :  
    3051             : ! !INPUT PARAMETERS:
    3052             :       integer, intent(in)  :: comm      !  communicator
    3053             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    3054             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    3055             :       real(r4), intent(in) :: q1in(*)                  ! input array
    3056             :       real(r4), optional, intent(in) :: q2in(*)        ! second input array
    3057             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    3058             :                                                        ! 2: handshake
    3059             :                                                        ! 3: send vs isend
    3060             :                                                        ! 4: max number of outstanding requests
    3061             : 
    3062             : ! !OUTPUT PARAMETERS:
    3063             :       real(r4), intent(out) :: q1out(*)                ! output array
    3064             :       real(r4), optional, intent(out) :: q2out(*)      ! second output array
    3065             : !
    3066             : ! !DESCRIPTION:
    3067             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    3068             : !     Modc(1): if 0, use original approach of posting all communications here and placing
    3069             : !     wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap
    3070             : !     routine with a2a messages. 
    3071             : !     Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted)
    3072             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    3073             : !     Modc(4): maximum number of outstanding requests (applies to swap routines only)
    3074             : !
    3075             : ! !REVISION HISTORY: 
    3076             : !    02.08.13   Sawyer      Creation
    3077             : !    02.11.06   Mirin       Optimizations
    3078             : !    03.03.03   Sawyer      Use partneroffset
    3079             : !    03.06.24   Sawyer      Integrated Use_Mpi_Types; added qout
    3080             : !    04.02.24   Mirin       Various mpi2 options
    3081             : !    08.09.18   Mirin       No-op version
    3082             : !
    3083             : ! !BUGS:
    3084             : !
    3085             : !EOP
    3086             : !------------------------------------------------------------------------------
    3087             : !BOC
    3088             : !
    3089           0 :       write(iulog,*) 'Mod_comm: mp_sendirr_r4 - r4 no longer supported - exiting'
    3090           0 :       stop
    3091             : 
    3092             : !EOC
    3093             :       end subroutine mp_sendirr_r4
    3094             : !------------------------------------------------------------------------------
    3095             : !
    3096             : !------------------------------------------------------------------------------
    3097             : !BOP
    3098             : ! !ROUTINE: mp_recvirr_r4 --- Finalize communication of contiguous parcels - r4
    3099             : !
    3100             : ! !INTERFACE:
    3101           0 :       subroutine mp_recvirr_r4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    3102             :                                  modc )
    3103             :  
    3104             : ! !INPUT PARAMETERS:
    3105             :       integer, intent(in)  :: comm      !  communicator
    3106             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    3107             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    3108             :       real(r4), intent(in) :: q1in(*)                  ! input array
    3109             :       real(r4), optional, intent(in) :: q2in(*)        ! second input array
    3110             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    3111             :                                                        ! 2: handshake
    3112             :                                                        ! 3: send vs isend
    3113             :                                                        ! 4: max number of outstanding requests
    3114             : ! !INPUT/OUTPUT PARAMETERS:
    3115             :       real(r4), intent(inout) :: q1out(*)                ! output array
    3116             :       real(r4), optional, intent(inout) :: q2out(*)      ! second output array
    3117             : !
    3118             : ! !DESCRIPTION:
    3119             : !     Complete transfer of a generalized region initiated by {\tt mp\_sendirr}.
    3120             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    3121             : !     Modc(1): if 0, use original approach of posting all communications in mp_sendirr and
    3122             : !     placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr.
    3123             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    3124             : !
    3125             : ! !REVISION HISTORY:
    3126             : !    02.08.15   Sawyer      Creation
    3127             : !    02.11.06   Mirin       Optimizations
    3128             : !    03.03.03   Sawyer      Now using packed arrays for MPI2
    3129             : !    04.02.24   Mirin       Various mpi2 options
    3130             : !    08.09.18   Mirin       No-op version
    3131             : !
    3132             : ! !BUGS:
    3133             : !
    3134             : !EOP
    3135             : !------------------------------------------------------------------------------
    3136             : !BOC
    3137             : !
    3138           0 :       write(iulog,*) 'Mod_comm: mp_recvirr_r4 - r4 no longer supported - exiting'
    3139           0 :       stop
    3140             : 
    3141             : !EOC
    3142             :       end subroutine mp_recvirr_r4
    3143             : !------------------------------------------------------------------------------
    3144             : !
    3145             : !------------------------------------------------------------------------------
    3146             : !BOP
    3147             : ! !ROUTINE: mp_sendirr_i4 --- Initiate communication of contiguous parcels - i4
    3148             : !
    3149             : ! !INTERFACE:
    3150           0 :       subroutine mp_sendirr_i4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    3151             :                                  modc )
    3152             :  
    3153             : ! !INPUT PARAMETERS:
    3154             :       integer, intent(in)  :: comm      !  communicator
    3155             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    3156             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    3157             :       integer(i4), intent(in) :: q1in(*)                  ! input array
    3158             :       integer(i4), optional, intent(in) :: q2in(*)        ! second input array
    3159             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    3160             :                                                        ! 2: handshake
    3161             :                                                        ! 3: send vs isend
    3162             :                                                        ! 4: max number of outstanding requests
    3163             : 
    3164             : ! !OUTPUT PARAMETERS:
    3165             :       integer(i4), intent(out) :: q1out(*)                ! output array
    3166             :       integer(i4), optional, intent(out) :: q2out(*)      ! second output array
    3167             : !
    3168             : ! !DESCRIPTION:
    3169             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    3170             : !     Modc(1): if 0, use original approach of posting all communications here and placing
    3171             : !     wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap
    3172             : !     routine with a2a messages. 
    3173             : !     Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted)
    3174             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    3175             : !     Modc(4): maximum number of outstanding requests (applies to swap routines only)
    3176             : !
    3177             : ! !REVISION HISTORY: 
    3178             : !    02.08.13   Sawyer      Creation
    3179             : !    02.11.06   Mirin       Optimizations
    3180             : !    03.03.03   Sawyer      Use partneroffset
    3181             : !    03.06.24   Sawyer      Integrated Use_Mpi_Types; added qout
    3182             : !    04.02.24   Mirin       Various mpi2 options
    3183             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    3184             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    3185             : !
    3186             : ! !BUGS:
    3187             : !
    3188             : !EOP
    3189             : !------------------------------------------------------------------------------
    3190             : !BOC
    3191             : !
    3192             : ! !LOCAL VARIABLES:
    3193             :       integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method
    3194             :       integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0
    3195             :       integer i, j, send_tag, recv_tag, num_s, num_r
    3196           0 :       integer :: offset_v (Max_Nparcels)
    3197           0 :       integer :: hs_snd, hs_rcv(numpro), hs_rcvids(numpro)
    3198             :       integer ipe2, ceil2num
    3199             :       integer onetwo
    3200             :       logical twovar
    3201             :       integer sw_local, maxreq_local
    3202             :       logical hs_local, send_local
    3203             :       logical sw_alltoall
    3204             :       integer comm_pid
    3205             : 
    3206             : #if defined( MODCM_TIMING )
    3207             :       call t_startf('mod_comm communication')
    3208             : #endif
    3209             : 
    3210           0 :       if (present(modc)) then
    3211           0 :          sw_local   = modc(1)
    3212           0 :          hs_local   = (modc(2) .eq. 1)
    3213           0 :          send_local = (modc(3) .eq. 1)
    3214           0 :          maxreq_local = modc(4)
    3215             :       else
    3216           0 :          sw_local = 0
    3217           0 :          hs_local = .true.
    3218           0 :          send_local = .false.
    3219           0 :          maxreq_local = -1
    3220             :       endif
    3221             : 
    3222             : ! Do not call mp_swapirr_i4 unless mod_method equals 0
    3223           0 :       mod_method = recv_bl(1)%method
    3224           0 :       if (mod_method .gt. 0) sw_local = 0
    3225             : 
    3226           0 :       onetwo = 1
    3227           0 :       twovar = .false.
    3228           0 :       if (present(q2in)) then
    3229           0 :          onetwo = 2
    3230           0 :          twovar = .true.
    3231             :       endif
    3232             : 
    3233           0 :     if (sw_local .gt. 0) then
    3234           0 :          sw_alltoall = (sw_local .eq. 2)
    3235           0 :          if (present(q2in)) then
    3236             :             call mp_swapirr_i4(comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,   &
    3237             :                                sw_handshake=hs_local, sw_maxreq=maxreq_local,      &
    3238           0 :                                sw_alltoall=sw_alltoall, sw_send=send_local)
    3239             :          else
    3240             :             call mp_swapirr_i4(comm, send_bl, recv_bl, q1in, q1out,                &
    3241             :                                sw_handshake=hs_local, sw_maxreq=maxreq_local,      &
    3242           0 :                                sw_alltoall=sw_alltoall, sw_send=send_local)
    3243             :          endif
    3244             :     else
    3245             : 
    3246           0 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    3247             : 
    3248           0 :       hs_snd = 1
    3249           0 :       ceil2num = ceil2(numpro)
    3250             : 
    3251             : !     num_s = 0 if this processes is not part of the sending decomposition
    3252           0 :       num_s = size(send_bl)
    3253           0 :       if (send_bl(1)%Nparcels == -1) then
    3254           0 :          num_s = 0
    3255             :       endif
    3256             : 
    3257             : !     num_r = 0 if this processes is not part of the receiving decomposition
    3258           0 :       num_r = size(recv_bl)
    3259           0 :       if (recv_bl(1)%Nparcels == -1) then
    3260           0 :          num_r = 0
    3261             :       endif
    3262             : 
    3263           0 :       mod_method = recv_bl(1)%method
    3264             : 
    3265           0 :       i4_win%ncall_s = i4_win%ncall_s + 1
    3266           0 :      if (mod_method .gt. 0) then
    3267             : !
    3268             : ! mpi derived types
    3269           0 :       if (i4_win%ncall_s .gt. MaxTrf-onetwo+1) then
    3270           0 :          write(iulog,*) "mp_sendirr_i4: derived type handle count exceeded - exiting"
    3271           0 :          write(iulog,*) "i4_win%ncall_s MaxTrf = ", i4_win%ncall_s, MaxTrf
    3272           0 :          stop
    3273             :       endif
    3274             : !
    3275             : ! MPI: Irecv over all processes
    3276             : !
    3277           0 :       if (hs_local) then
    3278           0 :          hs_rcvids(:) = MPI_REQUEST_NULL
    3279           0 :          do ipe2=1, ceil2num
    3280           0 :             ipe = ieor(ipe2-1,comm_pid) + 1
    3281           0 :             if (ipe .gt. num_s) cycle
    3282           0 :             if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    3283           0 :                if (ipe-1 /= comm_pid) &
    3284           0 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, ipe-1, comm_pid, comm, &
    3285           0 :                                    hs_rcvids(ipe), ierr )
    3286             :             endif
    3287             :          enddo
    3288             :       endif
    3289             : 
    3290           0 :       OutHandle(:,i4_win%ncall_s) = MPI_REQUEST_NULL
    3291           0 :       if (twovar) OutHandle(:,i4_win%ncall_s+1) = MPI_REQUEST_NULL
    3292           0 :       do ipe2=1, ceil2num
    3293           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    3294           0 :         if (ipe .gt. num_r) cycle
    3295             : !
    3296             : ! Receive the buffers with MPI_Irecv. Non-blocking
    3297             : !
    3298           0 :         if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    3299           0 :           recv_tag = ipe-1 + modcam_tagoffset
    3300             :           call mpi_irecv( q1out, 1, recv_bl(ipe)%type, ipe-1, recv_tag,     &
    3301           0 :                           comm, OutHandle(ipe,i4_win%ncall_s), ierr )
    3302           0 :           if (twovar) then
    3303           0 :              call mpi_irecv( q2out, 1, recv_bl(ipe)%type, ipe-1, recv_tag,     &
    3304           0 :                              comm, OutHandle(ipe,i4_win%ncall_s+1), ierr )
    3305             :           endif
    3306           0 :           if (hs_local) then
    3307           0 :              if (ipe-1 /= comm_pid) &
    3308           0 :                call MPI_SEND ( hs_snd, 1, mp_i4, ipe-1, ipe-1, comm, ierr )
    3309             :           endif
    3310             :         endif
    3311             :       enddo
    3312             : 
    3313             : !
    3314             : ! MPI: Isend/Send over all processes; use risend/rsend with hs
    3315             : !
    3316           0 :       InHandle(:,i4_win%ncall_s) = MPI_REQUEST_NULL
    3317           0 :       if (twovar) InHandle(:,i4_win%ncall_s+1) = MPI_REQUEST_NULL
    3318           0 :       do ipe2=1, ceil2num
    3319           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    3320           0 :         if (ipe .gt. num_s) cycle
    3321             : 
    3322             : !
    3323             : ! Send the individual buffers with non-blocking sends
    3324             : !
    3325           0 :         if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    3326           0 :           send_tag = comm_pid + modcam_tagoffset
    3327           0 :           if (hs_local) then
    3328           0 :              if (ipe-1 /= comm_pid) &
    3329           0 :                 call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    3330           0 :              if (send_local) then
    3331           0 :                 call mpi_rsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3332           0 :                                 comm, ierr )
    3333             :              else
    3334           0 :                 call mpi_irsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3335           0 :                                  comm, InHandle(ipe,i4_win%ncall_s), ierr )
    3336             :              endif
    3337           0 :              if (twovar) then
    3338           0 :                 if (send_local) then
    3339           0 :                    call mpi_rsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3340           0 :                                    comm, ierr )
    3341             :                 else
    3342           0 :                    call mpi_irsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3343           0 :                                     comm, InHandle(ipe,i4_win%ncall_s+1), ierr )
    3344             :                 endif
    3345             :              endif
    3346             :           else
    3347           0 :              if (send_local) then
    3348             :                 call mpi_send( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3349           0 :                                comm, ierr )
    3350             :              else
    3351             :                 call mpi_isend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3352           0 :                                 comm, InHandle(ipe,i4_win%ncall_s), ierr )
    3353             :              endif
    3354           0 :              if (twovar) then
    3355           0 :                 if (send_local) then
    3356           0 :                    call mpi_send( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3357           0 :                                   comm, ierr )
    3358             :                 else
    3359           0 :                    call mpi_isend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    3360           0 :                                    comm, InHandle(ipe,i4_win%ncall_s+1), ierr )
    3361             :                 endif
    3362             :              endif
    3363             :           endif
    3364             :         endif
    3365             :       enddo
    3366             :      else
    3367             : 
    3368             : ! temporary contiguous buffers
    3369             : 
    3370           0 :       if (i4_win%ncall_s .gt. max_irr-onetwo+1) then
    3371           0 :          write(iulog,*) "mp_sendirr_i4: insufficient window storage - exiting"
    3372           0 :          write(iulog,*) "i4_win%ncall_s max_irr = ", i4_win%ncall_s, max_irr
    3373           0 :          stop
    3374             :       endif
    3375           0 :       unitsize = i4_win%size/max_irr
    3376             : 
    3377             : ! issue call to receive data in global receive buffer
    3378           0 :       offset_0 = (i4_win%ncall_s-1)*unitsize
    3379           0 :       offset_s = offset_0
    3380           0 :       offset_r = offset_0
    3381             : 
    3382           0 :       if (hs_local) then
    3383           0 :          hs_rcvids(:) = MPI_REQUEST_NULL
    3384           0 :          do ipe2=1, ceil2num
    3385           0 :             ipe = ieor(ipe2-1,comm_pid) + 1
    3386           0 :             if (ipe .gt. num_s) cycle
    3387           0 :             qsize = onetwo*send_bl(ipe)%Tot_Size
    3388           0 :             if (qsize .ne. 0) then
    3389           0 :                i4_win%dest = ipe-1
    3390           0 :                send_tag = comm_pid + modcam_tagoffset
    3391           0 :                if (i4_win%dest /= comm_pid) &
    3392           0 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, i4_win%dest, send_tag, comm, &
    3393           0 :                                    hs_rcvids(ipe), ierr )
    3394             :             endif
    3395             :          enddo
    3396             :       endif
    3397             : 
    3398           0 :       do ipe2=1, ceil2num
    3399           0 :          ipe = ieor(ipe2-1,comm_pid) + 1
    3400           0 :          if (ipe .gt. num_r) cycle
    3401           0 :          i4_win%size_r = onetwo*recv_bl(ipe)%Tot_Size
    3402           0 :          if (i4_win%size_r .ne. 0) then
    3403           0 :             i4_win%offset_r = offset_r
    3404           0 :             offset_r = offset_r + i4_win%size_r
    3405           0 :             i4_win%src = ipe-1
    3406           0 :             if (onetwo*unitsize >= offset_r-offset_0) then
    3407           0 :               recv_tag = i4_win%src + modcam_tagoffset
    3408           0 :               qsize    = i4_win%size_r
    3409           0 :               i4_win%nrecv    = i4_win%nrecv + 1
    3410           0 :               call MPI_IRECV(ga_i4_r(i4_win%offset_r+1), qsize, mp_i4, i4_win%src, &
    3411           0 :                              recv_tag, comm, i4_win%rqest(i4_win%nrecv), ierror)
    3412           0 :               if (hs_local) then
    3413           0 :                  if (i4_win%src /= comm_pid) &
    3414           0 :                    call MPI_SEND ( hs_snd, 1, mp_i4, i4_win%src, recv_tag, comm, ierror)
    3415             :               endif
    3416             :             else
    3417           0 :               write(iulog,*) "Fatal mp_sendirr_i4: receive window out of space - exiting"
    3418           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    3419           0 :                         ipe, unitsize, offset_r, offset_0
    3420           0 :               stop
    3421             :             endif
    3422             :          endif
    3423             :       enddo
    3424             : ! gather data into global send buffer
    3425           0 :       do ipe2=1, ceil2num
    3426           0 :          ipe = ieor(ipe2-1,comm_pid) + 1
    3427           0 :          if (ipe .gt. num_s) cycle
    3428           0 :          qsize = onetwo*send_bl(ipe)%Tot_Size
    3429           0 :          if (qsize .ne. 0) then
    3430           0 :             i4_win%dest = ipe-1
    3431           0 :             i4_win%offset_s = offset_s
    3432           0 :             offset_s = offset_s + qsize
    3433           0 :             if (offset_s-offset_0 .gt. onetwo*unitsize) then
    3434           0 :               write(iulog,*) "Fatal mp_sendirr_i4: send window out of space - exiting"
    3435           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    3436           0 :                         ipe, unitsize, offset_s, offset_0
    3437           0 :               stop
    3438             :             endif
    3439             : 
    3440           0 :             offset_v(1) = i4_win%offset_s
    3441           0 :             do j = 2, send_bl(ipe)%nparcels
    3442           0 :                offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1)
    3443             :             enddo
    3444             : 
    3445           0 :             do j = 1, send_bl(ipe)%nparcels
    3446           0 :                do i = 1, send_bl(ipe)%blocksizes(j)
    3447           0 :                   ga_i4_s(offset_v(j)+i) = q1in(send_bl(ipe)%displacements(j)+i)
    3448             :                enddo
    3449             :             enddo
    3450           0 :             if (twovar) then
    3451           0 :                do j = 1, send_bl(ipe)%nparcels
    3452           0 :                   do i = 1, send_bl(ipe)%blocksizes(j)
    3453           0 :                      ga_i4_s(send_bl(ipe)%Tot_Size+offset_v(j)+i) = q2in(send_bl(ipe)%displacements(j)+i)
    3454             :                   enddo
    3455             :                enddo
    3456             :             endif
    3457             : 
    3458             : ! nonblocking send
    3459           0 :             send_tag = comm_pid + modcam_tagoffset
    3460           0 :             i4_win%nsend = i4_win%nsend + 1
    3461           0 :             if (hs_local) then
    3462           0 :                if (i4_win%dest /= comm_pid) &
    3463           0 :                   call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    3464           0 :                if (send_local) then
    3465           0 :                   call MPI_RSEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, &
    3466           0 :                                  send_tag, comm, ierr)
    3467             :                else
    3468           0 :                   call MPI_IRSEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, &
    3469           0 :                                  send_tag, comm, i4_win%sqest(i4_win%nsend), ierr)
    3470             :                endif
    3471             :             else
    3472           0 :                if (send_local) then
    3473           0 :                   call MPI_SEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, &
    3474           0 :                                  send_tag, comm, ierr)
    3475             :                else
    3476           0 :                   call MPI_ISEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, &
    3477           0 :                                  send_tag, comm, i4_win%sqest(i4_win%nsend), ierr)
    3478             :                endif
    3479             :             endif
    3480             :          endif
    3481             :       enddo
    3482             : 
    3483             :      endif   !  mod_method
    3484             : 
    3485           0 :       if (twovar) i4_win%ncall_s = i4_win%ncall_s + 1
    3486             : 
    3487             :     endif   !  sw_local
    3488             : 
    3489             : #if defined( MODCM_TIMING )
    3490             :       call t_stopf('mod_comm communication')
    3491             : #endif
    3492             : 
    3493           0 :       end subroutine mp_sendirr_i4
    3494             : !------------------------------------------------------------------------------
    3495             : !
    3496             : !------------------------------------------------------------------------------
    3497             : !BOP
    3498             : ! !ROUTINE: mp_recvirr_i4 --- Finalize communication of contiguous parcels - i4
    3499             : !
    3500             : ! !INTERFACE:
    3501           0 :       subroutine mp_recvirr_i4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out,      &
    3502             :                                  modc )
    3503             :  
    3504             : ! !INPUT PARAMETERS:
    3505             :       integer, intent(in)  :: comm      !  communicator
    3506             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    3507             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    3508             :       integer(i4), intent(in) :: q1in(*)                  ! input array
    3509             :       integer(i4), optional, intent(in) :: q2in(*)        ! second input array
    3510             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    3511             :                                                        ! 2: handshake
    3512             :                                                        ! 3: send vs isend
    3513             :                                                        ! 4: max number of outstanding requests
    3514             : 
    3515             : ! !INPUT/OUTPUT PARAMETERS:
    3516             :       integer(i4), intent(inout) :: q1out(*)                ! output array
    3517             :       integer(i4), optional, intent(inout) :: q2out(*)      ! second output array
    3518             : !
    3519             : ! !DESCRIPTION:
    3520             : !     Complete transfer of a generalized region initiated by {\tt mp\_sendirr}.
    3521             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    3522             : !     Modc(1): if 0, use original approach of posting all communications in mp_sendirr and
    3523             : !     placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr.
    3524             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    3525             : !
    3526             : ! !REVISION HISTORY:
    3527             : !    02.08.15   Sawyer      Creation
    3528             : !    02.11.06   Mirin       Optimizations
    3529             : !    03.03.03   Sawyer      Now using packed arrays for MPI2
    3530             : !    04.02.24   Mirin       Various mpi2 options
    3531             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    3532             : !
    3533             : !EOP
    3534             : !------------------------------------------------------------------------------
    3535             : !BOC
    3536             :       integer :: ipe, blocksize, offset_r, mod_method
    3537             :       integer unitsize, offset_0
    3538             :       integer Ierr
    3539           0 :       integer InStats(numpro*MPI_STATUS_SIZE)
    3540           0 :       integer OutStats(numpro*MPI_STATUS_SIZE)
    3541             :       integer i, j, num_r, num_s
    3542           0 :       integer :: offset_v (Max_Nparcels)
    3543             :       integer ipe2, ceil2num
    3544             :       integer onetwo
    3545             :       logical twovar
    3546             :       integer sw_local, maxreq_local
    3547             :       logical hs_local, send_local
    3548             :       logical sw_alltoall
    3549             :       integer comm_size, comm_pid
    3550             : 
    3551           0 :       if (present(modc)) then
    3552           0 :          sw_local   = modc(1)
    3553           0 :          hs_local   = (modc(2) .eq. 1)
    3554           0 :          send_local = (modc(3) .eq. 1)
    3555           0 :          maxreq_local = modc(4)
    3556             :       else
    3557             :          sw_local = 0
    3558           0 :          hs_local = .true.
    3559             :          send_local = .false.
    3560           0 :          maxreq_local = -1
    3561             :       endif
    3562             : 
    3563             : ! Do not call mp_swapirr_i4 (hence return) unless mod_method equals 0
    3564           0 :       mod_method = recv_bl(1)%method
    3565           0 :       if (mod_method .gt. 0) sw_local = 0
    3566             : 
    3567             : ! Return if swap_irr
    3568           0 :       if (sw_local .gt. 0) return
    3569             : 
    3570             : #if defined( MODCM_TIMING )
    3571             :       call t_startf('mod_comm communication')
    3572             : #endif
    3573             : 
    3574           0 :       onetwo = 1
    3575           0 :       twovar = .false.
    3576           0 :       if (present(q2in)) then
    3577           0 :          onetwo = 2
    3578           0 :          twovar = .true.
    3579             :       endif
    3580             : 
    3581           0 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    3582           0 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    3583             : 
    3584           0 :       ceil2num = ceil2(numpro)
    3585             : 
    3586             : !     num_s = 0 if this processes is not part of the sending decomposition
    3587           0 :       num_s = size(send_bl)
    3588           0 :       if (send_bl(1)%Nparcels == -1) then
    3589           0 :          num_s = 0
    3590             :       endif
    3591             : 
    3592             : !     num_r = 0 if this processes is not part of the receiving decomposition
    3593           0 :       num_r = size(recv_bl)
    3594           0 :       if (recv_bl(1)%Nparcels == -1) then
    3595           0 :          num_r = 0
    3596             :       endif
    3597             : 
    3598           0 :       mod_method = recv_bl(1)%method
    3599             : 
    3600           0 :       i4_win%ncall_r = i4_win%ncall_r + 1
    3601             : 
    3602           0 :     if (mod_method .gt. 0) then
    3603             : 
    3604             : ! mpi derived types
    3605           0 :       if (i4_win%ncall_r .gt. MaxTrf-onetwo+1) then
    3606           0 :          write(iulog,*) "mp_recvirr_i4: derived type handle count exceeded - exiting"
    3607           0 :          write(iulog,*) "i4_win%ncall_r MaxTrf = ", i4_win%ncall_r, MaxTrf
    3608           0 :          stop
    3609             :       endif
    3610             : 
    3611           0 :       if (num_s .gt. 0 .and. (.not. send_local)) then
    3612           0 :          CALL MPI_WAITALL( comm_size, InHandle(:,i4_win%ncall_r), InStats, Ierr )
    3613           0 :          if (twovar) then
    3614           0 :             CALL MPI_WAITALL( comm_size, InHandle(:,i4_win%ncall_r+1), InStats, Ierr )
    3615             :          endif
    3616             :       endif
    3617           0 :       if (num_r .gt. 0) then
    3618           0 :          CALL MPI_WAITALL( comm_size, OutHandle(:,i4_win%ncall_r), OutStats, Ierr )
    3619           0 :          if (twovar) then
    3620           0 :             CALL MPI_WAITALL( comm_size, OutHandle(:,i4_win%ncall_r+1), OutStats, Ierr )
    3621             :          endif
    3622             :       endif
    3623             : 
    3624             :     else
    3625             : 
    3626             : ! temporary contiguous buffer / global window
    3627             : 
    3628           0 :       if (i4_win%ncall_r .gt. max_irr-onetwo+1) then
    3629           0 :          write(iulog,*) "mp_recvirr_i4: insufficient window storage - exiting"
    3630           0 :          write(iulog,*) "i4_win%ncall_r max_irr = ", i4_win%ncall_r, max_irr
    3631           0 :          stop
    3632             :       endif
    3633           0 :       unitsize = i4_win%size/max_irr
    3634             : 
    3635             : ! scatter data from global receive buffer to final destination
    3636           0 :       offset_0 = (i4_win%ncall_r-1)*unitsize
    3637           0 :       offset_r = offset_0
    3638             : 
    3639           0 :       do ipe2=1, ceil2num
    3640           0 :          ipe = ieor(ipe2-1,comm_pid) + 1
    3641           0 :          if (ipe .gt. num_r) cycle
    3642           0 :          i4_win%size_r = onetwo*recv_bl(ipe)%Tot_Size
    3643           0 :          if (i4_win%size_r .ne. 0) then
    3644           0 :             i4_win%offset_r = offset_r
    3645           0 :             offset_r = offset_r + i4_win%size_r
    3646           0 :             if (offset_r-offset_0 .gt. onetwo*unitsize) then
    3647           0 :               write(iulog,*) "Fatal mp_recvirr_i4: receive window out of space - exiting"
    3648           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    3649           0 :                         ipe, unitsize, offset_r, offset_0
    3650           0 :               stop
    3651             :             endif
    3652             : 
    3653           0 :             i4_win%nread = i4_win%nread + 1
    3654           0 :             call MPI_WAIT(i4_win%rqest(i4_win%nread), Status, ierr)
    3655             : 
    3656           0 :             offset_v(1) = i4_win%offset_r
    3657           0 :             do j = 2, recv_bl(ipe)%Nparcels
    3658           0 :                offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1)
    3659             :             enddo
    3660             : 
    3661           0 :             do j = 1, recv_bl(ipe)%Nparcels
    3662           0 :                do i = 1, recv_bl(ipe)%blocksizes(j)
    3663           0 :                   q1out(recv_bl(ipe)%displacements(j)+i) = ga_i4_r(offset_v(j)+i)
    3664             :                enddo
    3665             :             enddo
    3666           0 :             if (twovar) then
    3667           0 :             do j = 1, recv_bl(ipe)%Nparcels
    3668           0 :                do i = 1, recv_bl(ipe)%blocksizes(j)
    3669           0 :                   q2out(recv_bl(ipe)%displacements(j)+i) = ga_i4_r(recv_bl(ipe)%Tot_Size+offset_v(j)+i)
    3670             :                enddo
    3671             :             enddo
    3672             :             endif
    3673             : 
    3674             :          endif
    3675             :       enddo
    3676             : 
    3677           0 :       if ((i4_win%ncall_s == i4_win%ncall_r + onetwo - 1) .and. (.not. send_local)) then
    3678           0 :          call MPI_WAITALL(i4_win%nsend, i4_win%sqest, Stats, ierror)
    3679             :       endif
    3680             : 
    3681             :     endif    !    mod_method .gt. 0
    3682             : 
    3683           0 :     if (twovar) i4_win%ncall_r = i4_win%ncall_r + 1
    3684             : 
    3685           0 :     if (i4_win%ncall_s == i4_win%ncall_r) then
    3686           0 :        i4_win%nsend = 0
    3687           0 :        i4_win%nrecv = 0
    3688           0 :        i4_win%nread = 0
    3689           0 :        i4_win%ncall_s = 0
    3690           0 :        i4_win%ncall_r = 0
    3691             :     endif
    3692             : 
    3693             : #if defined( MODCM_TIMING )
    3694             :       call t_stopf('mod_comm communication')
    3695             : #endif
    3696             : 
    3697             : !EOC
    3698             :       end subroutine mp_recvirr_i4
    3699             : !------------------------------------------------------------------------------
    3700             : !
    3701             : !------------------------------------------------------------------------------
    3702             : !BOP
    3703             : ! !ROUTINE: mp_swapirr --- Write r8 contiguous parcels to global array
    3704             : !                           using XOR swap ordering
    3705             : !
    3706             : ! !INTERFACE:
    3707           0 :       subroutine mp_swapirr ( comm, send_bl, recv_bl, a1in, a1out, &
    3708             :                                a2in, a2out, sw_handshake, sw_maxreq, &
    3709             :                                sw_alltoall, sw_send )
    3710             :  
    3711             : ! !INPUT PARAMETERS:
    3712             :       integer, intent(in)  :: comm                     ! communicator
    3713             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    3714             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    3715             :       real(r8), intent(in) :: a1in(*)                  ! local data segment
    3716             :       real(r8), optional, intent(in) :: a2in(*)        ! local data segment
    3717             :       logical, optional, intent(in) :: sw_handshake    ! use flow control and 
    3718             :                                                        !  ready send
    3719             :       integer, optional, intent(in) :: sw_maxreq       ! maximum number of outstanding
    3720             :                                                        !  MPI requests
    3721             :       logical, optional, intent(in) :: sw_alltoall     ! use mpi_alltoall
    3722             :       logical, optional, intent(in) :: sw_send         ! use mpi_send instead of isend
    3723             : 
    3724             : ! !OUTPUT PARAMETERS:
    3725             :       real(r8), intent(out) :: a1out(*)                ! local output segment
    3726             :       real(r8), optional, intent(out) :: a2out(*)      ! local output segment
    3727             : !
    3728             : ! !DESCRIPTION:
    3729             : !     XOR-ordered version of all-to-all communication
    3730             : !
    3731             : ! WARNING: mod_comm parameter max_irr might need to be set larger than expected
    3732             : !          when swapping two variables; specifically, max_irr must be at least
    3733             : !          as large as the incoming r8_win%ncall_s + the number of variables to
    3734             : !          be swapped
    3735             : !
    3736             : ! !REVISION HISTORY: 
    3737             : !    08.06.30   Worley      original: derived from mp_sendirr, but using 
    3738             : !                            swapm logic and XOR swap order 
    3739             : !    08.08.22   Worley      removed swapm; reimplemented with native MPI,
    3740             : !                            added flow control/ready send option and maxreq
    3741             : !                            throttling, added alltoall option
    3742             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    3743             : !
    3744             : ! !BUGS:
    3745             : !
    3746             : !EOP
    3747             : !------------------------------------------------------------------------------
    3748             : !BOC
    3749             : !
    3750             : ! !LOCAL VARIABLES:
    3751             :       integer :: i, j, p, istep, num_s, num_r
    3752             :       integer :: comm_pid, comm_size, steps, ierr
    3753             :       integer :: ipe, offset_s, offset_r, offset_0, unitsize, onetwo
    3754             : 
    3755           0 :       integer :: arr_sndlths(0:numpro-1), arr_rcvlths(0:numpro-1)
    3756           0 :       integer :: sndlths(0:numpro-1), sdispls(0:numpro-1)
    3757           0 :       integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1)
    3758           0 :       integer :: swapids(numpro) 
    3759           0 :       integer :: sndids(numpro)  ! nonblocking MPI send request ids
    3760           0 :       integer :: rcvids(numpro)  ! nonblocking MPI recv request ids
    3761           0 :       integer :: hs_snd, hs_rcv(numpro)! handshake variables (send/receive)
    3762           0 :       integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids
    3763           0 :       integer :: InStats(numpro*MPI_STATUS_SIZE)
    3764           0 :       integer :: OutStats(numpro*MPI_STATUS_SIZE)
    3765             : 
    3766             :       integer :: offset_v
    3767             : 
    3768             :       integer :: rstep
    3769             : 
    3770             :       integer :: maxreq, maxreqh
    3771             :       logical :: handshake, alltoall, sendd
    3772             : 
    3773             : #if defined( MODCM_TIMING )
    3774             :       call t_startf('mod_comm communication')
    3775             : #endif
    3776             : 
    3777           0 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    3778           0 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    3779             : 
    3780             : !     num_s = 0 if this process is not part of the sending decomposition
    3781           0 :       num_s = size(send_bl)
    3782           0 :       if (send_bl(1)%Nparcels == -1) then
    3783           0 :          num_s = 0
    3784             :       endif
    3785             : 
    3786             : !     num_r = 0 if this process is not part of the receiving decomposition
    3787           0 :       num_r = size(recv_bl)
    3788           0 :       if (recv_bl(1)%Nparcels == -1) then
    3789           0 :          num_r = 0
    3790             :       endif
    3791             : 
    3792           0 :       if ( present(a2in) .and. (.not. present(a2out)) ) then
    3793           0 :          write(iulog,*) "Fatal mp_swapirr: a2in specified, but a2out missing - exiting"
    3794           0 :          stop
    3795             :       endif
    3796             : 
    3797           0 :       if ( (.not. present(a2in)) .and. present(a2out)) then
    3798           0 :          write(iulog,*) "Fatal mp_swapirr: a2out specified, but a2in missing - exiting"
    3799           0 :          stop
    3800             :       endif
    3801             : 
    3802           0 :       if ( present(sw_handshake) ) then
    3803           0 :          handshake = sw_handshake
    3804           0 :          hs_snd = 1
    3805             :       else
    3806             :          handshake = .false.
    3807             :       endif
    3808             : 
    3809           0 :       if ( present(sw_alltoall) ) then
    3810           0 :          alltoall = sw_alltoall
    3811             :       else
    3812             :          alltoall = .false.
    3813             :       endif
    3814             : 
    3815           0 :       if ( present(sw_send) ) then
    3816           0 :          sendd = sw_send
    3817             :       else
    3818             :          sendd = .false.
    3819             :       endif
    3820             : 
    3821           0 :       onetwo = 1
    3822           0 :       if (present(a2in)) onetwo = 2
    3823           0 :       unitsize = r8_win%size/max_irr
    3824             : 
    3825             : ! advance to unused portion of storage window
    3826           0 :       r8_win%ncall_s = r8_win%ncall_s + 1
    3827             : 
    3828           0 :       if (r8_win%ncall_s .gt. max_irr-onetwo+1) then
    3829           0 :          write(iulog,*) "mp_swapirr: insufficient window storage - exiting"
    3830           0 :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    3831           0 :          stop
    3832             :       endif
    3833             : 
    3834             : ! calculate send lengths and displacements
    3835           0 :       offset_0 = (r8_win%ncall_s-1)*unitsize
    3836           0 :       offset_s = offset_0
    3837           0 :       sndlths(:) = 0
    3838           0 :       sdispls(:) = 0
    3839           0 :       arr_sndlths(:) = 0
    3840           0 :       do ipe=1, num_s
    3841           0 :          sndlths(ipe-1) = send_bl(ipe)%Tot_Size
    3842           0 :          sdispls(ipe-1) = offset_s
    3843           0 :          if (sndlths(ipe-1) .ne. 0) then
    3844             : 
    3845             :             ! pack first array
    3846           0 :             offset_s = offset_s + sndlths(ipe-1)
    3847           0 :             if (offset_s-offset_0 .gt. onetwo*unitsize) then
    3848           0 :               write(iulog,*) "Fatal mp_swapirr: send window out of space - exiting"
    3849           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    3850           0 :                              ipe, unitsize, offset_s, offset_0
    3851           0 :               stop
    3852             :             endif
    3853             : 
    3854           0 :             arr_sndlths(ipe-1) = sndlths(ipe-1)
    3855             : 
    3856             :             ! calculate for second array (if it exists)
    3857           0 :             if ( present(a2in) ) then
    3858             : 
    3859           0 :                offset_s = offset_s + sndlths(ipe-1)
    3860           0 :                if (offset_s-offset_0 .gt. onetwo*unitsize) then
    3861           0 :                  write(iulog,*) "Fatal mp_swapirr: send window out of space - exiting"
    3862           0 :                  write(iulog,*) '2 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    3863           0 :                                 ipe, unitsize, offset_s, offset_0
    3864           0 :                  stop
    3865             :                endif
    3866             : 
    3867           0 :                sndlths(ipe-1) = sndlths(ipe-1) + arr_sndlths(ipe-1)
    3868             : 
    3869             :             endif
    3870             : 
    3871             :          endif
    3872             :       enddo
    3873             : 
    3874             : ! calculate receive lengths and displacements
    3875           0 :       offset_r = offset_0
    3876           0 :       rcvlths(:) = 0
    3877           0 :       rdispls(:) = 0
    3878           0 :       arr_rcvlths(:) = 0
    3879           0 :       do ipe=1, num_r
    3880           0 :          rcvlths(ipe-1) = recv_bl(ipe)%Tot_Size
    3881           0 :          rdispls(ipe-1) = offset_r
    3882           0 :          if (rcvlths(ipe-1) .ne. 0) then
    3883             : 
    3884           0 :             offset_r = offset_r + rcvlths(ipe-1)
    3885           0 :             if (onetwo*unitsize < offset_r-offset_0) then
    3886           0 :               write(iulog,*) "Fatal mp_swapirr: receive window out of space - exiting"
    3887           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    3888           0 :                         ipe, unitsize, offset_r, offset_0
    3889           0 :               stop
    3890             :             endif
    3891             : 
    3892           0 :             arr_rcvlths(ipe-1) = rcvlths(ipe-1)
    3893             : 
    3894             :             ! compute for second array (if it exists)
    3895           0 :             if ( present(a2out) ) then
    3896             : 
    3897           0 :                offset_r = offset_r + rcvlths(ipe-1)
    3898           0 :                if (onetwo*unitsize < offset_r-offset_0) then
    3899           0 :                  write(iulog,*) "Fatal mp_swapirr: receive window out of space - exiting"
    3900           0 :                  write(iulog,*) '2 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    3901           0 :                           ipe, unitsize, offset_r, offset_0
    3902           0 :                  stop
    3903             :                endif
    3904             : 
    3905           0 :                rcvlths(ipe-1) = rcvlths(ipe-1) + arr_rcvlths(ipe-1)
    3906             : 
    3907             :             endif
    3908             : 
    3909             :          endif
    3910             :       enddo
    3911             : 
    3912             : ! Calculate swap partners and number of steps in point-to-point
    3913             : ! implementations of alltoall algorithm.
    3914           0 :       steps = 0
    3915           0 :       do ipe=1,ceil2(comm_size)-1
    3916           0 :          p = pair(comm_size,ipe,comm_pid)
    3917           0 :          if (p >= 0) then
    3918           0 :             if (sndlths(p) > 0 .or. rcvlths(p) > 0) then
    3919           0 :                steps = steps + 1
    3920           0 :                swapids(steps) = p
    3921             :             end if
    3922             :          end if
    3923             :       end do
    3924             : 
    3925           0 :       if (.not. alltoall) then
    3926             : 
    3927           0 :          sndids(1:steps) = MPI_REQUEST_NULL
    3928           0 :          rcvids(1:steps) = MPI_REQUEST_NULL
    3929             : 
    3930           0 :          if (steps .eq. 0) then
    3931           0 :             maxreq  = 0
    3932           0 :             maxreqh = 0
    3933           0 :          elseif (steps .eq. 1) then
    3934           0 :             maxreq  = 1
    3935           0 :             maxreqh = 1
    3936             :          else
    3937           0 :             if ( present(sw_maxreq) ) then
    3938           0 :                if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then
    3939           0 :                   maxreq  = sw_maxreq
    3940           0 :                   if (maxreq > 1) then
    3941           0 :                      maxreqh = maxreq/2
    3942             :                   else
    3943           0 :                      maxreq  = 2
    3944           0 :                      maxreqh = 1
    3945             :                   endif
    3946             :                else
    3947           0 :                   maxreq  = steps
    3948           0 :                   maxreqh = steps
    3949             :                endif
    3950             :             else
    3951           0 :                maxreq  = steps
    3952           0 :                maxreqh = steps
    3953             :             endif
    3954             :          endif
    3955             : 
    3956             : ! Post initial handshake receive requests
    3957           0 :          if (handshake) then
    3958           0 :             do istep=1,maxreq
    3959           0 :                p = swapids(istep)
    3960           0 :                if (sndlths(p) > 0) then
    3961             :                   call mpi_irecv  ( hs_rcv(istep), 1, mp_i4, p, comm_pid, comm, &
    3962           0 :                                     hs_rcvids(istep), ierr )
    3963             :                endif
    3964             :             enddo
    3965             :          endif
    3966             : 
    3967             : ! Post initial receive requests
    3968           0 :          do istep=1,maxreq
    3969           0 :             p = swapids(istep)
    3970           0 :             if (rcvlths(p) > 0) then
    3971           0 :                offset_r = rdispls(p)+1
    3972           0 :                call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    3973           0 :                                 p, p, comm, rcvids(istep), ierr )
    3974           0 :                if (handshake) then
    3975             :                   call mpi_send( hs_snd, 1, mp_i4, p, p, comm, &
    3976           0 :                                  ierr )
    3977             :                endif
    3978             :             endif
    3979             :          enddo
    3980             :          rstep = maxreq
    3981             : !
    3982             :       endif
    3983             : 
    3984             : ! gather data into global send buffer
    3985           0 :       do istep=1,steps
    3986           0 :          p = swapids(istep)
    3987             : 
    3988           0 :          if (sndlths(p) .ne. 0) then
    3989           0 :             offset_v = sdispls(p)
    3990           0 :             do j = 1, send_bl(p+1)%nparcels
    3991           0 :                do i = 1, send_bl(p+1)%blocksizes(j)
    3992           0 :                   ga_r8_s(offset_v+i) = a1in(send_bl(p+1)%displacements(j)+i)
    3993             :                enddo
    3994           0 :                offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    3995             :             enddo
    3996             : 
    3997             :             ! pack second array (if it exists)
    3998           0 :             if ( present(a2in) ) then
    3999           0 :                offset_v = sdispls(p) + arr_sndlths(p)
    4000           0 :                do j = 1, send_bl(p+1)%nparcels
    4001           0 :                   do i = 1, send_bl(p+1)%blocksizes(j)
    4002           0 :                      ga_r8_s(offset_v+i) = a2in(send_bl(p+1)%displacements(j)+i)
    4003             :                   enddo
    4004           0 :                   offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    4005             :                enddo
    4006             :             endif
    4007             : 
    4008             :          endif
    4009             : 
    4010           0 :          if (.not. alltoall) then
    4011             : 
    4012             : ! Submit new i(r)send request
    4013           0 :             offset_s = sdispls(p)+1
    4014           0 :             if (sndlths(p) > 0) then
    4015           0 :                if (handshake) then
    4016             :                   call mpi_wait( hs_rcvids(istep), MPI_STATUS_IGNORE, ierr )
    4017           0 :                   if (sendd) then
    4018           0 :                      call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    4019           0 :                                       p, comm_pid, comm, ierr )
    4020             :                   else
    4021           0 :                      call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    4022           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    4023             :                   endif
    4024             :                else
    4025           0 :                   if (sendd) then
    4026           0 :                      call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    4027           0 :                                       p, comm_pid, comm, ierr )
    4028             :                   else
    4029           0 :                      call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    4030           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    4031             :                   endif
    4032             :                endif
    4033             :             endif
    4034             : 
    4035           0 :             if (istep > maxreqh) then
    4036             : ! Wait for oldest irecv request to complete
    4037           0 :                call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr )
    4038             : 
    4039           0 :                if (rstep < steps) then
    4040           0 :                   rstep = rstep + 1
    4041           0 :                   p = swapids(rstep)
    4042             : 
    4043             : ! Submit a new handshake irecv request
    4044           0 :                   if (handshake) then
    4045           0 :                      if (sndlths(p) > 0) then
    4046             :                         call mpi_irecv( hs_rcv(rstep), 1, mp_i4, p, comm_pid, comm, &
    4047           0 :                                         hs_rcvids(rstep), ierr )
    4048             :                      endif
    4049             :                   endif
    4050             : 
    4051             : ! Submit a new irecv request
    4052           0 :                   if (rcvlths(p) > 0) then
    4053           0 :                      offset_r = rdispls(p)+1
    4054           0 :                      call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    4055           0 :                                      p, p, comm, rcvids(rstep), ierr )
    4056           0 :                      if (handshake) then
    4057             :                         call mpi_send ( hs_snd, 1, mp_i4, p, p, comm, &
    4058           0 :                                         ierr )
    4059             :                      endif
    4060             :                   endif
    4061             :                endif
    4062             : 
    4063             : ! Wait for outstanding i(r)send request to complete
    4064           0 :                if (.not. sendd) then
    4065           0 :                   call mpi_wait( sndids(istep-maxreqh), InStats, ierr )
    4066             :                endif
    4067             :             endif
    4068             : !
    4069             :          endif
    4070             : !
    4071             :       enddo
    4072             : 
    4073             : ! local copy to send buffer
    4074           0 :       if (sndlths(comm_pid) .ne. 0) then
    4075             : 
    4076           0 :          offset_v = sdispls(comm_pid)
    4077           0 :          do j = 1, send_bl(comm_pid+1)%nparcels
    4078           0 :             do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    4079           0 :                ga_r8_s(offset_v+i) = a1in(send_bl(comm_pid+1)%displacements(j)+i)
    4080             :             enddo
    4081           0 :             offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    4082             :          enddo
    4083             : 
    4084             :          ! pack second array (if it exists)
    4085           0 :          if ( present(a2in) ) then
    4086           0 :             offset_v = sdispls(comm_pid) + arr_sndlths(comm_pid)
    4087           0 :             do j = 1, send_bl(comm_pid+1)%nparcels
    4088           0 :                do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    4089           0 :                   ga_r8_s(offset_v+i) = a2in(send_bl(comm_pid+1)%displacements(j)+i)
    4090             :                enddo
    4091           0 :                offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    4092             :             enddo
    4093             :          endif
    4094             : 
    4095           0 :          if (.not. alltoall) then
    4096           0 :             ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = &
    4097           0 :                ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid))
    4098             :          endif
    4099             : 
    4100             :       endif
    4101             : 
    4102           0 :       if (alltoall) then
    4103             :          call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, &
    4104             :                              ga_r8_r, rcvlths, rdispls, mp_r8, &
    4105           0 :                              comm, ierror)
    4106             :       endif
    4107             : 
    4108             : ! local copy from receive buffer
    4109           0 :       if (rcvlths(comm_pid) .ne. 0) then
    4110             : 
    4111           0 :          offset_v = rdispls(comm_pid)
    4112           0 :          do j = 1, recv_bl(comm_pid+1)%Nparcels
    4113           0 :             do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    4114           0 :                a1out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_r8_r(offset_v+i)
    4115             :             enddo
    4116           0 :             offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    4117             :          enddo
    4118             : 
    4119             :          ! scatter data for second array (if it exists)
    4120           0 :          if ( present(a2out) ) then
    4121           0 :             offset_v = rdispls(comm_pid) + arr_rcvlths(comm_pid)
    4122           0 :             do j = 1, recv_bl(comm_pid+1)%Nparcels
    4123           0 :                do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    4124           0 :                   a2out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_r8_r(offset_v+i)
    4125             :                enddo
    4126           0 :                offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    4127             :             enddo
    4128             :          endif
    4129             : 
    4130             :       endif
    4131             : 
    4132             : ! scatter data from global receive buffer to final destination
    4133           0 :       do istep=1,steps
    4134           0 :          p = swapids(istep)
    4135             : 
    4136           0 :          if (.not. alltoall) then
    4137           0 :             if (istep > steps-maxreqh) then
    4138             :                call mpi_wait( rcvids(istep), OutStats, ierr )
    4139             :             endif
    4140             :          endif
    4141             : 
    4142           0 :          if (rcvlths(p) .ne. 0) then
    4143             : 
    4144           0 :             offset_v = rdispls(p)
    4145           0 :             do j = 1, recv_bl(p+1)%Nparcels
    4146           0 :                do i = 1, recv_bl(p+1)%blocksizes(j)
    4147           0 :                   a1out(recv_bl(p+1)%displacements(j)+i) = ga_r8_r(offset_v+i)
    4148             :                enddo
    4149           0 :                offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    4150             :             enddo
    4151             : 
    4152             :             ! scatter data for second array (if it exists)
    4153           0 :             if ( present(a2out) ) then
    4154             : 
    4155           0 :                offset_v = rdispls(p) + arr_rcvlths(p)
    4156           0 :                do j = 1, recv_bl(p+1)%Nparcels
    4157           0 :                   do i = 1, recv_bl(p+1)%blocksizes(j)
    4158           0 :                      a2out(recv_bl(p+1)%displacements(j)+i) = ga_r8_r(offset_v+i)
    4159             :                   enddo
    4160           0 :                   offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    4161             :                enddo
    4162             : 
    4163             :             endif
    4164             : 
    4165             :          endif
    4166             :       enddo
    4167             : 
    4168             : ! Wait for any outstanding send requests to complete.
    4169           0 :       if (.not. alltoall .and. .not. sendd) then
    4170           0 :          call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr )
    4171             :       endif
    4172             : 
    4173             : ! clean-up
    4174             : ! make used portion of storage window available for reuse
    4175           0 :       r8_win%ncall_s = r8_win%ncall_s - 1
    4176             : 
    4177             : #if defined( MODCM_TIMING )
    4178             :       call t_stopf('mod_comm communication')
    4179             : #endif
    4180             : 
    4181             : !EOC
    4182           0 :       end subroutine mp_swapirr
    4183             : !------------------------------------------------------------------------------
    4184             : !
    4185             : !------------------------------------------------------------------------------
    4186             : !BOP
    4187             : ! !ROUTINE: mp_swapirr_i4 --- Write i4 contiguous parcels to global array
    4188             : !                             using XOR swap ordering
    4189             : !
    4190             : ! !INTERFACE:
    4191           0 :       subroutine mp_swapirr_i4 ( comm, send_bl, recv_bl, a1in, a1out, &
    4192             :                                  a2in, a2out, sw_handshake, sw_maxreq, &
    4193             :                                  sw_alltoall, sw_send )
    4194             :  
    4195             : ! !INPUT PARAMETERS:
    4196             :       integer, intent(in)  :: comm                     ! communicator
    4197             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    4198             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    4199             :       integer(i4), intent(in) :: a1in(*)               ! input array
    4200             :       integer(i4), optional, intent(in) :: a2in(*)     ! second input array
    4201             :       logical, optional, intent(in) :: sw_handshake    ! use flow control and 
    4202             :                                                        !  ready send
    4203             :       integer, optional, intent(in) :: sw_maxreq       ! maximum number of outstanding
    4204             :                                                        !  MPI requests
    4205             :       logical, optional, intent(in) :: sw_alltoall     ! use mpi_alltoall
    4206             :       logical, optional, intent(in) :: sw_send         ! use mpi_send instead of isend
    4207             : 
    4208             : ! !OUTPUT PARAMETERS:
    4209             :       integer(i4), intent(out) :: a1out(*)             ! output array
    4210             :       integer(i4), optional, intent(out) :: a2out(*)   ! second output array
    4211             : !
    4212             : ! !DESCRIPTION:
    4213             : !     XOR-ordered version of all-to-all communication
    4214             : !
    4215             : ! WARNING: mod_comm parameter max_irr might need to be set larger than expected
    4216             : !          when swapping two variables; specifically, max_irr must be at least
    4217             : !          as large as the incoming i4_win%ncall_s + the number of variables to
    4218             : !          be swapped
    4219             : !
    4220             : ! !REVISION HISTORY: 
    4221             : !    08.06.30   Worley      original: derived from mp_sendirr, but using 
    4222             : !                            swapm logic and XOR swap order 
    4223             : !    08.08.22   Worley      removed swapm; reimplemented with native MPI,
    4224             : !                            added flow control/ready send option and maxreq
    4225             : !                            throttling, added alltoall option
    4226             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    4227             : !
    4228             : ! !BUGS:
    4229             : !
    4230             : !EOP
    4231             : !------------------------------------------------------------------------------
    4232             : !BOC
    4233             : !
    4234             : ! !LOCAL VARIABLES:
    4235             :       integer :: i, j, p, istep, num_s, num_r
    4236             :       integer :: comm_pid, comm_size, steps, ierr
    4237             :       integer :: ipe, offset_s, offset_r, offset_0, unitsize, onetwo
    4238             : 
    4239           0 :       integer :: arr_sndlths(0:numpro-1), arr_rcvlths(0:numpro-1)
    4240           0 :       integer :: sndlths(0:numpro-1), sdispls(0:numpro-1)
    4241           0 :       integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1)
    4242           0 :       integer :: swapids(numpro) 
    4243           0 :       integer :: sndids(numpro)  ! nonblocking MPI send request ids
    4244           0 :       integer :: rcvids(numpro)  ! nonblocking MPI recv request ids
    4245           0 :       integer :: hs_snd, hs_rcv(numpro)! handshake variables (send/receive)
    4246           0 :       integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids
    4247           0 :       integer :: InStats(numpro*MPI_STATUS_SIZE)
    4248           0 :       integer :: OutStats(numpro*MPI_STATUS_SIZE)
    4249             : 
    4250             :       integer :: offset_v
    4251             : 
    4252             :       integer :: rstep
    4253             : 
    4254             :       integer :: maxreq, maxreqh
    4255             :       logical :: handshake, alltoall, sendd
    4256             : 
    4257             : #if defined( MODCM_TIMING )
    4258             :       call t_startf('mod_comm communication')
    4259             : #endif
    4260             : 
    4261           0 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    4262           0 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    4263             : 
    4264             : !     num_s = 0 if this process is not part of the sending decomposition
    4265           0 :       num_s = size(send_bl)
    4266           0 :       if (send_bl(1)%Nparcels == -1) then
    4267           0 :          num_s = 0
    4268             :       endif
    4269             : 
    4270             : !     num_r = 0 if this process is not part of the receiving decomposition
    4271           0 :       num_r = size(recv_bl)
    4272           0 :       if (recv_bl(1)%Nparcels == -1) then
    4273           0 :          num_r = 0
    4274             :       endif
    4275             : 
    4276           0 :       if ( present(a2in) .and. (.not. present(a2out)) ) then
    4277           0 :          write(iulog,*) "Fatal mp_swapirr_i4: a2in specified, but a2out missing - exiting"
    4278           0 :          stop
    4279             :       endif
    4280             : 
    4281           0 :       if ( (.not. present(a2in)) .and. present(a2out)) then
    4282           0 :          write(iulog,*) "Fatal mp_swapirr_i4: a2out specified, but a2in missing - exiting"
    4283           0 :          stop
    4284             :       endif
    4285             : 
    4286           0 :       if ( present(sw_handshake) ) then
    4287           0 :          handshake = sw_handshake
    4288           0 :          hs_snd = 1
    4289             :       else
    4290             :          handshake = .false.
    4291             :       endif
    4292             : 
    4293           0 :       if ( present(sw_alltoall) ) then
    4294           0 :          alltoall = sw_alltoall
    4295             :       else
    4296             :          alltoall = .false.
    4297             :       endif
    4298             : 
    4299           0 :       if ( present(sw_send) ) then
    4300           0 :          sendd = sw_send
    4301             :       else
    4302             :          sendd = .false.
    4303             :       endif
    4304             : 
    4305           0 :       onetwo = 1
    4306           0 :       if (present(a2in)) onetwo = 2
    4307           0 :       unitsize = i4_win%size/max_irr
    4308             : 
    4309             : ! advance to unused portion of storage window
    4310           0 :       i4_win%ncall_s = i4_win%ncall_s + 1
    4311             : 
    4312           0 :       if (i4_win%ncall_s .gt. max_irr-onetwo+1) then
    4313           0 :          write(iulog,*) "mp_swapirr_i4: insufficient window storage - exiting"
    4314           0 :          write(iulog,*) "i4_win%ncall_s max_irr = ", i4_win%ncall_s, max_irr
    4315           0 :          stop
    4316             :       endif
    4317             : 
    4318             : ! calculate send lengths and displacements
    4319           0 :       offset_0 = (i4_win%ncall_s-1)*unitsize
    4320           0 :       offset_s = offset_0
    4321           0 :       sndlths(:) = 0
    4322           0 :       sdispls(:) = 0
    4323           0 :       arr_sndlths(:) = 0
    4324           0 :       do ipe=1, num_s
    4325           0 :          sndlths(ipe-1) = send_bl(ipe)%Tot_Size
    4326           0 :          sdispls(ipe-1) = offset_s
    4327           0 :          if (sndlths(ipe-1) .ne. 0) then
    4328             : 
    4329             :             ! pack first array
    4330           0 :             offset_s = offset_s + sndlths(ipe-1)
    4331           0 :             if (offset_s-offset_0 .gt. onetwo*unitsize) then
    4332           0 :               write(iulog,*) "Fatal mp_swapirr_i4: send window out of space - exiting"
    4333           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    4334           0 :                              ipe, unitsize, offset_s, offset_0
    4335           0 :               stop
    4336             :             endif
    4337             : 
    4338           0 :             arr_sndlths(ipe-1) = sndlths(ipe-1)
    4339             : 
    4340             :             ! calculate for second array (if it exists)
    4341           0 :             if ( present(a2in) ) then
    4342             : 
    4343           0 :                offset_s = offset_s + sndlths(ipe-1)
    4344           0 :                if (offset_s-offset_0 .gt. onetwo*unitsize) then
    4345           0 :                  write(iulog,*) "Fatal mp_swapirr_i4: send window out of space - exiting"
    4346           0 :                  write(iulog,*) '2 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    4347           0 :                                 ipe, unitsize, offset_s, offset_0
    4348           0 :                  stop
    4349             :                endif
    4350             : 
    4351           0 :                sndlths(ipe-1) = sndlths(ipe-1) + arr_sndlths(ipe-1)
    4352             : 
    4353             :             endif
    4354             : 
    4355             :          endif
    4356             :       enddo
    4357             : 
    4358             : ! calculate receive lengths and displacements
    4359           0 :       offset_r = offset_0
    4360           0 :       rcvlths(:) = 0
    4361           0 :       rdispls(:) = 0
    4362           0 :       arr_rcvlths(:) = 0
    4363           0 :       do ipe=1, num_r
    4364           0 :          rcvlths(ipe-1) = recv_bl(ipe)%Tot_Size
    4365           0 :          rdispls(ipe-1) = offset_r
    4366           0 :          if (rcvlths(ipe-1) .ne. 0) then
    4367             : 
    4368           0 :             offset_r = offset_r + rcvlths(ipe-1)
    4369           0 :             if (onetwo*unitsize < offset_r-offset_0) then
    4370           0 :               write(iulog,*) "Fatal mp_swapirr_i4: receive window out of space - exiting"
    4371           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    4372           0 :                         ipe, unitsize, offset_r, offset_0
    4373           0 :               stop
    4374             :             endif
    4375             : 
    4376           0 :             arr_rcvlths(ipe-1) = rcvlths(ipe-1)
    4377             : 
    4378             :             ! compute for second array (if it exists)
    4379           0 :             if ( present(a2out) ) then
    4380             : 
    4381           0 :                offset_r = offset_r + rcvlths(ipe-1)
    4382           0 :                if (onetwo*unitsize < offset_r-offset_0) then
    4383           0 :                  write(iulog,*) "Fatal mp_swapirr_i4: receive window out of space - exiting"
    4384           0 :                  write(iulog,*) '2 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    4385           0 :                           ipe, unitsize, offset_r, offset_0
    4386           0 :                  stop
    4387             :                endif
    4388             : 
    4389           0 :                rcvlths(ipe-1) = rcvlths(ipe-1) + arr_rcvlths(ipe-1)
    4390             : 
    4391             :             endif
    4392             : 
    4393             :          endif
    4394             :       enddo
    4395             : 
    4396             : ! Calculate swap partners and number of steps in point-to-point
    4397             : ! implementations of alltoall algorithm.
    4398           0 :       steps = 0
    4399           0 :       do ipe=1,ceil2(comm_size)-1
    4400           0 :          p = pair(comm_size,ipe,comm_pid)
    4401           0 :          if (p >= 0) then
    4402           0 :             if (sndlths(p) > 0 .or. rcvlths(p) > 0) then
    4403           0 :                steps = steps + 1
    4404           0 :                swapids(steps) = p
    4405             :             end if
    4406             :          end if
    4407             :       end do
    4408             : 
    4409           0 :       if (.not. alltoall) then
    4410             : 
    4411           0 :          sndids(1:steps) = MPI_REQUEST_NULL
    4412           0 :          rcvids(1:steps) = MPI_REQUEST_NULL
    4413             : 
    4414           0 :          if (steps .eq. 0) then
    4415           0 :             maxreq  = 0
    4416           0 :             maxreqh = 0
    4417           0 :          elseif (steps .eq. 1) then
    4418           0 :             maxreq  = 1
    4419           0 :             maxreqh = 1
    4420             :          else
    4421           0 :             if ( present(sw_maxreq) ) then
    4422           0 :                if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then
    4423           0 :                   maxreq  = sw_maxreq
    4424           0 :                   if (maxreq > 1) then
    4425           0 :                      maxreqh = maxreq/2
    4426             :                   else
    4427           0 :                      maxreq  = 2
    4428           0 :                      maxreqh = 1
    4429             :                   endif
    4430             :                else
    4431           0 :                   maxreq  = steps
    4432           0 :                   maxreqh = steps
    4433             :                endif
    4434             :             else
    4435           0 :                maxreq  = steps
    4436           0 :                maxreqh = steps
    4437             :             endif
    4438             :          endif
    4439             : 
    4440             : ! Post initial handshake receive requests
    4441           0 :          if (handshake) then
    4442           0 :             do istep=1,maxreq
    4443           0 :                p = swapids(istep)
    4444           0 :                if (sndlths(p) > 0) then
    4445             :                   call mpi_irecv  ( hs_rcv(istep), 1, mp_i4, p, comm_pid, comm, &
    4446           0 :                                     hs_rcvids(istep), ierr )
    4447             :                endif
    4448             :             enddo
    4449             :          endif
    4450             : 
    4451             : ! Post initial receive requests
    4452           0 :          do istep=1,maxreq
    4453           0 :             p = swapids(istep)
    4454           0 :             if (rcvlths(p) > 0) then
    4455           0 :                offset_r = rdispls(p)+1
    4456           0 :                call mpi_irecv ( ga_i4_r(offset_r), rcvlths(p), mp_i4, &
    4457           0 :                                 p, p, comm, rcvids(istep), ierr )
    4458           0 :                if (handshake) then
    4459             :                   call mpi_send( hs_snd, 1, mp_i4, p, p, comm, &
    4460           0 :                                  ierr )
    4461             :                endif
    4462             :             endif
    4463             :          enddo
    4464             :          rstep = maxreq
    4465             : !
    4466             :       endif
    4467             : 
    4468             : ! gather data into global send buffer
    4469           0 :       do istep=1,steps
    4470           0 :          p = swapids(istep)
    4471             : 
    4472           0 :          if (sndlths(p) .ne. 0) then
    4473           0 :             offset_v = sdispls(p)
    4474           0 :             do j = 1, send_bl(p+1)%nparcels
    4475           0 :                do i = 1, send_bl(p+1)%blocksizes(j)
    4476           0 :                   ga_i4_s(offset_v+i) = a1in(send_bl(p+1)%displacements(j)+i)
    4477             :                enddo
    4478           0 :                offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    4479             :             enddo
    4480             : 
    4481             :             ! pack second array (if it exists)
    4482           0 :             if ( present(a2in) ) then
    4483           0 :                offset_v = sdispls(p) + arr_sndlths(p)
    4484           0 :                do j = 1, send_bl(p+1)%nparcels
    4485           0 :                   do i = 1, send_bl(p+1)%blocksizes(j)
    4486           0 :                      ga_i4_s(offset_v+i) = a2in(send_bl(p+1)%displacements(j)+i)
    4487             :                   enddo
    4488           0 :                   offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    4489             :                enddo
    4490             :             endif
    4491             : 
    4492             :          endif
    4493             : 
    4494           0 :          if (.not. alltoall) then
    4495             : 
    4496             : ! Submit new i(r)send request
    4497           0 :             offset_s = sdispls(p)+1
    4498           0 :             if (sndlths(p) > 0) then
    4499           0 :                if (handshake) then
    4500             :                   call mpi_wait( hs_rcvids(istep), MPI_STATUS_IGNORE, ierr )
    4501           0 :                   if (sendd) then
    4502           0 :                      call mpi_rsend( ga_i4_s(offset_s), sndlths(p), mp_i4, &
    4503           0 :                                       p, comm_pid, comm, ierr )
    4504             :                   else
    4505           0 :                      call mpi_irsend( ga_i4_s(offset_s), sndlths(p), mp_i4, &
    4506           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    4507             :                   endif
    4508             :                else
    4509           0 :                   if (sendd) then
    4510           0 :                      call mpi_send ( ga_i4_s(offset_s), sndlths(p), mp_i4, &
    4511           0 :                                       p, comm_pid, comm, ierr )
    4512             :                   else
    4513           0 :                      call mpi_isend ( ga_i4_s(offset_s), sndlths(p), mp_i4, &
    4514           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    4515             :                   endif
    4516             :                endif
    4517             :             endif
    4518             : 
    4519           0 :             if (istep > maxreqh) then
    4520             : ! Wait for oldest irecv request to complete
    4521           0 :                call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr )
    4522             : 
    4523           0 :                if (rstep < steps) then
    4524           0 :                   rstep = rstep + 1
    4525           0 :                   p = swapids(rstep)
    4526             : 
    4527             : ! Submit a new handshake irecv request
    4528           0 :                   if (handshake) then
    4529           0 :                      if (sndlths(p) > 0) then
    4530             :                         call mpi_irecv( hs_rcv(rstep), 1, mp_i4, p, comm_pid, comm, &
    4531           0 :                                         hs_rcvids(rstep), ierr )
    4532             :                      endif
    4533             :                   endif
    4534             : 
    4535             : ! Submit a new irecv request
    4536           0 :                   if (rcvlths(p) > 0) then
    4537           0 :                      offset_r = rdispls(p)+1
    4538           0 :                      call mpi_irecv( ga_i4_r(offset_r), rcvlths(p), mp_i4, &
    4539           0 :                                      p, p, comm, rcvids(rstep), ierr )
    4540           0 :                      if (handshake) then
    4541             :                         call mpi_send ( hs_snd, 1, mp_i4, p, p, comm, &
    4542           0 :                                         ierr )
    4543             :                      endif
    4544             :                   endif
    4545             :                endif
    4546             : 
    4547             : ! Wait for outstanding i(r)send request to complete
    4548           0 :                if (.not. sendd) then
    4549           0 :                   call mpi_wait( sndids(istep-maxreqh), InStats, ierr )
    4550             :                endif
    4551             :             endif
    4552             : !
    4553             :          endif
    4554             : !
    4555             :       enddo
    4556             : 
    4557             : ! local copy to send buffer
    4558           0 :       if (sndlths(comm_pid) .ne. 0) then
    4559             : 
    4560           0 :          offset_v = sdispls(comm_pid)
    4561           0 :          do j = 1, send_bl(comm_pid+1)%nparcels
    4562           0 :             do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    4563           0 :                ga_i4_s(offset_v+i) = a1in(send_bl(comm_pid+1)%displacements(j)+i)
    4564             :             enddo
    4565           0 :             offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    4566             :          enddo
    4567             : 
    4568             :          ! pack second array (if it exists)
    4569           0 :          if ( present(a2in) ) then
    4570           0 :             offset_v = sdispls(comm_pid) + arr_sndlths(comm_pid)
    4571           0 :             do j = 1, send_bl(comm_pid+1)%nparcels
    4572           0 :                do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    4573           0 :                   ga_i4_s(offset_v+i) = a2in(send_bl(comm_pid+1)%displacements(j)+i)
    4574             :                enddo
    4575           0 :                offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    4576             :             enddo
    4577             :          endif
    4578             : 
    4579           0 :          if (.not. alltoall) then
    4580           0 :             ga_i4_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = &
    4581           0 :                ga_i4_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid))
    4582             :          endif
    4583             : 
    4584             :       endif
    4585             : 
    4586           0 :       if (alltoall) then
    4587             :          call mpi_alltoallv (ga_i4_s, sndlths, sdispls, mp_i4, &
    4588             :                              ga_i4_r, rcvlths, rdispls, mp_i4, &
    4589           0 :                              comm, ierror)
    4590             :       endif
    4591             : 
    4592             : ! local copy from receive buffer
    4593           0 :       if (rcvlths(comm_pid) .ne. 0) then
    4594             : 
    4595           0 :          offset_v = rdispls(comm_pid)
    4596           0 :          do j = 1, recv_bl(comm_pid+1)%Nparcels
    4597           0 :             do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    4598           0 :                a1out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_i4_r(offset_v+i)
    4599             :             enddo
    4600           0 :             offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    4601             :          enddo
    4602             : 
    4603             :          ! scatter data for second array (if it exists)
    4604           0 :          if ( present(a2out) ) then
    4605           0 :             offset_v = rdispls(comm_pid) + arr_rcvlths(comm_pid)
    4606           0 :             do j = 1, recv_bl(comm_pid+1)%Nparcels
    4607           0 :                do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    4608           0 :                   a2out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_i4_r(offset_v+i)
    4609             :                enddo
    4610           0 :                offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    4611             :             enddo
    4612             :          endif
    4613             : 
    4614             :       endif
    4615             : 
    4616             : ! scatter data from global receive buffer to final destination
    4617           0 :       do istep=1,steps
    4618           0 :          p = swapids(istep)
    4619             : 
    4620           0 :          if (.not. alltoall) then
    4621           0 :             if (istep > steps-maxreqh) then
    4622             :                call mpi_wait( rcvids(istep), OutStats, ierr )
    4623             :             endif
    4624             :          endif
    4625             : 
    4626           0 :          if (rcvlths(p) .ne. 0) then
    4627             : 
    4628           0 :             offset_v = rdispls(p)
    4629           0 :             do j = 1, recv_bl(p+1)%Nparcels
    4630           0 :                do i = 1, recv_bl(p+1)%blocksizes(j)
    4631           0 :                   a1out(recv_bl(p+1)%displacements(j)+i) = ga_i4_r(offset_v+i)
    4632             :                enddo
    4633           0 :                offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    4634             :             enddo
    4635             : 
    4636             :             ! scatter data for second array (if it exists)
    4637           0 :             if ( present(a2out) ) then
    4638             : 
    4639           0 :                offset_v = rdispls(p) + arr_rcvlths(p)
    4640           0 :                do j = 1, recv_bl(p+1)%Nparcels
    4641           0 :                   do i = 1, recv_bl(p+1)%blocksizes(j)
    4642           0 :                      a2out(recv_bl(p+1)%displacements(j)+i) = ga_i4_r(offset_v+i)
    4643             :                   enddo
    4644           0 :                   offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    4645             :                enddo
    4646             : 
    4647             :             endif
    4648             : 
    4649             :          endif
    4650             :       enddo
    4651             : 
    4652             : ! Wait for any outstanding send requests to complete.
    4653           0 :       if (.not. alltoall .and. .not. sendd) then
    4654           0 :          call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr )
    4655             :       endif
    4656             : 
    4657             : ! clean-up
    4658             : ! make used portion of storage window available for reuse
    4659           0 :       i4_win%ncall_s = i4_win%ncall_s - 1
    4660             : 
    4661             : #if defined( MODCM_TIMING )
    4662             :       call t_stopf('mod_comm communication')
    4663             : #endif
    4664             : 
    4665             : !EOC
    4666           0 :       end subroutine mp_swapirr_i4
    4667             : !------------------------------------------------------------------------------
    4668             : !
    4669             : !------------------------------------------------------------------------------
    4670             : !BOP
    4671             : ! !ROUTINE: pair 
    4672             : !
    4673             : ! !INTERFACE:
    4674           0 :       integer function pair(np,p,k)
    4675             : !
    4676             : ! !INPUT PARAMETERS:
    4677             :       integer :: np
    4678             :       integer :: p
    4679             :       integer :: k
    4680             : ! !DESCRIPTION:
    4681             : !
    4682             : !     Bitwise XOR of arguments p and k, if less than upper bound np
    4683             : !
    4684             : ! !REVISION HISTORY: 
    4685             : !    2008.08.21   Worley         Imported from spmdutils
    4686             : !
    4687             : !EOP
    4688             : !------------------------------------------------------------------------------
    4689             : !BOC
    4690             : !
    4691             : ! !LOCAL VARIABLES:
    4692             :       integer q
    4693             : !
    4694           0 :       q = ieor(p,k)
    4695           0 :       if ( q > np-1 ) then
    4696             :          pair = -1
    4697             :       else
    4698           0 :          pair = q
    4699             :       endif
    4700             : 
    4701             :       return
    4702             : 
    4703             : !EOC
    4704             :       end function pair
    4705             : !------------------------------------------------------------------------------
    4706             : !
    4707             : !------------------------------------------------------------------------------
    4708             : !BOP
    4709             : ! !ROUTINE: ceil2
    4710             : !
    4711             : ! !INTERFACE:
    4712    12160512 :      integer function ceil2(n)
    4713             : !
    4714             : ! !INPUT PARAMETERS:
    4715             :      integer :: n
    4716             : ! !DESCRIPTION:
    4717             : !
    4718             : !     Smallest power of 2 greater than or equal to the argument
    4719             : !
    4720             : ! !REVISION HISTORY: 
    4721             : !    2008.08.21   Worley         Imported from spmdutils
    4722             : !
    4723             : !EOP
    4724             : !------------------------------------------------------------------------------
    4725             : !BOC
    4726             : !
    4727             : ! !LOCAL VARIABLES:
    4728             :      integer p
    4729             : 
    4730    12160512 :      p=1
    4731   133765632 :      do while ( p < n )
    4732   121605120 :         p=p*2
    4733             :      enddo
    4734    12160512 :      ceil2=p
    4735             : 
    4736             :      return
    4737             : !EOC
    4738             :      end function ceil2
    4739             : !------------------------------------------------------------------------------
    4740             : !
    4741             : !------------------------------------------------------------------------------
    4742             : # if defined( MOD_ASSUMED_SIZE )
    4743             : !BOP
    4744             : ! !ROUTINE: mp_sendtrirr --- Initiate communication of contiguous tracer parcels
    4745             : !
    4746             : ! !INTERFACE:
    4747     5160960 :       subroutine mp_sendtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    4748             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    4749             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    4750             :                                 modc )
    4751             :  
    4752             : ! !INPUT PARAMETERS:
    4753             :       integer, intent(in)  :: comm      !  communicator
    4754             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    4755             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    4756             :       integer, intent(in)           :: mbeg            ! initial tracer index
    4757             :       integer, intent(in)           :: mend            ! final tracer index
    4758             :       integer, intent(in)           :: mq              ! total tracer indices
    4759             :       integer, intent(in)           :: ifirsts         ! first I index of source
    4760             :       integer, intent(in)           :: ilasts          ! last I index of source
    4761             :       integer, intent(in)           :: jfirsts         ! first j index of source
    4762             :       integer, intent(in)           :: jlasts          ! last j index of source
    4763             :       integer, intent(in)           :: kfirsts         ! first k index of source
    4764             :       integer, intent(in)           :: klasts          ! last k index of source
    4765             :       integer, intent(in)           :: ifirstr         ! first I index of target
    4766             :       integer, intent(in)           :: ilastr          ! last I index of target
    4767             :       integer, intent(in)           :: jfirstr         ! first j index of target
    4768             :       integer, intent(in)           :: jlastr          ! last j index of target
    4769             :       integer, intent(in)           :: kfirstr         ! first k index of target
    4770             :       integer, intent(in)           :: klastr          ! last k index of target
    4771             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    4772             :                                                        ! 2: handshake
    4773             :                                                        ! 3: send vs isend
    4774             :                                                        ! 4: max number of outstanding requests
    4775             :       real(r8), intent(in) :: qin(*) ! input tracer array
    4776             : 
    4777             : ! !OUTPUT PARAMETERS:
    4778             :       real(r8), intent(out) :: qout(*) ! output tracer array
    4779             : !
    4780             : ! !DESCRIPTION:
    4781             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    4782             : !     Modc(1): if 0, use original approach of posting all communications here and placing
    4783             : !     wait points in mp_recvtrirr; if 1, call swap routine with p2p messages; if 2, call swap
    4784             : !     routine with a2a messages. 
    4785             : !     Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted)
    4786             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    4787             : !     Modc(4): maximum number of outstanding requests (applies to swap routines only)
    4788             : !
    4789             : ! !REVISION HISTORY: 
    4790             : !    02.08.13   Sawyer      Creation
    4791             : !    02.11.06   Mirin       Optimizations
    4792             : !    03.03.03   Sawyer      Use partneroffset
    4793             : !    03.06.24   Sawyer      Integrated Use_Mpi_Types; added qout
    4794             : !    04.02.24   Mirin       Various mpi2 options
    4795             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    4796             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    4797             : !
    4798             : ! !BUGS:
    4799             : !
    4800             : !EOP
    4801             : !------------------------------------------------------------------------------
    4802             : !BOC
    4803             : !
    4804             : ! !LOCAL VARIABLES:
    4805             :       integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method
    4806             :       integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0
    4807             :       integer i, j, send_tag, recv_tag, num_s, num_r, m
    4808    10321920 :       integer :: offset_v (Max_Nparcels)
    4809    10321920 :       integer :: hs_snd, hs_rcv(numpro), hs_rcvids(numpro)
    4810             :       integer ipe2, ceil2num
    4811             :       integer numtr, numtrm
    4812             :       integer sw_local, maxreq_local
    4813             :       logical hs_local, send_local
    4814             :       logical sw_alltoall
    4815             :       integer comm_pid
    4816             :       integer ijks, ijkr, ij
    4817             : 
    4818             : 
    4819             : #if defined( MODCM_TIMING )
    4820             :       call t_startf('mod_comm communication')
    4821             : #endif
    4822             : 
    4823     5160960 :       if (present(modc)) then
    4824     5160960 :          sw_local   = modc(1)
    4825     5160960 :          hs_local   = (modc(2) .eq. 1)
    4826     5160960 :          send_local = (modc(3) .eq. 1)
    4827     5160960 :          maxreq_local = modc(4)
    4828             :       else
    4829           0 :          sw_local = 0
    4830           0 :          hs_local = .true.
    4831           0 :          send_local = .true.
    4832           0 :          maxreq_local = -1
    4833             :       endif
    4834             : 
    4835             : ! Do not call mp_swaptrirr unless mod_method equals 0
    4836     5160960 :       mod_method = recv_bl(1)%method
    4837     5160960 :       if (mod_method .gt. 0) sw_local = 0
    4838             : 
    4839     5160960 :     if (sw_local .gt. 0) then
    4840           0 :          sw_alltoall = (sw_local .eq. 2)
    4841             :          call mp_swaptrirr(comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, &
    4842             :                            ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, &
    4843             :                            ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, &
    4844             :                            sw_handshake=hs_local, sw_maxreq=maxreq_local,     &
    4845           0 :                            sw_alltoall=sw_alltoall, sw_send=send_local)
    4846             :     else
    4847             : 
    4848     5160960 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    4849             : 
    4850     5160960 :       hs_snd = 1
    4851     5160960 :       ceil2num = ceil2(numpro)
    4852             : 
    4853     5160960 :       numtrm = mend - mbeg
    4854     5160960 :       numtr = numtrm + 1
    4855             : 
    4856             : !     num_s = 0 if this processes is not part of the sending decomposition
    4857     5160960 :       num_s = size(send_bl)
    4858     5160960 :       if (send_bl(1)%Nparcels == -1) then
    4859           0 :          num_s = 0
    4860             :       endif
    4861             : 
    4862             : !     num_r = 0 if this processes is not part of the receiving decomposition
    4863     5160960 :       num_r = size(recv_bl)
    4864     5160960 :       if (recv_bl(1)%Nparcels == -1) then
    4865           0 :          num_r = 0
    4866             :       endif
    4867             : 
    4868     5160960 :       r8_win%ncall_s = r8_win%ncall_s + 1
    4869             : 
    4870     5160960 :       ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1)
    4871     5160960 :       ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1)
    4872             : 
    4873     5160960 :      if (mod_method .gt. 0) then
    4874             : !
    4875             : ! mpi derived types
    4876           0 :       if (r8_win%ncall_s .gt. MaxTrf-numtrm) then
    4877           0 :          write(iulog,*) "mp_sendtrirr: derived type handle count exceeded - exiting"
    4878           0 :          write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf
    4879           0 :          stop
    4880             :       endif
    4881             : !
    4882             : ! MPI: Irecv over all processes
    4883             : !
    4884           0 :       if (hs_local) then
    4885           0 :          hs_rcvids(:) = MPI_REQUEST_NULL
    4886           0 :          do ipe2=1, ceil2num
    4887           0 :             ipe = ieor(ipe2-1,comm_pid) + 1
    4888           0 :             if (ipe .gt. num_s) cycle
    4889           0 :             if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    4890           0 :                if (ipe-1 /= comm_pid) &
    4891           0 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, ipe-1, comm_pid, comm, &
    4892           0 :                                    hs_rcvids(ipe), ierr )
    4893             :             endif
    4894             :          enddo
    4895             :       endif
    4896             : 
    4897           0 :       OutHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL
    4898           0 :       do ipe2=1, ceil2num
    4899           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    4900           0 :         if (ipe .gt. num_r) cycle
    4901             : !
    4902             : ! Receive the buffers with MPI_Irecv. Non-blocking
    4903             : !
    4904           0 :         if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    4905           0 :           recv_tag = ipe-1 + modcam_tagoffset
    4906           0 :           do m = mbeg, mend
    4907           0 :              call mpi_irecv( qout((m-1)*ijkr+1), 1, recv_bl(ipe)%type, ipe-1, recv_tag,   &
    4908           0 :                              comm, OutHandle(ipe,r8_win%ncall_s+m-mbeg), ierr )
    4909             :           enddo
    4910           0 :           if (hs_local) then
    4911           0 :              if (ipe-1 /= comm_pid) &
    4912           0 :                call MPI_SEND ( hs_snd, 1, mp_i4, ipe-1, ipe-1, comm, ierr )
    4913             :           endif
    4914             :         endif
    4915             :       enddo
    4916             : 
    4917             : !
    4918             : ! MPI: Isend/Send over all processes; use risend/rsend with hs
    4919             : !
    4920           0 :       InHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL
    4921           0 :       do ipe2=1, ceil2num
    4922           0 :         ipe = ieor(ipe2-1,comm_pid) + 1
    4923           0 :         if (ipe .gt. num_s) cycle
    4924             : 
    4925             : !
    4926             : ! Send the individual buffers with non-blocking sends
    4927             : !
    4928           0 :         if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    4929           0 :           send_tag = comm_pid + modcam_tagoffset
    4930           0 :           if (hs_local) then
    4931           0 :              if (ipe-1 /= comm_pid) &
    4932           0 :                 call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    4933           0 :              if (send_local) then
    4934           0 :                 do m = mbeg, mend
    4935           0 :                    call mpi_rsend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    4936           0 :                                    comm, ierr )
    4937             :                 enddo
    4938             :              else
    4939           0 :                 do m = mbeg, mend
    4940           0 :                    call mpi_irsend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    4941           0 :                                     comm, InHandle(ipe,r8_win%ncall_s), ierr )
    4942             :                 enddo
    4943             :              endif
    4944             :           else
    4945           0 :              if (send_local) then
    4946           0 :                 do m = mbeg, mend
    4947           0 :                    call mpi_send( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    4948           0 :                                   comm, ierr )
    4949             :                 enddo
    4950             :              else
    4951           0 :                 do m = mbeg, mend
    4952           0 :                    call mpi_isend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    4953           0 :                                    comm, InHandle(ipe,r8_win%ncall_s), ierr )
    4954             :                 enddo
    4955             :              endif
    4956             :           endif
    4957             :         endif
    4958             :       enddo
    4959             :      else
    4960             : 
    4961             : ! temporary contiguous buffers
    4962             : 
    4963     5160960 :       if (r8_win%ncall_s .gt. max_irr-numtrm) then
    4964           0 :          write(iulog,*) "mp_sendtrirr: insufficient window storage - exiting"
    4965           0 :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    4966           0 :          stop
    4967             :       endif
    4968     5160960 :       unitsize = r8_win%size/max_irr
    4969             : 
    4970             : ! issue call to receive data in global receive buffer
    4971     5160960 :       offset_0 = (r8_win%ncall_s-1)*unitsize
    4972     5160960 :       offset_s = offset_0
    4973     5160960 :       offset_r = offset_0
    4974             : 
    4975     5160960 :       if (hs_local) then
    4976  3968778240 :          hs_rcvids(:) = MPI_REQUEST_NULL
    4977  5289984000 :          do ipe2=1, ceil2num
    4978  5284823040 :             ipe = ieor(ipe2-1,comm_pid) + 1
    4979  5284823040 :             if (ipe .gt. num_s) cycle
    4980  3963617280 :             qsize = numtr*send_bl(ipe)%Tot_Size
    4981  3968778240 :             if (qsize .ne. 0) then
    4982    61931520 :                r8_win%dest = ipe-1
    4983    61931520 :                send_tag = comm_pid + modcam_tagoffset
    4984    61931520 :                if (r8_win%dest /= comm_pid) &
    4985    61850880 :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, r8_win%dest, send_tag, comm, &
    4986    61850880 :                                    hs_rcvids(ipe), ierr )
    4987             :             endif
    4988             :          enddo
    4989             :       endif
    4990             : 
    4991  5289984000 :       do ipe2=1, ceil2num
    4992  5284823040 :          ipe = ieor(ipe2-1,comm_pid) + 1
    4993  5284823040 :          if (ipe .gt. num_r) cycle
    4994  3963617280 :          r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size
    4995  3968778240 :          if (r8_win%size_r .ne. 0) then
    4996    61931520 :             r8_win%offset_r = offset_r
    4997    61931520 :             offset_r = offset_r + r8_win%size_r
    4998    61931520 :             r8_win%src = ipe-1
    4999    61931520 :             if (numtr*unitsize >= offset_r-offset_0) then
    5000    61931520 :               recv_tag = r8_win%src + modcam_tagoffset
    5001    61931520 :               qsize    = r8_win%size_r
    5002    61931520 :               r8_win%nrecv    = r8_win%nrecv + 1
    5003           0 :               call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, &
    5004    61931520 :                              recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror)
    5005    61931520 :               if (hs_local) then
    5006    61931520 :                  if (r8_win%src /= comm_pid) &
    5007    61850880 :                    call MPI_SEND ( hs_snd, 1, mp_i4, r8_win%src, recv_tag, comm, ierror)
    5008             :               endif
    5009             :             else
    5010           0 :               write(iulog,*) "Fatal mp_sendtrirr: receive window out of space - exiting"
    5011           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    5012           0 :                         ipe, unitsize, offset_r, offset_0
    5013           0 :               stop
    5014             :             endif
    5015             :          endif
    5016             :       enddo
    5017             : ! gather data into global send buffer
    5018  5289984000 :       do ipe2=1, ceil2num
    5019  5284823040 :          ipe = ieor(ipe2-1,comm_pid) + 1
    5020  5284823040 :          if (ipe .gt. num_s) cycle
    5021  3963617280 :          qsize = numtr*send_bl(ipe)%Tot_Size
    5022  3968778240 :          if (qsize .ne. 0) then
    5023    61931520 :             r8_win%dest = ipe-1
    5024    61931520 :             r8_win%offset_s = offset_s
    5025    61931520 :             offset_s = offset_s + qsize
    5026    61931520 :             if (offset_s-offset_0 .gt. numtr*unitsize) then
    5027           0 :               write(iulog,*) "Fatal mp_sendtrirr: send window out of space - exiting"
    5028           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    5029           0 :                         ipe, unitsize, offset_s, offset_0
    5030           0 :               stop
    5031             :             endif
    5032             : 
    5033    61931520 :             offset_v(1) = r8_win%offset_s
    5034   495452160 :             do j = 2, send_bl(ipe)%nparcels
    5035   495452160 :                offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1)
    5036             :             enddo
    5037             : 
    5038   557383680 :             do j = 1, send_bl(ipe)%nparcels
    5039  2043740160 :                do m = mbeg, mend
    5040 37654364160 :                   do i = 1, send_bl(ipe)%blocksizes(j)
    5041 35672555520 :                      ij = send_bl(ipe)%displacements(j)+i
    5042 37158912000 :                      ga_r8_s(send_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) = qin((m-1)*ijks+ij)
    5043             :                   enddo
    5044             :                enddo
    5045             :             enddo
    5046             : 
    5047             : ! nonblocking send
    5048    61931520 :             send_tag = comm_pid + modcam_tagoffset
    5049    61931520 :             r8_win%nsend = r8_win%nsend + 1
    5050    61931520 :             if (hs_local) then
    5051    61931520 :                if (r8_win%dest /= comm_pid) &
    5052    61850880 :                   call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    5053    61931520 :                if (send_local) then
    5054           0 :                   call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    5055    61931520 :                                  send_tag, comm, ierr)
    5056             :                else
    5057           0 :                   call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    5058           0 :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    5059             :                endif
    5060             :             else
    5061           0 :                if (send_local) then
    5062           0 :                   call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    5063           0 :                                  send_tag, comm, ierr)
    5064             :                else
    5065           0 :                   call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    5066           0 :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    5067             :                endif
    5068             :             endif
    5069             :          endif
    5070             :       enddo
    5071             : 
    5072             :      endif   !  mod_method
    5073             : 
    5074     5160960 :       r8_win%ncall_s = r8_win%ncall_s + numtrm
    5075             : 
    5076             :     endif   !  sw_local
    5077             : 
    5078             : #if defined( MODCM_TIMING )
    5079             :       call t_stopf('mod_comm communication')
    5080             : #endif
    5081             : 
    5082     5160960 :       end subroutine mp_sendtrirr
    5083             : !------------------------------------------------------------------------------
    5084             : !
    5085             : !------------------------------------------------------------------------------
    5086             : !BOP
    5087             : ! !ROUTINE: mp_recvtrirr --- Finalize communication of contiguous tracer parcels
    5088             : !
    5089             : ! !INTERFACE:
    5090     5160960 :       subroutine mp_recvtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    5091             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    5092             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    5093             :                                 modc )
    5094             :  
    5095             : ! !INPUT PARAMETERS:
    5096             :       integer, intent(in)  :: comm      !  communicator
    5097             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    5098             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    5099             :       integer, intent(in)           :: mbeg            ! initial tracer index
    5100             :       integer, intent(in)           :: mend            ! final tracer index
    5101             :       integer, intent(in)           :: mq              ! total tracer indices
    5102             :       integer, intent(in)           :: ifirsts         ! first I index of source
    5103             :       integer, intent(in)           :: ilasts          ! last I index of source
    5104             :       integer, intent(in)           :: jfirsts         ! first j index of source
    5105             :       integer, intent(in)           :: jlasts          ! last j index of source
    5106             :       integer, intent(in)           :: kfirsts         ! first k index of source
    5107             :       integer, intent(in)           :: klasts          ! last k index of source
    5108             :       integer, intent(in)           :: ifirstr         ! first I index of target
    5109             :       integer, intent(in)           :: ilastr          ! last I index of target
    5110             :       integer, intent(in)           :: jfirstr         ! first j index of target
    5111             :       integer, intent(in)           :: jlastr          ! last j index of target
    5112             :       integer, intent(in)           :: kfirstr         ! first k index of target
    5113             :       integer, intent(in)           :: klastr          ! last k index of target
    5114             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    5115             :                                                        ! 2: handshake
    5116             :                                                        ! 3: send vs isend
    5117             :                                                        ! 4: max number of outstanding requests
    5118             :       real(r8), intent(in) :: qin(*) ! input tracer array
    5119             : ! !OUTPUT PARAMETERS:
    5120             :       real(r8), intent(out) :: qout(*) ! output tracer array
    5121             : !
    5122             : ! !DESCRIPTION:
    5123             : !     Complete transfer of a generalized region initiated by {\tt mp\_sendtrirr}.
    5124             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    5125             : !     Modc(1): if 0, use original approach of posting all communications in mp_sendtrirr and
    5126             : !     placing wait points here; otherwise don't do anything - mp_swaptrirr is called from mp_sendirr.
    5127             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    5128             : !
    5129             : ! !REVISION HISTORY:
    5130             : !    02.08.15   Sawyer      Creation
    5131             : !    02.11.06   Mirin       Optimizations
    5132             : !    03.03.03   Sawyer      Now using packed arrays for MPI2
    5133             : !    04.02.24   Mirin       Various mpi2 options
    5134             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    5135             : !
    5136             : !EOP
    5137             : !------------------------------------------------------------------------------
    5138             : !BOC
    5139             :       integer :: ipe, blocksize, offset_r, mod_method
    5140             :       integer unitsize, offset_0
    5141             :       integer Ierr
    5142    10321920 :       integer InStats(numpro*MPI_STATUS_SIZE)
    5143    10321920 :       integer OutStats(numpro*MPI_STATUS_SIZE)
    5144             :       integer i, j, num_r, num_s, m
    5145     5160960 :       integer :: offset_v (Max_Nparcels)
    5146             :       integer ipe2, ceil2num
    5147             :       integer numtr, numtrm
    5148             :       integer sw_local, maxreq_local
    5149             :       logical hs_local, send_local
    5150             :       logical sw_alltoall
    5151             :       integer comm_size, comm_pid
    5152             :       integer ijks, ijkr, ij
    5153             : 
    5154     5160960 :       if (present(modc)) then
    5155     5160960 :          sw_local   = modc(1)
    5156     5160960 :          hs_local   = (modc(2) .eq. 1)
    5157     5160960 :          send_local = (modc(3) .eq. 1)
    5158     5160960 :          maxreq_local = modc(4)
    5159             :       else
    5160             :          sw_local = 0
    5161     5160960 :          hs_local = .true.
    5162             :          send_local = .true.
    5163     5160960 :          maxreq_local = -1
    5164             :       endif
    5165             : 
    5166             : ! Do not call mp_swaptrirr (hence return) unless mod_method equals 0
    5167     5160960 :       mod_method = recv_bl(1)%method
    5168     5160960 :       if (mod_method .gt. 0) sw_local = 0
    5169             : 
    5170             : ! Return if swap_irr
    5171     5160960 :       if (sw_local .gt. 0) return
    5172             : 
    5173             : #if defined( MODCM_TIMING )
    5174             :       call t_startf('mod_comm communication')
    5175             : #endif
    5176             : 
    5177     5160960 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    5178     5160960 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    5179             : 
    5180     5160960 :       ceil2num = ceil2(numpro)
    5181             : 
    5182     5160960 :       numtrm = mend - mbeg
    5183     5160960 :       numtr = numtrm + 1
    5184             : 
    5185             : !     num_s = 0 if this processes is not part of the sending decomposition
    5186     5160960 :       num_s = size(send_bl)
    5187     5160960 :       if (send_bl(1)%Nparcels == -1) then
    5188           0 :          num_s = 0
    5189             :       endif
    5190             : 
    5191             : !     num_r = 0 if this processes is not part of the receiving decomposition
    5192     5160960 :       num_r = size(recv_bl)
    5193     5160960 :       if (recv_bl(1)%Nparcels == -1) then
    5194           0 :          num_r = 0
    5195             :       endif
    5196             : 
    5197     5160960 :       r8_win%ncall_r = r8_win%ncall_r + 1
    5198             : 
    5199     5160960 :       ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1)
    5200     5160960 :       ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1)
    5201             : 
    5202     5160960 :     if (mod_method .gt. 0) then
    5203             : 
    5204             : ! mpi derived types
    5205           0 :       if (r8_win%ncall_r .gt. MaxTrf-numtrm) then
    5206           0 :          write(iulog,*) "mp_recvtrirr: derived type handle count exceeded - exiting"
    5207           0 :          write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf
    5208           0 :          stop
    5209             :       endif
    5210             : 
    5211           0 :       if (num_s .gt. 0 .and. (.not. send_local)) then
    5212           0 :          do m = mbeg, mend
    5213           0 :             CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+m-mbeg), InStats, Ierr )
    5214             :          enddo
    5215             :       endif
    5216           0 :       if (num_r .gt. 0) then
    5217           0 :          do m = mbeg, mend
    5218           0 :             CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+m-mbeg), OutStats, Ierr )
    5219             :          enddo
    5220             :       endif
    5221             : 
    5222             :     else
    5223             : 
    5224             : ! temporary contiguous buffer / global window
    5225             : 
    5226     5160960 :       if (r8_win%ncall_r .gt. max_irr-numtrm) then
    5227           0 :          write(iulog,*) "mp_recvtrirr: insufficient window storage - exiting"
    5228           0 :          write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr
    5229           0 :          stop
    5230             :       endif
    5231     5160960 :       unitsize = r8_win%size/max_irr
    5232             : 
    5233             : ! scatter data from global receive buffer to final destination
    5234     5160960 :       offset_0 = (r8_win%ncall_r-1)*unitsize
    5235     5160960 :       offset_r = offset_0
    5236             : 
    5237  5289984000 :       do ipe2=1, ceil2num
    5238  5284823040 :          ipe = ieor(ipe2-1,comm_pid) + 1
    5239  5284823040 :          if (ipe .gt. num_r) cycle
    5240  3963617280 :          r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size
    5241  3968778240 :          if (r8_win%size_r .ne. 0) then
    5242    61931520 :             r8_win%offset_r = offset_r
    5243    61931520 :             offset_r = offset_r + r8_win%size_r
    5244    61931520 :             if (offset_r-offset_0 .gt. numtr*unitsize) then
    5245           0 :               write(iulog,*) "Fatal mp_recvtrirr: receive window out of space - exiting"
    5246           0 :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    5247           0 :                         ipe, unitsize, offset_r, offset_0
    5248           0 :               stop
    5249             :             endif
    5250             : 
    5251    61931520 :             r8_win%nread = r8_win%nread + 1
    5252    61931520 :             call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr)
    5253             : 
    5254    61931520 :             offset_v(1) = r8_win%offset_r
    5255   495452160 :             do j = 2, recv_bl(ipe)%Nparcels
    5256   495452160 :                offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1)
    5257             :             enddo
    5258             : 
    5259   557383680 :             do j = 1, recv_bl(ipe)%Nparcels
    5260  2043740160 :                do m = mbeg, mend
    5261 37654364160 :                   do i = 1, recv_bl(ipe)%blocksizes(j)
    5262 35672555520 :                      ij = recv_bl(ipe)%displacements(j)+i
    5263 37158912000 :                      qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i)
    5264             :                   enddo
    5265             :                enddo
    5266             :             enddo
    5267             : 
    5268             :          endif
    5269             :       enddo
    5270             : 
    5271     5160960 :       if ((r8_win%ncall_s == r8_win%ncall_r + numtrm) .and. (.not. send_local)) then
    5272           0 :          call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror)
    5273             :       endif
    5274             : 
    5275             :     endif    !    mod_method .gt. 0
    5276             : 
    5277     5160960 :     r8_win%ncall_r = r8_win%ncall_r + numtrm
    5278             : 
    5279     5160960 :     if (r8_win%ncall_s == r8_win%ncall_r) then
    5280     5160960 :        r8_win%nsend = 0
    5281     5160960 :        r8_win%nrecv = 0
    5282     5160960 :        r8_win%nread = 0
    5283     5160960 :        r8_win%ncall_s = 0
    5284     5160960 :        r8_win%ncall_r = 0
    5285             :     endif
    5286             : 
    5287             : #if defined( MODCM_TIMING )
    5288             :       call t_stopf('mod_comm communication')
    5289             : #endif
    5290             : 
    5291             : !EOC
    5292             :       end subroutine mp_recvtrirr
    5293             : !------------------------------------------------------------------------------
    5294             : !
    5295             : !------------------------------------------------------------------------------
    5296             : !BOP
    5297             : ! !ROUTINE: mp_swaptrirr --- Write r8 contiguous parcels to global array
    5298             : !                            using XOR swap ordering - for multiple tracers
    5299             : !
    5300             : ! !INTERFACE:
    5301           0 :       subroutine mp_swaptrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    5302             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    5303             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    5304             :                                 sw_handshake, sw_maxreq, sw_alltoall, sw_send   )
    5305             :  
    5306             : ! !INPUT PARAMETERS:
    5307             :       integer, intent(in)  :: comm                     ! communicator
    5308             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    5309             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    5310             :       integer, intent(in)           :: mbeg            ! initial tracer index
    5311             :       integer, intent(in)           :: mend            ! final tracer index
    5312             :       integer, intent(in)           :: mq              ! total tracer indices
    5313             :       integer, intent(in)           :: ifirsts         ! first I index of source
    5314             :       integer, intent(in)           :: ilasts          ! last I index of source
    5315             :       integer, intent(in)           :: jfirsts         ! first j index of source
    5316             :       integer, intent(in)           :: jlasts          ! last j index of source
    5317             :       integer, intent(in)           :: kfirsts         ! first k index of source
    5318             :       integer, intent(in)           :: klasts          ! last k index of source
    5319             :       integer, intent(in)           :: ifirstr         ! first I index of target
    5320             :       integer, intent(in)           :: ilastr          ! last I index of target
    5321             :       integer, intent(in)           :: jfirstr         ! first j index of target
    5322             :       integer, intent(in)           :: jlastr          ! last j index of target
    5323             :       integer, intent(in)           :: kfirstr         ! first k index of target
    5324             :       integer, intent(in)           :: klastr          ! last k index of target
    5325             :       logical, optional, intent(in) :: sw_handshake    ! use flow control and 
    5326             :                                                        !  ready send
    5327             :       integer, optional, intent(in) :: sw_maxreq       ! maximum number of outstanding
    5328             :                                                        !  MPI requests
    5329             :       logical, optional, intent(in) :: sw_alltoall     ! use mpi_alltoall
    5330             :       logical, optional, intent(in) :: sw_send         ! use mpi_send instead of isend
    5331             :       real(r8), intent(in) :: qin(*) ! input tracer array
    5332             : 
    5333             : ! !OUTPUT PARAMETERS:
    5334             :       real(r8), intent(out) :: qout(*) ! output tracer array
    5335             : !
    5336             : ! !DESCRIPTION:
    5337             : !
    5338             : !     XOR-ordered version of all-to-all communication
    5339             : !
    5340             : ! WARNING: mod_comm parameter max_irr might need to be set larger than expected
    5341             : !          when swapping multiple variables; specifically, max_irr must be at least
    5342             : !          as large as the incoming r8_win%ncall_s + the number of variables to
    5343             : !          be swapped
    5344             : !
    5345             : ! !REVISION HISTORY: 
    5346             : !    08.06.30   Worley      original: derived from mp_sendirr, but using 
    5347             : !                            swapm logic and XOR swap order 
    5348             : !    08.08.22   Worley      removed swapm; reimplemented with native MPI,
    5349             : !                            added flow control/ready send option and maxreq
    5350             : !                            throttling, added alltoall option
    5351             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    5352             : !
    5353             : ! !BUGS:
    5354             : !
    5355             : !EOP
    5356             : !------------------------------------------------------------------------------
    5357             : !BOC
    5358             : !
    5359             : ! !LOCAL VARIABLES:
    5360             :       integer :: i, j, p, istep, num_s, num_r
    5361             :       integer :: comm_pid, comm_size, steps, ierr
    5362             :       integer :: ipe, offset_s, offset_r, offset_0, unitsize
    5363             : 
    5364           0 :       integer :: sndlths(0:numpro-1), sdispls(0:numpro-1)
    5365           0 :       integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1)
    5366           0 :       integer :: swapids(numpro) 
    5367           0 :       integer :: sndids(numpro)  ! nonblocking MPI send request ids
    5368           0 :       integer :: rcvids(numpro)  ! nonblocking MPI recv request ids
    5369           0 :       integer :: hs_snd, hs_rcv(numpro)! handshake variables (send/receive)
    5370           0 :       integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids
    5371           0 :       integer :: InStats(numpro*MPI_STATUS_SIZE)
    5372           0 :       integer :: OutStats(numpro*MPI_STATUS_SIZE)
    5373             : 
    5374             :       integer :: offset_v
    5375             : 
    5376             :       integer :: rstep
    5377             : 
    5378             :       integer :: maxreq, maxreqh
    5379             :       logical :: handshake, alltoall, sendd
    5380             :       integer ::  numtr, numtrm, m
    5381             :       integer ijks, ijkr, ij
    5382             : 
    5383             : #if defined( MODCM_TIMING )
    5384             :       call t_startf('mod_comm communication')
    5385             : #endif
    5386             : 
    5387           0 :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    5388           0 :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    5389             : 
    5390             : !     num_s = 0 if this process is not part of the sending decomposition
    5391           0 :       num_s = size(send_bl)
    5392           0 :       if (send_bl(1)%Nparcels == -1) then
    5393           0 :          num_s = 0
    5394             :       endif
    5395             : 
    5396             : !     num_r = 0 if this process is not part of the receiving decomposition
    5397           0 :       num_r = size(recv_bl)
    5398           0 :       if (recv_bl(1)%Nparcels == -1) then
    5399           0 :          num_r = 0
    5400             :       endif
    5401             : 
    5402           0 :       if ( present(sw_handshake) ) then
    5403           0 :          handshake = sw_handshake
    5404           0 :          hs_snd = 1
    5405             :       else
    5406             :          handshake = .false.
    5407             :       endif
    5408             : 
    5409           0 :       if ( present(sw_alltoall) ) then
    5410           0 :          alltoall = sw_alltoall
    5411             :       else
    5412             :          alltoall = .false.
    5413             :       endif
    5414             : 
    5415           0 :       if ( present(sw_send) ) then
    5416           0 :          sendd = sw_send
    5417             :       else
    5418             :          sendd = .false.
    5419             :       endif
    5420             : 
    5421           0 :       numtrm = mend - mbeg
    5422           0 :       numtr = numtrm + 1
    5423             : 
    5424           0 :       ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1)
    5425           0 :       ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1)
    5426             : 
    5427           0 :       unitsize = r8_win%size/max_irr
    5428             : 
    5429             : ! advance to unused portion of storage window
    5430           0 :       r8_win%ncall_s = r8_win%ncall_s + 1
    5431             : 
    5432           0 :       if (r8_win%ncall_s .gt. max_irr-numtrm) then
    5433           0 :          write(iulog,*) "mp_swaptrirr: insufficient window storage - exiting"
    5434           0 :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    5435           0 :          stop
    5436             :       endif
    5437             : 
    5438             : ! calculate send lengths and displacements
    5439           0 :       offset_0 = (r8_win%ncall_s-1)*unitsize
    5440           0 :       offset_s = offset_0
    5441           0 :       sndlths(:) = 0
    5442           0 :       sdispls(:) = 0
    5443           0 :       do ipe=1, num_s
    5444           0 :          sndlths(ipe-1) = numtr*send_bl(ipe)%Tot_Size
    5445           0 :          sdispls(ipe-1) = offset_s
    5446           0 :          if (sndlths(ipe-1) .ne. 0) then
    5447             : 
    5448           0 :             offset_s = offset_s + sndlths(ipe-1)
    5449           0 :             if (offset_s-offset_0 .gt. numtr*unitsize) then
    5450           0 :               write(iulog,*) "Fatal mp_swaptrirr: send window out of space - exiting"
    5451           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    5452           0 :                              ipe, unitsize, offset_s, offset_0
    5453           0 :               stop
    5454             :             endif
    5455             :          endif
    5456             :       enddo
    5457             : 
    5458             : ! calculate receive lengths and displacements
    5459           0 :       offset_r = offset_0
    5460           0 :       rcvlths(:) = 0
    5461           0 :       rdispls(:) = 0
    5462           0 :       do ipe=1, num_r
    5463           0 :          rcvlths(ipe-1) = numtr*recv_bl(ipe)%Tot_Size
    5464           0 :          rdispls(ipe-1) = offset_r
    5465           0 :          if (rcvlths(ipe-1) .ne. 0) then
    5466             : 
    5467           0 :             offset_r = offset_r + rcvlths(ipe-1)
    5468           0 :             if (numtr*unitsize < offset_r-offset_0) then
    5469           0 :               write(iulog,*) "Fatal mp_swaptrirr: receive window out of space - exiting"
    5470           0 :               write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    5471           0 :                         ipe, unitsize, offset_r, offset_0
    5472           0 :               stop
    5473             :             endif
    5474             :          endif
    5475             :       enddo
    5476             : 
    5477             : ! Calculate swap partners and number of steps in point-to-point
    5478             : ! implementations of alltoall algorithm.
    5479           0 :       steps = 0
    5480           0 :       do ipe=1,ceil2(comm_size)-1
    5481           0 :          p = pair(comm_size,ipe,comm_pid)
    5482           0 :          if (p >= 0) then
    5483           0 :             if (sndlths(p) > 0 .or. rcvlths(p) > 0) then
    5484           0 :                steps = steps + 1
    5485           0 :                swapids(steps) = p
    5486             :             end if
    5487             :          end if
    5488             :       end do
    5489             : 
    5490           0 :       if (.not. alltoall) then
    5491             : 
    5492           0 :          sndids(1:steps) = MPI_REQUEST_NULL
    5493           0 :          rcvids(1:steps) = MPI_REQUEST_NULL
    5494             : 
    5495           0 :          if (steps .eq. 0) then
    5496           0 :             maxreq  = 0
    5497           0 :             maxreqh = 0
    5498           0 :          elseif (steps .eq. 1) then
    5499           0 :             maxreq  = 1
    5500           0 :             maxreqh = 1
    5501             :          else
    5502           0 :             if ( present(sw_maxreq) ) then
    5503           0 :                if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then
    5504           0 :                   maxreq  = sw_maxreq
    5505           0 :                   if (maxreq > 1) then
    5506           0 :                      maxreqh = maxreq/2
    5507             :                   else
    5508           0 :                      maxreq  = 2
    5509           0 :                      maxreqh = 1
    5510             :                   endif
    5511             :                else
    5512           0 :                   maxreq  = steps
    5513           0 :                   maxreqh = steps
    5514             :                endif
    5515             :             else
    5516           0 :                maxreq  = steps
    5517           0 :                maxreqh = steps
    5518             :             endif
    5519             :          endif
    5520             : 
    5521             : ! Post initial handshake receive requests
    5522           0 :          if (handshake) then
    5523           0 :             do istep=1,maxreq
    5524           0 :                p = swapids(istep)
    5525           0 :                if (sndlths(p) > 0) then
    5526             :                   call mpi_irecv  ( hs_rcv(istep), 1, mp_i4, p, comm_pid, comm, &
    5527           0 :                                     hs_rcvids(istep), ierr )
    5528             :                endif
    5529             :             enddo
    5530             :          endif
    5531             : 
    5532             : ! Post initial receive requests
    5533           0 :          do istep=1,maxreq
    5534           0 :             p = swapids(istep)
    5535           0 :             if (rcvlths(p) > 0) then
    5536           0 :                offset_r = rdispls(p)+1
    5537           0 :                call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    5538           0 :                                 p, p, comm, rcvids(istep), ierr )
    5539           0 :                if (handshake) then
    5540             :                   call mpi_send( hs_snd, 1, mp_i4, p, p, comm, &
    5541           0 :                                  ierr )
    5542             :                endif
    5543             :             endif
    5544             :          enddo
    5545             :          rstep = maxreq
    5546             : !
    5547             :       endif
    5548             : 
    5549             : ! gather data into global send buffer
    5550           0 :       do istep=1,steps
    5551           0 :          p = swapids(istep)
    5552             : 
    5553           0 :          if (sndlths(p) .ne. 0) then
    5554           0 :             offset_v = sdispls(p)
    5555           0 :             do j = 1, send_bl(p+1)%nparcels
    5556           0 :                do m = mbeg, mend
    5557           0 :                   do i = 1, send_bl(p+1)%blocksizes(j)
    5558           0 :                      ij = send_bl(p+1)%displacements(j)+i
    5559           0 :                      ga_r8_s(send_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin((m-1)*ijks+ij)
    5560             :                   enddo
    5561             :                enddo
    5562           0 :                offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    5563             :             enddo
    5564             :          endif
    5565             : 
    5566           0 :          if (.not. alltoall) then
    5567             : 
    5568             : ! Submit new i(r)send request
    5569           0 :             offset_s = sdispls(p)+1
    5570           0 :             if (sndlths(p) > 0) then
    5571           0 :                if (handshake) then
    5572             :                   call mpi_wait( hs_rcvids(istep), MPI_STATUS_IGNORE, ierr )
    5573           0 :                   if (sendd) then
    5574           0 :                      call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    5575           0 :                                       p, comm_pid, comm, ierr )
    5576             :                   else
    5577           0 :                      call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    5578           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    5579             :                   endif
    5580             :                else
    5581           0 :                   if (sendd) then
    5582           0 :                      call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    5583           0 :                                       p, comm_pid, comm, ierr )
    5584             :                   else
    5585           0 :                      call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    5586           0 :                                       p, comm_pid, comm, sndids(istep), ierr )
    5587             :                   endif
    5588             :                endif
    5589             :             endif
    5590             : 
    5591           0 :             if (istep > maxreqh) then
    5592             : ! Wait for oldest irecv request to complete
    5593           0 :                call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr )
    5594             : 
    5595           0 :                if (rstep < steps) then
    5596           0 :                   rstep = rstep + 1
    5597           0 :                   p = swapids(rstep)
    5598             : 
    5599             : ! Submit a new handshake irecv request
    5600           0 :                   if (handshake) then
    5601           0 :                      if (sndlths(p) > 0) then
    5602             :                         call mpi_irecv( hs_rcv(rstep), 1, mp_i4, p, comm_pid, comm, &
    5603           0 :                                         hs_rcvids(rstep), ierr )
    5604             :                      endif
    5605             :                   endif
    5606             : 
    5607             : ! Submit a new irecv request
    5608           0 :                   if (rcvlths(p) > 0) then
    5609           0 :                      offset_r = rdispls(p)+1
    5610           0 :                      call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    5611           0 :                                      p, p, comm, rcvids(rstep), ierr )
    5612           0 :                      if (handshake) then
    5613             :                         call mpi_send ( hs_snd, 1, mp_i4, p, p, comm, &
    5614           0 :                                         ierr )
    5615             :                      endif
    5616             :                   endif
    5617             :                endif
    5618             : 
    5619             : ! Wait for outstanding i(r)send request to complete
    5620           0 :                if (.not. sendd) then
    5621           0 :                   call mpi_wait( sndids(istep-maxreqh), InStats, ierr )
    5622             :                endif
    5623             :             endif
    5624             : !
    5625             :          endif
    5626             : !
    5627             :       enddo
    5628             : 
    5629             : ! local copy to send buffer
    5630           0 :       if (sndlths(comm_pid) .ne. 0) then
    5631             : 
    5632           0 :          offset_v = sdispls(comm_pid)
    5633           0 :          do j = 1, send_bl(comm_pid+1)%nparcels
    5634           0 :             do m = mbeg, mend
    5635           0 :                do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    5636           0 :                   ij = send_bl(comm_pid+1)%displacements(j)+i
    5637           0 :                   ga_r8_s(send_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin((m-1)*ijks+ij)
    5638             :                enddo
    5639             :             enddo
    5640           0 :             offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    5641             :          enddo
    5642             : 
    5643           0 :          if (.not. alltoall) then
    5644           0 :             ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = &
    5645           0 :                ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid))
    5646             :          endif
    5647             : 
    5648             :       endif
    5649             : 
    5650           0 :       if (alltoall) then
    5651             :          call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, &
    5652             :                              ga_r8_r, rcvlths, rdispls, mp_r8, &
    5653           0 :                              comm, ierror)
    5654             :       endif
    5655             : 
    5656             : ! local copy from receive buffer
    5657           0 :       if (rcvlths(comm_pid) .ne. 0) then
    5658             : 
    5659           0 :          offset_v = rdispls(comm_pid)
    5660           0 :          do j = 1, recv_bl(comm_pid+1)%Nparcels
    5661           0 :             do m = mbeg, mend
    5662           0 :                do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    5663           0 :                   ij = recv_bl(comm_pid+1)%displacements(j)+i
    5664           0 :                   qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i)
    5665             :                enddo
    5666             :             enddo
    5667           0 :             offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    5668             :          enddo
    5669             : 
    5670             :       endif
    5671             : 
    5672             : ! scatter data from global receive buffer to final destination
    5673           0 :       do istep=1,steps
    5674           0 :          p = swapids(istep)
    5675             : 
    5676           0 :          if (.not. alltoall) then
    5677           0 :             if (istep > steps-maxreqh) then
    5678             :                call mpi_wait( rcvids(istep), OutStats, ierr )
    5679             :             endif
    5680             :          endif
    5681             : 
    5682           0 :          if (rcvlths(p) .ne. 0) then
    5683             : 
    5684           0 :             offset_v = rdispls(p)
    5685           0 :             do j = 1, recv_bl(p+1)%Nparcels
    5686           0 :                do m = mbeg, mend
    5687           0 :                   do i = 1, recv_bl(p+1)%blocksizes(j)
    5688           0 :                      ij = recv_bl(p+1)%displacements(j)+i
    5689           0 :                      qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i)
    5690             :                   enddo
    5691             :                enddo
    5692           0 :                offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    5693             :             enddo
    5694             : 
    5695             :          endif
    5696             :       enddo
    5697             : 
    5698             : ! Wait for any outstanding send requests to complete.
    5699           0 :       if (.not. alltoall .and. .not. sendd) then
    5700           0 :          call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr )
    5701             :       endif
    5702             : 
    5703             : ! clean-up
    5704             : ! make used portion of storage window available for reuse
    5705           0 :       r8_win%ncall_s = r8_win%ncall_s - 1
    5706             : 
    5707             : #if defined( MODCM_TIMING )
    5708             :       call t_stopf('mod_comm communication')
    5709             : #endif
    5710             : 
    5711             : !EOC
    5712           0 :       end subroutine mp_swaptrirr
    5713             : # endif
    5714             : !------------------------------------------------------------------------------
    5715             : !
    5716             : !------------------------------------------------------------------------------
    5717             : # if defined( MOD_SPECIFIED_SHAPE )
    5718             : !BOP
    5719             : ! !ROUTINE: mp_sendtrirr --- Initiate communication of contiguous tracer parcels
    5720             : !
    5721             : ! !INTERFACE:
    5722             :       subroutine mp_sendtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    5723             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    5724             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    5725             :                                 modc )
    5726             :  
    5727             : ! !INPUT PARAMETERS:
    5728             :       integer, intent(in)  :: comm      !  communicator
    5729             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    5730             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    5731             :       integer, intent(in)           :: mbeg            ! initial tracer index
    5732             :       integer, intent(in)           :: mend            ! final tracer index
    5733             :       integer, intent(in)           :: mq              ! total tracer indices
    5734             :       integer, intent(in)           :: ifirsts         ! first I index of source
    5735             :       integer, intent(in)           :: ilasts          ! last I index of source
    5736             :       integer, intent(in)           :: jfirsts         ! first j index of source
    5737             :       integer, intent(in)           :: jlasts          ! last j index of source
    5738             :       integer, intent(in)           :: kfirsts         ! first k index of source
    5739             :       integer, intent(in)           :: klasts          ! last k index of source
    5740             :       integer, intent(in)           :: ifirstr         ! first I index of target
    5741             :       integer, intent(in)           :: ilastr          ! last I index of target
    5742             :       integer, intent(in)           :: jfirstr         ! first j index of target
    5743             :       integer, intent(in)           :: jlastr          ! last j index of target
    5744             :       integer, intent(in)           :: kfirstr         ! first k index of target
    5745             :       integer, intent(in)           :: klastr          ! last k index of target
    5746             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    5747             :                                                        ! 2: handshake
    5748             :                                                        ! 3: send vs isend
    5749             :                                                        ! 4: max number of outstanding requests
    5750             :       real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array
    5751             : 
    5752             : ! !OUTPUT PARAMETERS:
    5753             :       real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array
    5754             : !
    5755             : ! !DESCRIPTION:
    5756             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    5757             : !     Modc(1): if 0, use original approach of posting all communications here and placing
    5758             : !     wait points in mp_recvtrirr; if 1, call swap routine with p2p messages; if 2, call swap
    5759             : !     routine with a2a messages. 
    5760             : !     Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted)
    5761             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    5762             : !     Modc(4): maximum number of outstanding requests (applies to swap routines only)
    5763             : !
    5764             : ! !REVISION HISTORY: 
    5765             : !    02.08.13   Sawyer      Creation
    5766             : !    02.11.06   Mirin       Optimizations
    5767             : !    03.03.03   Sawyer      Use partneroffset
    5768             : !    03.06.24   Sawyer      Integrated Use_Mpi_Types; added qout
    5769             : !    04.02.24   Mirin       Various mpi2 options
    5770             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    5771             : !    09.10.07   Worley      eliminated mpi_recv from handshake logic
    5772             : !
    5773             : ! !BUGS:
    5774             : !
    5775             : !EOP
    5776             : !------------------------------------------------------------------------------
    5777             : !BOC
    5778             : !
    5779             : ! !LOCAL VARIABLES:
    5780             :       integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method
    5781             :       integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0
    5782             :       integer i, j, send_tag, recv_tag, num_s, num_r, m
    5783             :       integer :: offset_v (Max_Nparcels)
    5784             :       integer :: hs_snd, hs_rcv(numpro), hs_rcvids(numpro)
    5785             :       integer ipe2, ceil2num
    5786             :       integer numtr, numtrm
    5787             :       integer sw_local, maxreq_local
    5788             :       logical hs_local, send_local
    5789             :       logical sw_alltoall
    5790             :       integer comm_pid
    5791             :       integer ip, jp, kp, mp, ir, jr, jir, mt
    5792             : 
    5793             : 
    5794             : #if defined( MODCM_TIMING )
    5795             :       call t_startf('mod_comm communication')
    5796             : #endif
    5797             : 
    5798             :       if (present(modc)) then
    5799             :          sw_local   = modc(1)
    5800             :          hs_local   = (modc(2) .eq. 1)
    5801             :          send_local = (modc(3) .eq. 1)
    5802             :          maxreq_local = modc(4)
    5803             :       else
    5804             :          sw_local = 0
    5805             :          hs_local = .true.
    5806             :          send_local = .true.
    5807             :          maxreq_local = -1
    5808             :       endif
    5809             : 
    5810             : ! Do not call mp_swaptrirr unless mod_method equals 0
    5811             :       mod_method = recv_bl(1)%method
    5812             :       if (mod_method .gt. 0) sw_local = 0
    5813             : 
    5814             :     if (sw_local .gt. 0) then
    5815             :          sw_alltoall = (sw_local .eq. 2)
    5816             :          call mp_swaptrirr(comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, &
    5817             :                            ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, &
    5818             :                            ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, &
    5819             :                            sw_handshake=hs_local, sw_maxreq=maxreq_local,     &
    5820             :                            sw_alltoall=sw_alltoall, sw_send=send_local)
    5821             :     else
    5822             : 
    5823             :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    5824             : 
    5825             :       hs_snd = 1
    5826             :       ceil2num = ceil2(numpro)
    5827             : 
    5828             :       numtrm = mend - mbeg
    5829             :       numtr = numtrm + 1
    5830             : 
    5831             : !     num_s = 0 if this processes is not part of the sending decomposition
    5832             :       num_s = size(send_bl)
    5833             :       if (send_bl(1)%Nparcels == -1) then
    5834             :          num_s = 0
    5835             :       endif
    5836             : 
    5837             : !     num_r = 0 if this processes is not part of the receiving decomposition
    5838             :       num_r = size(recv_bl)
    5839             :       if (recv_bl(1)%Nparcels == -1) then
    5840             :          num_r = 0
    5841             :       endif
    5842             : 
    5843             :       r8_win%ncall_s = r8_win%ncall_s + 1
    5844             :      if (mod_method .gt. 0) then
    5845             : !
    5846             : ! mpi derived types
    5847             :       if (r8_win%ncall_s .gt. MaxTrf-numtrm) then
    5848             :          write(iulog,*) "mp_sendtrirr: derived type handle count exceeded - exiting"
    5849             :          write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf
    5850             :          stop
    5851             :       endif
    5852             : !
    5853             : ! MPI: Irecv over all processes
    5854             : !
    5855             :       if (hs_local) then
    5856             :          hs_rcvids(:) = MPI_REQUEST_NULL
    5857             :          do ipe2=1, ceil2num
    5858             :             ipe = ieor(ipe2-1,comm_pid) + 1
    5859             :             if (ipe .gt. num_s) cycle
    5860             :             if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    5861             :                if (ipe-1 /= comm_pid) &
    5862             :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, ipe-1, comm_pid, comm, &
    5863             :                                    hs_rcvids(ipe), ierr )
    5864             :             endif
    5865             :          enddo
    5866             :       endif
    5867             : 
    5868             :       OutHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL
    5869             :       do ipe2=1, ceil2num
    5870             :         ipe = ieor(ipe2-1,comm_pid) + 1
    5871             :         if (ipe .gt. num_r) cycle
    5872             : !
    5873             : ! Receive the buffers with MPI_Irecv. Non-blocking
    5874             : !
    5875             :         if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    5876             :           recv_tag = ipe-1 + modcam_tagoffset
    5877             :           do m = mbeg, mend
    5878             :              call mpi_irecv( qout(:,:,:,m), 1, recv_bl(ipe)%type, ipe-1, recv_tag,   &
    5879             :                              comm, OutHandle(ipe,r8_win%ncall_s+m-mbeg), ierr )
    5880             :           enddo
    5881             :           if (hs_local) then
    5882             :              if (ipe-1 /= comm_pid) &
    5883             :                call MPI_SEND ( hs_snd, 1, mp_i4, ipe-1, ipe-1, comm, ierr )
    5884             :           endif
    5885             :         endif
    5886             :       enddo
    5887             : 
    5888             : !
    5889             : ! MPI: Isend/Send over all processes; use risend/rsend with hs
    5890             : !
    5891             :       InHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL
    5892             :       do ipe2=1, ceil2num
    5893             :         ipe = ieor(ipe2-1,comm_pid) + 1
    5894             :         if (ipe .gt. num_s) cycle
    5895             : 
    5896             : !
    5897             : ! Send the individual buffers with non-blocking sends
    5898             : !
    5899             :         if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then
    5900             :           send_tag = comm_pid + modcam_tagoffset
    5901             :           if (hs_local) then
    5902             :              if (ipe-1 /= comm_pid) &
    5903             :                 call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    5904             :              if (send_local) then
    5905             :                 do m = mbeg, mend
    5906             :                    call mpi_rsend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    5907             :                                    comm, ierr )
    5908             :                 enddo
    5909             :              else
    5910             :                 do m = mbeg, mend
    5911             :                    call mpi_irsend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    5912             :                                     comm, InHandle(ipe,r8_win%ncall_s), ierr )
    5913             :                 enddo
    5914             :              endif
    5915             :           else
    5916             :              if (send_local) then
    5917             :                 do m = mbeg, mend
    5918             :                    call mpi_send( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    5919             :                                   comm, ierr )
    5920             :                 enddo
    5921             :              else
    5922             :                 do m = mbeg, mend
    5923             :                    call mpi_isend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag,        &
    5924             :                                    comm, InHandle(ipe,r8_win%ncall_s), ierr )
    5925             :                 enddo
    5926             :              endif
    5927             :           endif
    5928             :         endif
    5929             :       enddo
    5930             :      else
    5931             : 
    5932             : ! temporary contiguous buffers
    5933             : 
    5934             :       jr = jlasts - jfirsts + 1
    5935             :       ir = ilasts - ifirsts + 1
    5936             :       jir = jr * ir
    5937             :       if (r8_win%ncall_s .gt. max_irr-numtrm) then
    5938             :          write(iulog,*) "mp_sendtrirr: insufficient window storage - exiting"
    5939             :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    5940             :          stop
    5941             :       endif
    5942             :       unitsize = r8_win%size/max_irr
    5943             : 
    5944             : ! issue call to receive data in global receive buffer
    5945             :       offset_0 = (r8_win%ncall_s-1)*unitsize
    5946             :       offset_s = offset_0
    5947             :       offset_r = offset_0
    5948             : 
    5949             :       if (hs_local) then
    5950             :          hs_rcvids(:) = MPI_REQUEST_NULL
    5951             :          do ipe2=1, ceil2num
    5952             :             ipe = ieor(ipe2-1,comm_pid) + 1
    5953             :             if (ipe .gt. num_s) cycle
    5954             :             qsize = numtr*send_bl(ipe)%Tot_Size
    5955             :             if (qsize .ne. 0) then
    5956             :                r8_win%dest = ipe-1
    5957             :                send_tag = comm_pid + modcam_tagoffset
    5958             :                if (r8_win%dest /= comm_pid) &
    5959             :                   call MPI_IRECV ( hs_rcv(ipe), 1, mp_i4, r8_win%dest, send_tag, comm, &
    5960             :                                    hs_rcvids(ipe), ierr )
    5961             :             endif
    5962             :          enddo
    5963             :       endif
    5964             : 
    5965             :       do ipe2=1, ceil2num
    5966             :          ipe = ieor(ipe2-1,comm_pid) + 1
    5967             :          if (ipe .gt. num_r) cycle
    5968             :          r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size
    5969             :          if (r8_win%size_r .ne. 0) then
    5970             :             r8_win%offset_r = offset_r
    5971             :             offset_r = offset_r + r8_win%size_r
    5972             :             r8_win%src = ipe-1
    5973             :             if (numtr*unitsize >= offset_r-offset_0) then
    5974             :               recv_tag = r8_win%src + modcam_tagoffset
    5975             :               qsize    = r8_win%size_r
    5976             :               r8_win%nrecv    = r8_win%nrecv + 1
    5977             :               call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, &
    5978             :                              recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror)
    5979             :               if (hs_local) then
    5980             :                  if (r8_win%src /= comm_pid) &
    5981             :                    call MPI_SEND ( hs_snd, 1, mp_i4, r8_win%src, recv_tag, comm, ierror)
    5982             :               endif
    5983             :             else
    5984             :               write(iulog,*) "Fatal mp_sendtrirr: receive window out of space - exiting"
    5985             :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    5986             :                         ipe, unitsize, offset_r, offset_0
    5987             :               stop
    5988             :             endif
    5989             :          endif
    5990             :       enddo
    5991             : ! gather data into global send buffer
    5992             :       do ipe2=1, ceil2num
    5993             :          ipe = ieor(ipe2-1,comm_pid) + 1
    5994             :          if (ipe .gt. num_s) cycle
    5995             :          qsize = numtr*send_bl(ipe)%Tot_Size
    5996             :          if (qsize .ne. 0) then
    5997             :             r8_win%dest = ipe-1
    5998             :             r8_win%offset_s = offset_s
    5999             :             offset_s = offset_s + qsize
    6000             :             if (offset_s-offset_0 .gt. numtr*unitsize) then
    6001             :               write(iulog,*) "Fatal mp_sendtrirr: send window out of space - exiting"
    6002             :               write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    6003             :                         ipe, unitsize, offset_s, offset_0
    6004             :               stop
    6005             :             endif
    6006             : 
    6007             :             offset_v(1) = r8_win%offset_s
    6008             :             do j = 2, send_bl(ipe)%nparcels
    6009             :                offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1)
    6010             :             enddo
    6011             : 
    6012             :             do j = 1, send_bl(ipe)%nparcels
    6013             :                do m = mbeg, mend
    6014             :                   do i = 1, send_bl(ipe)%blocksizes(j)
    6015             :                      mp = send_bl(ipe)%displacements(j)+i
    6016             :                      kp = kfirsts + (mp-1)/jir
    6017             :                      mt = (kp-kfirsts)*jir
    6018             :                      jp = jfirsts + (mp-mt-1)/ir
    6019             :                      ip = mp-mt - (jp-jfirsts)*ir + ifirsts-1
    6020             :                      ga_r8_s(send_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) = qin(ip,jp,kp,m)
    6021             :                   enddo
    6022             :                enddo
    6023             :             enddo
    6024             : 
    6025             : ! nonblocking send
    6026             :             send_tag = comm_pid + modcam_tagoffset
    6027             :             r8_win%nsend = r8_win%nsend + 1
    6028             :             if (hs_local) then
    6029             :                if (r8_win%dest /= comm_pid) &
    6030             :                   call MPI_WAIT ( hs_rcvids(ipe), MPI_STATUS_IGNORE, ierr )
    6031             :                if (send_local) then
    6032             :                   call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    6033             :                                  send_tag, comm, ierr)
    6034             :                else
    6035             :                   call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    6036             :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    6037             :                endif
    6038             :             else
    6039             :                if (send_local) then
    6040             :                   call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    6041             :                                  send_tag, comm, ierr)
    6042             :                else
    6043             :                   call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, &
    6044             :                                  send_tag, comm, r8_win%sqest(r8_win%nsend), ierr)
    6045             :                endif
    6046             :             endif
    6047             :          endif
    6048             :       enddo
    6049             : 
    6050             :      endif   !  mod_method
    6051             : 
    6052             :       r8_win%ncall_s = r8_win%ncall_s + numtrm
    6053             : 
    6054             :     endif   !  sw_local
    6055             : 
    6056             : #if defined( MODCM_TIMING )
    6057             :       call t_stopf('mod_comm communication')
    6058             : #endif
    6059             : 
    6060             :       end subroutine mp_sendtrirr
    6061             : !------------------------------------------------------------------------------
    6062             : !
    6063             : !------------------------------------------------------------------------------
    6064             : !BOP
    6065             : ! !ROUTINE: mp_recvtrirr --- Finalize communication of contiguous tracer parcels
    6066             : !
    6067             : ! !INTERFACE:
    6068             :       subroutine mp_recvtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    6069             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    6070             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    6071             :                                 modc )
    6072             :  
    6073             : ! !INPUT PARAMETERS:
    6074             :       integer, intent(in)  :: comm      !  communicator
    6075             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    6076             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    6077             :       integer, intent(in)           :: mbeg            ! initial tracer index
    6078             :       integer, intent(in)           :: mend            ! final tracer index
    6079             :       integer, intent(in)           :: mq              ! total tracer indices
    6080             :       integer, intent(in)           :: ifirsts         ! first I index of source
    6081             :       integer, intent(in)           :: ilasts          ! last I index of source
    6082             :       integer, intent(in)           :: jfirsts         ! first j index of source
    6083             :       integer, intent(in)           :: jlasts          ! last j index of source
    6084             :       integer, intent(in)           :: kfirsts         ! first k index of source
    6085             :       integer, intent(in)           :: klasts          ! last k index of source
    6086             :       integer, intent(in)           :: ifirstr         ! first I index of target
    6087             :       integer, intent(in)           :: ilastr          ! last I index of target
    6088             :       integer, intent(in)           :: jfirstr         ! first j index of target
    6089             :       integer, intent(in)           :: jlastr          ! last j index of target
    6090             :       integer, intent(in)           :: kfirstr         ! first k index of target
    6091             :       integer, intent(in)           :: klastr          ! last k index of target
    6092             :       integer, optional, intent(in) :: modc(4)         ! 1: classical, swap p2p, swap a2a
    6093             :                                                        ! 2: handshake
    6094             :                                                        ! 3: send vs isend
    6095             :                                                        ! 4: max number of outstanding requests
    6096             :       real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array
    6097             : ! !OUTPUT PARAMETERS:
    6098             :       real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array
    6099             : !
    6100             : ! !DESCRIPTION:
    6101             : !     Complete transfer of a generalized region initiated by {\tt mp\_sendtrirr}.
    6102             : !     Communicate a number of contiguous parcels to/from arbitrary set of PEs.
    6103             : !     Modc(1): if 0, use original approach of posting all communications in mp_sendtrirr and
    6104             : !     placing wait points here; otherwise don't do anything - mp_swaptrirr is called from mp_sendirr.
    6105             : !     Modc(3): if 1, then use blocking send; otherwise use nonblocking send
    6106             : !
    6107             : ! !REVISION HISTORY:
    6108             : !    02.08.15   Sawyer      Creation
    6109             : !    02.11.06   Mirin       Optimizations
    6110             : !    03.03.03   Sawyer      Now using packed arrays for MPI2
    6111             : !    04.02.24   Mirin       Various mpi2 options
    6112             : !    08.09.18   Mirin       Major overhaul, to include approaches from Mirin and Worley
    6113             : !
    6114             : !EOP
    6115             : !------------------------------------------------------------------------------
    6116             : !BOC
    6117             :       integer :: ipe, blocksize, offset_r, mod_method
    6118             :       integer unitsize, offset_0
    6119             :       integer Ierr
    6120             :       integer InStats(numpro*MPI_STATUS_SIZE)
    6121             :       integer OutStats(numpro*MPI_STATUS_SIZE)
    6122             :       integer i, j, num_r, num_s, m
    6123             :       integer :: offset_v (Max_Nparcels)
    6124             :       integer ipe2, ceil2num
    6125             :       integer numtr, numtrm
    6126             :       integer sw_local, maxreq_local
    6127             :       logical hs_local, send_local
    6128             :       logical sw_alltoall
    6129             :       integer comm_size, comm_pid
    6130             :       integer ip, jp, kp, mp, ir, jr, jir, mt
    6131             : 
    6132             :       if (present(modc)) then
    6133             :          sw_local   = modc(1)
    6134             :          hs_local   = (modc(2) .eq. 1)
    6135             :          send_local = (modc(3) .eq. 1)
    6136             :          maxreq_local = modc(4)
    6137             :       else
    6138             :          sw_local = 0
    6139             :          hs_local = .true.
    6140             :          send_local = .true.
    6141             :          maxreq_local = -1
    6142             :       endif
    6143             : 
    6144             : ! Do not call mp_swaptrirr (hence return) unless mod_method equals 0
    6145             :       mod_method = recv_bl(1)%method
    6146             :       if (mod_method .gt. 0) sw_local = 0
    6147             : 
    6148             : ! Return if swap_irr
    6149             :       if (sw_local .gt. 0) return
    6150             : 
    6151             : #if defined( MODCM_TIMING )
    6152             :       call t_startf('mod_comm communication')
    6153             : #endif
    6154             : 
    6155             :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    6156             :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    6157             : 
    6158             :       ceil2num = ceil2(numpro)
    6159             : 
    6160             :       numtrm = mend - mbeg
    6161             :       numtr = numtrm + 1
    6162             : 
    6163             : !     num_s = 0 if this processes is not part of the sending decomposition
    6164             :       num_s = size(send_bl)
    6165             :       if (send_bl(1)%Nparcels == -1) then
    6166             :          num_s = 0
    6167             :       endif
    6168             : 
    6169             : !     num_r = 0 if this processes is not part of the receiving decomposition
    6170             :       num_r = size(recv_bl)
    6171             :       if (recv_bl(1)%Nparcels == -1) then
    6172             :          num_r = 0
    6173             :       endif
    6174             : 
    6175             :       r8_win%ncall_r = r8_win%ncall_r + 1
    6176             : 
    6177             :     if (mod_method .gt. 0) then
    6178             : 
    6179             : ! mpi derived types
    6180             :       if (r8_win%ncall_r .gt. MaxTrf-numtrm) then
    6181             :          write(iulog,*) "mp_recvtrirr: derived type handle count exceeded - exiting"
    6182             :          write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf
    6183             :          stop
    6184             :       endif
    6185             : 
    6186             :       if (num_s .gt. 0 .and. (.not. send_local)) then
    6187             :          do m = mbeg, mend
    6188             :             CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+m-mbeg), InStats, Ierr )
    6189             :          enddo
    6190             :       endif
    6191             :       if (num_r .gt. 0) then
    6192             :          do m = mbeg, mend
    6193             :             CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+m-mbeg), OutStats, Ierr )
    6194             :          enddo
    6195             :       endif
    6196             : 
    6197             :     else
    6198             : 
    6199             : ! temporary contiguous buffer / global window
    6200             : 
    6201             :       jr = jlastr - jfirstr + 1
    6202             :       ir = ilastr - ifirstr + 1
    6203             :       jir = jr * ir
    6204             :       if (r8_win%ncall_r .gt. max_irr-numtrm) then
    6205             :          write(iulog,*) "mp_recvtrirr: insufficient window storage - exiting"
    6206             :          write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr
    6207             :          stop
    6208             :       endif
    6209             :       unitsize = r8_win%size/max_irr
    6210             : 
    6211             : ! scatter data from global receive buffer to final destination
    6212             :       offset_0 = (r8_win%ncall_r-1)*unitsize
    6213             :       offset_r = offset_0
    6214             : 
    6215             :       do ipe2=1, ceil2num
    6216             :          ipe = ieor(ipe2-1,comm_pid) + 1
    6217             :          if (ipe .gt. num_r) cycle
    6218             :          r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size
    6219             :          if (r8_win%size_r .ne. 0) then
    6220             :             r8_win%offset_r = offset_r
    6221             :             offset_r = offset_r + r8_win%size_r
    6222             :             if (offset_r-offset_0 .gt. numtr*unitsize) then
    6223             :               write(iulog,*) "Fatal mp_recvtrirr: receive window out of space - exiting"
    6224             :               write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    6225             :                         ipe, unitsize, offset_r, offset_0
    6226             :               stop
    6227             :             endif
    6228             : 
    6229             :             r8_win%nread = r8_win%nread + 1
    6230             :             call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr)
    6231             : 
    6232             :             offset_v(1) = r8_win%offset_r
    6233             :             do j = 2, recv_bl(ipe)%Nparcels
    6234             :                offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1)
    6235             :             enddo
    6236             : 
    6237             :             do j = 1, recv_bl(ipe)%Nparcels
    6238             :                do m = mbeg, mend
    6239             :                   do i = 1, recv_bl(ipe)%blocksizes(j)
    6240             :                      mp = recv_bl(ipe)%displacements(j)+i
    6241             :                      kp = kfirstr + (mp-1)/jir
    6242             :                      mt = (kp-kfirstr)*jir
    6243             :                      jp = jfirstr + (mp-mt-1)/ir
    6244             :                      ip = mp-mt - (jp-jfirstr)*ir + ifirstr-1
    6245             :                      qout(ip,jp,kp,m) = ga_r8_r(recv_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i)
    6246             :                   enddo
    6247             :                enddo
    6248             :             enddo
    6249             : 
    6250             :          endif
    6251             :       enddo
    6252             : 
    6253             :       if ((r8_win%ncall_s == r8_win%ncall_r + numtrm) .and. (.not. send_local)) then
    6254             :          call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror)
    6255             :       endif
    6256             : 
    6257             :     endif    !    mod_method .gt. 0
    6258             : 
    6259             :     r8_win%ncall_r = r8_win%ncall_r + numtrm
    6260             : 
    6261             :     if (r8_win%ncall_s == r8_win%ncall_r) then
    6262             :        r8_win%nsend = 0
    6263             :        r8_win%nrecv = 0
    6264             :        r8_win%nread = 0
    6265             :        r8_win%ncall_s = 0
    6266             :        r8_win%ncall_r = 0
    6267             :     endif
    6268             : 
    6269             : #if defined( MODCM_TIMING )
    6270             :       call t_stopf('mod_comm communication')
    6271             : #endif
    6272             : 
    6273             : !EOC
    6274             :       end subroutine mp_recvtrirr
    6275             : !------------------------------------------------------------------------------
    6276             : !
    6277             : !------------------------------------------------------------------------------
    6278             : !BOP
    6279             : ! !ROUTINE: mp_swaptrirr --- Write r8 contiguous parcels to global array
    6280             : !                            using XOR swap ordering - for multiple tracers
    6281             : !
    6282             : ! !INTERFACE:
    6283             :       subroutine mp_swaptrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq,  &
    6284             :                                 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts,  &
    6285             :                                 ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr,  &
    6286             :                                 sw_handshake, sw_maxreq, sw_alltoall, sw_send   )
    6287             :  
    6288             : ! !INPUT PARAMETERS:
    6289             :       integer, intent(in)  :: comm                     ! communicator
    6290             :       type(blockdescriptor), intent(in)  :: send_bl(:) ! send blocks
    6291             :       type(blockdescriptor), intent(in)  :: recv_bl(:) ! receive blocks
    6292             :       integer, intent(in)           :: mbeg            ! initial tracer index
    6293             :       integer, intent(in)           :: mend            ! final tracer index
    6294             :       integer, intent(in)           :: mq              ! total tracer indices
    6295             :       integer, intent(in)           :: ifirsts         ! first I index of source
    6296             :       integer, intent(in)           :: ilasts          ! last I index of source
    6297             :       integer, intent(in)           :: jfirsts         ! first j index of source
    6298             :       integer, intent(in)           :: jlasts          ! last j index of source
    6299             :       integer, intent(in)           :: kfirsts         ! first k index of source
    6300             :       integer, intent(in)           :: klasts          ! last k index of source
    6301             :       integer, intent(in)           :: ifirstr         ! first I index of target
    6302             :       integer, intent(in)           :: ilastr          ! last I index of target
    6303             :       integer, intent(in)           :: jfirstr         ! first j index of target
    6304             :       integer, intent(in)           :: jlastr          ! last j index of target
    6305             :       integer, intent(in)           :: kfirstr         ! first k index of target
    6306             :       integer, intent(in)           :: klastr          ! last k index of target
    6307             :       logical, optional, intent(in) :: sw_handshake    ! use flow control and 
    6308             :                                                        !  ready send
    6309             :       integer, optional, intent(in) :: sw_maxreq       ! maximum number of outstanding
    6310             :                                                        !  MPI requests
    6311             :       logical, optional, intent(in) :: sw_alltoall     ! use mpi_alltoall
    6312             :       logical, optional, intent(in) :: sw_send         ! use mpi_send instead of isend
    6313             :       real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array
    6314             : 
    6315             : ! !OUTPUT PARAMETERS:
    6316             :       real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array
    6317             : !
    6318             : ! !DESCRIPTION:
    6319             : !
    6320             : !     XOR-ordered version of all-to-all communication
    6321             : !
    6322             : ! WARNING: mod_comm parameter max_irr might need to be set larger than expected
    6323             : !          when swapping multiple variables; specifically, max_irr must be at least
    6324             : !          as large as the incoming r8_win%ncall_s + the number of variables to
    6325             : !          be swapped
    6326             : !
    6327             : ! !REVISION HISTORY: 
    6328             : !    08.06.30   Worley      original: derived from mp_sendirr, but using 
    6329             : !                            swapm logic and XOR swap order 
    6330             : !    08.08.22   Worley      removed swapm; reimplemented with native MPI,
    6331             : !                            added flow control/ready send option and maxreq
    6332             : !                            throttling, added alltoall option
    6333             : !
    6334             : ! !BUGS:
    6335             : !
    6336             : !EOP
    6337             : !------------------------------------------------------------------------------
    6338             : !BOC
    6339             : !
    6340             : ! !LOCAL VARIABLES:
    6341             :       integer :: i, j, p, istep, num_s, num_r
    6342             :       integer :: comm_pid, comm_size, steps, ierr
    6343             :       integer :: ipe, offset_s, offset_r, offset_0, unitsize
    6344             : 
    6345             :       integer :: sndlths(0:numpro-1), sdispls(0:numpro-1)
    6346             :       integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1)
    6347             :       integer :: swapids(numpro) 
    6348             :       integer :: sndids(numpro)  ! nonblocking MPI send request ids
    6349             :       integer :: rcvids(numpro)  ! nonblocking MPI recv request ids
    6350             :       integer :: hs_snd, hs_rcv(numpro)! handshake variables (send/receive)
    6351             :       integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids
    6352             :       integer :: InStats(numpro*MPI_STATUS_SIZE)
    6353             :       integer :: OutStats(numpro*MPI_STATUS_SIZE)
    6354             : 
    6355             :       integer :: offset_v
    6356             : 
    6357             :       integer :: rstep
    6358             : 
    6359             :       integer :: maxreq, maxreqh
    6360             :       logical :: handshake, alltoall, sendd
    6361             :       integer :: ip, jp, kp, mp, irs, jrs, jirs, mt
    6362             :       integer :: numtr, numtrm, irr, jrr, jirr, m
    6363             : 
    6364             : #if defined( MODCM_TIMING )
    6365             :       call t_startf('mod_comm communication')
    6366             : #endif
    6367             : 
    6368             :       call MPI_COMM_SIZE (comm, comm_size, ierr)
    6369             :       call MPI_COMM_RANK (comm, comm_pid, ierr)
    6370             : 
    6371             : !     num_s = 0 if this process is not part of the sending decomposition
    6372             :       num_s = size(send_bl)
    6373             :       if (send_bl(1)%Nparcels == -1) then
    6374             :          num_s = 0
    6375             :       endif
    6376             : 
    6377             : !     num_r = 0 if this process is not part of the receiving decomposition
    6378             :       num_r = size(recv_bl)
    6379             :       if (recv_bl(1)%Nparcels == -1) then
    6380             :          num_r = 0
    6381             :       endif
    6382             : 
    6383             :       if ( present(sw_handshake) ) then
    6384             :          handshake = sw_handshake
    6385             :          hs_snd = 1
    6386             :       else
    6387             :          handshake = .false.
    6388             :       endif
    6389             : 
    6390             :       if ( present(sw_alltoall) ) then
    6391             :          alltoall = sw_alltoall
    6392             :       else
    6393             :          alltoall = .false.
    6394             :       endif
    6395             : 
    6396             :       if ( present(sw_send) ) then
    6397             :          sendd = sw_send
    6398             :       else
    6399             :          sendd = .false.
    6400             :       endif
    6401             : 
    6402             :       numtrm = mend - mbeg
    6403             :       numtr = numtrm + 1
    6404             :       jrs = jlasts - jfirsts + 1
    6405             :       irs = ilasts - ifirsts + 1
    6406             :       jirs = jrs * irs
    6407             :       jrr = jlastr - jfirstr + 1
    6408             :       irr = ilastr - ifirstr + 1
    6409             :       jirr = jrr * irr
    6410             : 
    6411             :       unitsize = r8_win%size/max_irr
    6412             : 
    6413             : ! advance to unused portion of storage window
    6414             :       r8_win%ncall_s = r8_win%ncall_s + 1
    6415             : 
    6416             :       if (r8_win%ncall_s .gt. max_irr-numtrm) then
    6417             :          write(iulog,*) "mp_swaptrirr: insufficient window storage - exiting"
    6418             :          write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr
    6419             :          stop
    6420             :       endif
    6421             : 
    6422             : ! calculate send lengths and displacements
    6423             :       offset_0 = (r8_win%ncall_s-1)*unitsize
    6424             :       offset_s = offset_0
    6425             :       sndlths(:) = 0
    6426             :       sdispls(:) = 0
    6427             :       do ipe=1, num_s
    6428             :          sndlths(ipe-1) = numtr*send_bl(ipe)%Tot_Size
    6429             :          sdispls(ipe-1) = offset_s
    6430             :          if (sndlths(ipe-1) .ne. 0) then
    6431             : 
    6432             :             offset_s = offset_s + sndlths(ipe-1)
    6433             :             if (offset_s-offset_0 .gt. numtr*unitsize) then
    6434             :               write(iulog,*) "Fatal mp_swaptrirr: send window out of space - exiting"
    6435             :               write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid,  &
    6436             :                              ipe, unitsize, offset_s, offset_0
    6437             :               stop
    6438             :             endif
    6439             :          endif
    6440             :       enddo
    6441             : 
    6442             : ! calculate receive lengths and displacements
    6443             :       offset_r = offset_0
    6444             :       rcvlths(:) = 0
    6445             :       rdispls(:) = 0
    6446             :       do ipe=1, num_r
    6447             :          rcvlths(ipe-1) = numtr*recv_bl(ipe)%Tot_Size
    6448             :          rdispls(ipe-1) = offset_r
    6449             :          if (rcvlths(ipe-1) .ne. 0) then
    6450             : 
    6451             :             offset_r = offset_r + rcvlths(ipe-1)
    6452             :             if (numtr*unitsize < offset_r-offset_0) then
    6453             :               write(iulog,*) "Fatal mp_swaptrirr: receive window out of space - exiting"
    6454             :               write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid,  &
    6455             :                         ipe, unitsize, offset_r, offset_0
    6456             :               stop
    6457             :             endif
    6458             :          endif
    6459             :       enddo
    6460             : 
    6461             : ! Calculate swap partners and number of steps in point-to-point
    6462             : ! implementations of alltoall algorithm.
    6463             :       steps = 0
    6464             :       do ipe=1,ceil2(comm_size)-1
    6465             :          p = pair(comm_size,ipe,comm_pid)
    6466             :          if (p >= 0) then
    6467             :             if (sndlths(p) > 0 .or. rcvlths(p) > 0) then
    6468             :                steps = steps + 1
    6469             :                swapids(steps) = p
    6470             :             end if
    6471             :          end if
    6472             :       end do
    6473             : 
    6474             :       if (.not. alltoall) then
    6475             : 
    6476             :          sndids(1:steps) = MPI_REQUEST_NULL
    6477             :          rcvids(1:steps) = MPI_REQUEST_NULL
    6478             : 
    6479             :          if (steps .eq. 0) then
    6480             :             maxreq  = 0
    6481             :             maxreqh = 0
    6482             :          elseif (steps .eq. 1) then
    6483             :             maxreq  = 1
    6484             :             maxreqh = 1
    6485             :          else
    6486             :             if ( present(sw_maxreq) ) then
    6487             :                if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then
    6488             :                   maxreq  = sw_maxreq
    6489             :                   if (maxreq > 1) then
    6490             :                      maxreqh = maxreq/2
    6491             :                   else
    6492             :                      maxreq  = 2
    6493             :                      maxreqh = 1
    6494             :                   endif
    6495             :                else
    6496             :                   maxreq  = steps
    6497             :                   maxreqh = steps
    6498             :                endif
    6499             :             else
    6500             :                maxreq  = steps
    6501             :                maxreqh = steps
    6502             :             endif
    6503             :          endif
    6504             : 
    6505             : ! Post initial handshake receive requests
    6506             :          if (handshake) then
    6507             :             do istep=1,maxreq
    6508             :                p = swapids(istep)
    6509             :                if (sndlths(p) > 0) then
    6510             :                   call mpi_irecv  ( hs_rcv(istep), 1, mp_i4, p, comm_pid, comm, &
    6511             :                                     hs_rcvids(istep), ierr )
    6512             :                endif
    6513             :             enddo
    6514             :          endif
    6515             : 
    6516             : ! Post initial receive requests
    6517             :          do istep=1,maxreq
    6518             :             p = swapids(istep)
    6519             :             if (rcvlths(p) > 0) then
    6520             :                offset_r = rdispls(p)+1
    6521             :                call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    6522             :                                 p, p, comm, rcvids(istep), ierr )
    6523             :                if (handshake) then
    6524             :                   call mpi_send( hs_snd, 1, mp_i4, p, p, comm, &
    6525             :                                  ierr )
    6526             :                endif
    6527             :             endif
    6528             :          enddo
    6529             :          rstep = maxreq
    6530             : !
    6531             :       endif
    6532             : 
    6533             : ! gather data into global send buffer
    6534             :       do istep=1,steps
    6535             :          p = swapids(istep)
    6536             : 
    6537             :          if (sndlths(p) .ne. 0) then
    6538             :             offset_v = sdispls(p)
    6539             :             do j = 1, send_bl(p+1)%nparcels
    6540             :                do m = mbeg, mend
    6541             :                   do i = 1, send_bl(p+1)%blocksizes(j)
    6542             :                      mp = send_bl(p+1)%displacements(j)+i
    6543             :                      kp = kfirsts + (mp-1)/jirs
    6544             :                      mt = (kp-kfirsts)*jirs
    6545             :                      jp = jfirsts + (mp-mt-1)/irs
    6546             :                      ip = mp-mt - (jp-jfirsts)*irs + ifirsts-1
    6547             :                      ga_r8_s(send_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin(ip,jp,kp,m)
    6548             :                   enddo
    6549             :                enddo
    6550             :                offset_v = offset_v + send_bl(p+1)%blocksizes(j)
    6551             :             enddo
    6552             :          endif
    6553             : 
    6554             :          if (.not. alltoall) then
    6555             : 
    6556             : ! Submit new i(r)send request
    6557             :             offset_s = sdispls(p)+1
    6558             :             if (sndlths(p) > 0) then
    6559             :                if (handshake) then
    6560             :                   call mpi_wait( hs_rcvids(istep), MPI_STATUS_IGNORE, ierr )
    6561             :                   if (sendd) then
    6562             :                      call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    6563             :                                       p, comm_pid, comm, ierr )
    6564             :                   else
    6565             :                      call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    6566             :                                       p, comm_pid, comm, sndids(istep), ierr )
    6567             :                   endif
    6568             :                else
    6569             :                   if (sendd) then
    6570             :                      call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    6571             :                                       p, comm_pid, comm, ierr )
    6572             :                   else
    6573             :                      call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, &
    6574             :                                       p, comm_pid, comm, sndids(istep), ierr )
    6575             :                   endif
    6576             :                endif
    6577             :             endif
    6578             : 
    6579             :             if (istep > maxreqh) then
    6580             : ! Wait for oldest irecv request to complete
    6581             :                call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr )
    6582             : 
    6583             :                if (rstep < steps) then
    6584             :                   rstep = rstep + 1
    6585             :                   p = swapids(rstep)
    6586             : 
    6587             : ! Submit a new handshake irecv request
    6588             :                   if (handshake) then
    6589             :                      if (sndlths(p) > 0) then
    6590             :                         call mpi_irecv( hs_rcv(rstep), 1, mp_i4, p, comm_pid, comm, &
    6591             :                                         hs_rcvids(rstep), ierr )
    6592             :                      endif
    6593             :                   endif
    6594             : 
    6595             : ! Submit a new irecv request
    6596             :                   if (rcvlths(p) > 0) then
    6597             :                      offset_r = rdispls(p)+1
    6598             :                      call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, &
    6599             :                                      p, p, comm, rcvids(rstep), ierr )
    6600             :                      if (handshake) then
    6601             :                         call mpi_send ( hs_snd, 1, mp_i4, p, p, comm, &
    6602             :                                         ierr )
    6603             :                      endif
    6604             :                   endif
    6605             :                endif
    6606             : 
    6607             : ! Wait for outstanding i(r)send request to complete
    6608             :                if (.not. sendd) then
    6609             :                   call mpi_wait( sndids(istep-maxreqh), InStats, ierr )
    6610             :                endif
    6611             :             endif
    6612             : !
    6613             :          endif
    6614             : !
    6615             :       enddo
    6616             : 
    6617             : ! local copy to send buffer
    6618             :       if (sndlths(comm_pid) .ne. 0) then
    6619             : 
    6620             :          offset_v = sdispls(comm_pid)
    6621             :          do j = 1, send_bl(comm_pid+1)%nparcels
    6622             :             do m = mbeg, mend
    6623             :                do i = 1, send_bl(comm_pid+1)%blocksizes(j)
    6624             :                   mp = send_bl(comm_pid+1)%displacements(j)+i
    6625             :                   kp = kfirsts + (mp-1)/jirs
    6626             :                   mt = (kp-kfirsts)*jirs
    6627             :                   jp = jfirsts + (mp-mt-1)/irs
    6628             :                   ip = mp-mt - (jp-jfirsts)*irs + ifirsts-1
    6629             :                   ga_r8_s(send_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin(ip,jp,kp,m)
    6630             :                enddo
    6631             :             enddo
    6632             :             offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j)
    6633             :          enddo
    6634             : 
    6635             :          if (.not. alltoall) then
    6636             :             ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = &
    6637             :                ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid))
    6638             :          endif
    6639             : 
    6640             :       endif
    6641             : 
    6642             :       if (alltoall) then
    6643             :          call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, &
    6644             :                              ga_r8_r, rcvlths, rdispls, mp_r8, &
    6645             :                              comm, ierror)
    6646             :       endif
    6647             : 
    6648             : ! local copy from receive buffer
    6649             :       if (rcvlths(comm_pid) .ne. 0) then
    6650             : 
    6651             :          offset_v = rdispls(comm_pid)
    6652             :          do j = 1, recv_bl(comm_pid+1)%Nparcels
    6653             :             do m = mbeg, mend
    6654             :                do i = 1, recv_bl(comm_pid+1)%blocksizes(j)
    6655             :                   mp = recv_bl(comm_pid+1)%displacements(j)+i
    6656             :                   kp = kfirstr + (mp-1)/jirr
    6657             :                   mt = (kp-kfirstr)*jirr
    6658             :                   jp = jfirstr + (mp-mt-1)/irr
    6659             :                   ip = mp-mt - (jp-jfirstr)*irr + ifirstr-1
    6660             :                   qout(ip,jp,kp,m) = ga_r8_r(recv_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i)
    6661             :                enddo
    6662             :             enddo
    6663             :             offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j)
    6664             :          enddo
    6665             : 
    6666             :       endif
    6667             : 
    6668             : ! scatter data from global receive buffer to final destination
    6669             :       do istep=1,steps
    6670             :          p = swapids(istep)
    6671             : 
    6672             :          if (.not. alltoall) then
    6673             :             if (istep > steps-maxreqh) then
    6674             :                call mpi_wait( rcvids(istep), OutStats, ierr )
    6675             :             endif
    6676             :          endif
    6677             : 
    6678             :          if (rcvlths(p) .ne. 0) then
    6679             : 
    6680             :             offset_v = rdispls(p)
    6681             :             do j = 1, recv_bl(p+1)%Nparcels
    6682             :                do m = mbeg, mend
    6683             :                   do i = 1, recv_bl(p+1)%blocksizes(j)
    6684             :                      mp = recv_bl(p+1)%displacements(j)+i
    6685             :                      kp = kfirstr + (mp-1)/jirr
    6686             :                      mt = (kp-kfirstr)*jirr
    6687             :                      jp = jfirstr + (mp-mt-1)/irr
    6688             :                      ip = mp-mt - (jp-jfirstr)*irr + ifirstr-1
    6689             :                      qout(ip,jp,kp,m) = ga_r8_r(recv_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i)
    6690             :                   enddo
    6691             :                enddo
    6692             :                offset_v = offset_v + recv_bl(p+1)%blocksizes(j)
    6693             :             enddo
    6694             : 
    6695             :          endif
    6696             :       enddo
    6697             : 
    6698             : ! Wait for any outstanding send requests to complete.
    6699             :       if (.not. alltoall .and. .not. sendd) then
    6700             :          call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr )
    6701             :       endif
    6702             : 
    6703             : ! clean-up
    6704             : ! make used portion of storage window available for reuse
    6705             :       r8_win%ncall_s = r8_win%ncall_s - 1
    6706             : 
    6707             : #if defined( MODCM_TIMING )
    6708             :       call t_stopf('mod_comm communication')
    6709             : #endif
    6710             : 
    6711             : !EOC
    6712             :       end subroutine mp_swaptrirr
    6713             : # endif
    6714             : !------------------------------------------------------------------------------
    6715             : #endif
    6716           0 :       end module mod_comm
    6717             : 

Generated by: LCOV version 1.14