Line data Source code
1 : !-----------------------------------------------------------------------
2 : !BOP
3 : ! !ROUTINE: par_xsum --- Calculate x-sum bit-wise consistently
4 : !
5 : ! !INTERFACE:
6 : !****6***0*********0*********0*********0*********0*********0**********72
7 7872 : subroutine par_xsum(grid, a, ltot, sum)
8 : !****6***0*********0*********0*********0*********0*********0**********72
9 : !
10 : ! !USES:
11 : #if defined ( SPMD )
12 : use parutilitiesmodule, only : parexchangevector
13 : #endif
14 : use dynamics_vars, only : T_FVDYCORE_GRID
15 : use shr_kind_mod, only: r8 => shr_kind_r8
16 : use shr_reprosum_mod, only : shr_reprosum_calc, shr_reprosum_tolExceeded, &
17 : shr_reprosum_reldiffmax, &
18 : shr_reprosum_recompute
19 : use cam_logfile, only : iulog
20 : use FVperf_module, only : FVstartclock, FVstopclock
21 :
22 : implicit none
23 :
24 : ! !INPUT PARAMETERS:
25 : type (T_FVDYCORE_GRID), intent(in) :: grid
26 : integer, intent(in) :: ltot ! number of quantities to be summed
27 : ! input vector to be summed
28 : real (r8), intent(in) :: a(grid%ifirstxy:grid%ilastxy,ltot)
29 :
30 : ! !OUTPUT PARAMETERS:
31 : real (r8) sum(ltot) ! sum of all vector entries
32 :
33 : ! !DESCRIPTION:
34 : ! This subroutine calculates the sum of "a" in a reproducible
35 : ! (sequentialized) fashion which should give bit-wise identical
36 : ! results irrespective of the number of MPI processes.
37 : !
38 : ! !CALLED FROM:
39 : ! te_map
40 : !
41 : ! !REVISION HISTORY:
42 : !
43 : ! AAM 00.11.01 : Created
44 : ! WS 03.10.22 : pmgrid removed (now spmd_dyn)
45 : ! WS 04.10.04 : added grid as an argument; removed spmd_dyn
46 : ! WS 05.05.25 : removed ifirst, ilast, im as arguments (in grid)
47 : ! PW 08.06.25 : added fixed point reproducible sum
48 : !
49 : !EOP
50 : !---------------------------------------------------------------------
51 : !BOC
52 :
53 : ! !Local
54 : real(r8), parameter :: D0_0 = 0.0_r8
55 :
56 15744 : real(r8) :: rel_diff(2,ltot)
57 7872 : real(r8),allocatable :: l_a(:)
58 7872 : real(r8),allocatable :: a_tmp(:)
59 :
60 : integer :: i,ipe,l,im,lim,nprxy_x,offset
61 15744 : integer :: sendcount(grid%nprxy_x)
62 15744 : integer :: recvcount(grid%nprxy_x)
63 :
64 : logical :: write_warning
65 :
66 7872 : im = grid%im
67 7872 : lim = (grid%ilastxy-grid%ifirstxy) + 1
68 7872 : nprxy_x = grid%nprxy_x
69 7872 : offset = grid%ifirstxy - 1
70 :
71 7872 : call FVstartclock(grid,'xsum_reprosum')
72 : call shr_reprosum_calc(a, sum, lim, lim, ltot, gbl_count=im, &
73 7872 : commid=grid%commxy_x, rel_diff=rel_diff)
74 7872 : call FVstopclock(grid,'xsum_reprosum')
75 :
76 : ! check that "fast" reproducible sum is accurate enough. If not, calculate
77 : ! using old method
78 7872 : write_warning = .false.
79 7872 : if (grid%myidxy_x == 0) write_warning = .true.
80 7872 : if ( shr_reprosum_tolExceeded('par_xsum', ltot, write_warning, &
81 : iulog, rel_diff) ) then
82 0 : if ( shr_reprosum_recompute ) then
83 0 : call FVstartclock(grid,'xsum_sumfix')
84 0 : allocate( l_a(lim*nprxy_x) )
85 0 : allocate( a_tmp(im) )
86 0 : sendcount(:) = lim
87 :
88 0 : do l=1,ltot
89 0 : if (rel_diff(1,l) > shr_reprosum_reldiffmax) then
90 0 : sum(l) = D0_0
91 : #if defined ( SPMD )
92 0 : do ipe=1,nprxy_x
93 0 : do i=1,lim
94 0 : l_a(i+(ipe-1)*lim) = a(i+offset,l)
95 : enddo
96 : enddo
97 : call parexchangevector( grid%commxy_x, sendcount, l_a, &
98 0 : recvcount, a_tmp )
99 0 : do i=1,im
100 0 : sum(l) = sum(l) + a_tmp(i)
101 : enddo
102 : #else
103 : do i=1,im
104 : sum(l) = sum(l) + a(i,l)
105 : enddo
106 : #endif
107 : endif
108 :
109 : enddo
110 :
111 0 : deallocate( a_tmp )
112 0 : deallocate( l_a )
113 0 : call FVstopclock(grid,'xsum_sumfix')
114 : endif
115 : endif
116 :
117 7872 : return
118 : !EOC
119 7872 : end subroutine par_xsum
120 : !-----------------------------------------------------------------------
121 :
122 : !-----------------------------------------------------------------------
123 : !BOP
124 : ! !ROUTINE: par_xsum_r4 --- Calculate x-sum bit-wise consistently (real4)
125 : !
126 : ! !INTERFACE:
127 : !****6***0*********0*********0*********0*********0*********0**********72
128 0 : subroutine par_xsum_r4(grid, a, ltot, sum)
129 : !****6***0*********0*********0*********0*********0*********0**********72
130 : !
131 : ! !USES:
132 : #if defined ( SPMD )
133 : use parutilitiesmodule, only : parexchangevector
134 : #endif
135 : use dynamics_vars, only : T_FVDYCORE_GRID
136 : use shr_kind_mod, only: r8 => shr_kind_r8, r4 => shr_kind_r4
137 : use shr_reprosum_mod, only : shr_reprosum_calc, shr_reprosum_tolExceeded, &
138 : shr_reprosum_reldiffmax, &
139 : shr_reprosum_recompute
140 : use cam_logfile, only : iulog
141 : use FVperf_module, only : FVstartclock, FVstopclock
142 :
143 : implicit none
144 :
145 : ! !INPUT PARAMETERS:
146 : type (T_FVDYCORE_GRID), intent(in) :: grid
147 : integer, intent(in) :: ltot ! number of quantities to be summed
148 : real (r4) a(grid%ifirstxy:grid%ilastxy,ltot) ! input vector to be summed
149 :
150 : ! !OUTPUT PARAMETERS:
151 : real (r8) sum(ltot) ! sum of all vector entries
152 :
153 : ! !DESCRIPTION:
154 : ! This subroutine calculates the sum of "a" in a reproducible
155 : ! (sequentialized) fashion which should give bit-wise identical
156 : ! results irrespective of the number of MPI processes.
157 : !
158 : ! !REVISION HISTORY:
159 : !
160 : ! WS 05.04.08 : Created from par_xsum
161 : ! WS 05.05.25 : removed ifirst, ilast, im as arguments (in grid)
162 : ! WS 06.06.28 : Fixed bug in sequential version
163 : ! PW 08.06.25 : added fixed point reproducible sum
164 : !
165 : !EOP
166 : !---------------------------------------------------------------------
167 : !BOC
168 :
169 : ! !Local
170 : real(r8), parameter :: D0_0 = 0.0_r8
171 :
172 0 : real(r8) :: a8(grid%ifirstxy:grid%ilastxy,ltot)
173 0 : real(r8) :: rel_diff(2,ltot)
174 0 : real(r4),allocatable :: l_a(:)
175 0 : real(r4),allocatable :: a_tmp(:)
176 :
177 : integer i,ipe,l,im,lim,nprxy_x,offset
178 0 : integer sendcount(grid%nprxy_x)
179 0 : integer recvcount(grid%nprxy_x)
180 :
181 : logical :: write_warning
182 :
183 0 : im = grid%im
184 0 : lim = (grid%ilastxy-grid%ifirstxy) + 1
185 0 : nprxy_x = grid%nprxy_x
186 0 : offset = grid%ifirstxy - 1
187 :
188 0 : call FVstartclock(grid,'xsum_r4_reprosum')
189 0 : a8(:,:) = a(:,:)
190 : call shr_reprosum_calc(a8, sum, lim, lim, ltot, gbl_count=im, &
191 0 : commid=grid%commxy_x, rel_diff=rel_diff)
192 0 : call FVstopclock(grid,'xsum_r4_reprosum')
193 :
194 : ! check that "fast" reproducible sum is accurate enough. If not, calculate
195 : ! using old method
196 0 : write_warning = .false.
197 0 : if (grid%myidxy_x == 0) write_warning = .true.
198 0 : if ( shr_reprosum_tolExceeded('par_xsum_r4', ltot, write_warning, &
199 : iulog, rel_diff) ) then
200 0 : if ( shr_reprosum_recompute ) then
201 0 : call FVstartclock(grid,'xsum_r4_sumfix')
202 0 : allocate( l_a(lim*nprxy_x) )
203 0 : allocate( a_tmp(im) )
204 0 : sendcount(:) = lim
205 :
206 0 : do l=1,ltot
207 0 : if (rel_diff(1,l) > shr_reprosum_reldiffmax) then
208 0 : sum(l) = D0_0
209 : #if defined ( SPMD )
210 0 : do ipe=1,nprxy_x
211 0 : do i=1,lim
212 0 : l_a(i+(ipe-1)*lim) = a(i+offset,l)
213 : enddo
214 : enddo
215 : call parexchangevector( grid%commxy_x, sendcount, l_a, &
216 0 : recvcount, a_tmp )
217 0 : do i=1,im
218 0 : sum(l) = sum(l) + a_tmp(i)
219 : enddo
220 : #else
221 : do i=1,im
222 : sum(l) = sum(l) + a(i,l)
223 : enddo
224 : #endif
225 : endif
226 :
227 : enddo
228 :
229 0 : deallocate( a_tmp )
230 0 : deallocate( l_a )
231 0 : call FVstopclock(grid,'xsum_r4_sumfix')
232 : endif
233 : endif
234 :
235 0 : return
236 : !EOC
237 0 : end subroutine par_xsum_r4
238 : !-----------------------------------------------------------------------
|