cycle.c
References to this file elsewhere.
1 #ifndef MS_SUA
2 # include <stdio.h>
3 #endif
4 #include <fcntl.h>
5
6 #define STANDARD_ERROR 2
7
8 #define STANDARD_OUTPUT 1
9
10 #ifndef STUBMPI
11 # include "mpi.h"
12 #endif
13 #include "rsl_lite.h"
14
/*
 * Integer rounding helpers.
 *
 *   UP_EVEN(A)   - A if A is even, otherwise A+1
 *   DOWN_EVEN(A) - A if A is even, otherwise A-1
 *   UP_ODD(A)    - A if A is odd,  otherwise A+1
 *   DOWN_ODD(A)  - A if A is odd,  otherwise A-1
 *   MIN / MAX    - smaller / larger of the two arguments
 *
 * The parity test is written as a comparison, which yields 0 or 1 for both
 * positive and negative operands.  Every macro may evaluate its arguments
 * more than once, so only pass expressions without side effects.
 */
#define UP_EVEN(A)   ((A) + ((A) % 2 != 0))
#define DOWN_EVEN(A) ((A) - ((A) % 2 != 0))
#define UP_ODD(A)    ((A) + (((A) + 1) % 2 != 0))
#define DOWN_ODD(A)  ((A) - (((A) + 1) % 2 != 0))
#define MIN(A,B)     (((A) < (B)) ? (A) : (B))
#define MAX(A,B)     (((A) > (B)) ? (A) : (B))
21
22 static int *y_curs_src = NULL ;
23 static int *x_curs_src = NULL ;
24 static int *y_curs_dst = NULL ;
25 static int *x_curs_dst = NULL ;
26 static int *x_peermask_src = NULL ;
27 static int *x_peermask_dst = NULL ;
28 static int *nbytes_src = NULL ;
29 static int *nbytes_dst = NULL ;
30
31 #ifndef STUBMPI
32 static MPI_Request *x_recv = NULL , *x_send = NULL ;
33 #endif
34
35 RSL_LITE_INIT_CYCLE ( int * Fcomm ,
36 int * xy0 , int * inout0 ,
37 int * n3dR0, int *n2dR0, int * typesizeR0 ,
38 int * n3dI0, int *n2dI0, int * typesizeI0 ,
39 int * n3dD0, int *n2dD0, int * typesizeD0 ,
40 int * n3dL0, int *n2dL0, int * typesizeL0 ,
41 int * me0, int * np0 , int * np_x0 , int * np_y0 ,
42 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
43 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
44 {
45 int n3dR, n2dR, typesizeR ;
46 int n3dI, n2dI, typesizeI ;
47 int n3dD, n2dD, typesizeD ;
48 int n3dL, n2dL, typesizeL ;
49 int xy, inout ;
50 int me, np, np_x, np_y, np_dim ;
51 int ids , ide , jds , jde , kds , kde ;
52 int ips , ipe , jps , jpe , kps , kpe ;
53 int ips_send , ipe_send ;
54 int npts, i, ii, j, jj, m, n, ps, pe, ops, ope ;
55 int Px, Py, P, Q, swap, coords[2] ;
56 #ifndef STUBMPI
57 MPI_Comm *comm, dummy_comm ;
58
59 comm = &dummy_comm ;
60 *comm = MPI_Comm_f2c( *Fcomm ) ;
61
62 xy = *xy0 ;
63 inout = *inout0 ; /* 1 is in (uncycled to cycled) 0 is out */
64 n3dR = *n3dR0 ; n2dR = *n2dR0 ; typesizeR = *typesizeR0 ;
65 n3dI = *n3dI0 ; n2dI = *n2dI0 ; typesizeI = *typesizeI0 ;
66 n3dD = *n3dD0 ; n2dD = *n2dD0 ; typesizeD = *typesizeD0 ;
67 n3dL = *n3dL0 ; n2dL = *n2dL0 ; typesizeL = *typesizeL0 ;
68 me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
69 ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
70 ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
71
72 if ( nbytes_src == NULL ) nbytes_src = RSL_MALLOC ( int , np ) ;
73 if ( nbytes_dst == NULL ) nbytes_dst = RSL_MALLOC ( int , np ) ;
74 if ( x_curs_src == NULL ) x_curs_src = RSL_MALLOC ( int , np ) ;
75 if ( x_curs_dst == NULL ) x_curs_dst = RSL_MALLOC ( int , np ) ;
76 if ( x_peermask_src == NULL ) x_peermask_src = RSL_MALLOC ( int , np ) ;
77 if ( x_peermask_dst == NULL ) x_peermask_dst = RSL_MALLOC ( int , np ) ;
78 if ( x_recv == NULL ) x_recv = RSL_MALLOC ( MPI_Request , np ) ;
79 if ( x_send == NULL ) x_send = RSL_MALLOC ( MPI_Request , np ) ;
80 for ( i = 0 ; i < np ; i++ ) { nbytes_src[i] = 0 ; x_curs_src[i] = 0 ; x_peermask_src[i] = 0 ; }
81 for ( i = 0 ; i < np ; i++ ) { nbytes_dst[i] = 0 ; x_curs_dst[i] = 0 ; x_peermask_dst[i] = 0 ; }
82
83 if ( xy == 1 ) { /* xy = 1, cycle in X, otherwise Y */
84 np_dim = np_x ;
85 ps = ips ;
86 pe = ipe ;
87 ops = jps ;
88 ope = jpe ;
89 m = (ide-ids+1)/np_dim ;
90 n = (m*np_dim)/m ;
91 } else {
92 np_dim = np_y ;
93 ps = jps ;
94 pe = jpe ;
95 ops = ips ;
96 ope = ipe ;
97 m = (jde-jds+1)/np_dim ;
98 n = (m*np_dim)/m ;
99 }
100
101 for ( i = ps ; i <= MIN(pe,m*np_dim) ; i++ ) {
102 ii = (i/n) + (i%n)*m ;
103 jj = (i/m) + (i%m)*n ;
104 if ( xy == 1 ) {
105 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
106 coords[1] = Px ; coords[0] = Py ;
107 MPI_Cart_rank( *comm, coords, &P ) ;
108 TASK_FOR_POINT ( &jj , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
109 coords[1] = Px ; coords[0] = Py ;
110 MPI_Cart_rank( *comm, coords, &Q ) ;
111 } else {
112 TASK_FOR_POINT ( &ips , &ii , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
113 coords[1] = Px ; coords[0] = Py ;
114 MPI_Cart_rank( *comm, coords, &P ) ;
115 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
116 coords[1] = Px ; coords[0] = Py ;
117 MPI_Cart_rank( *comm, coords, &Q ) ;
118 }
119 if ( inout == 0 ) { swap = P ; P = Q ; Q = swap ; }
120
121 nbytes_src[P] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
122 typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
123 typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
124 typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
125
126 nbytes_dst[Q] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
127 typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
128 typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
129 typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
130 }
131
132 for ( P = 0 ; P < np ; P++ ) {
133 buffer_for_proc ( P , nbytes_src[P], RSL_SENDBUF ) ;
134 buffer_for_proc ( P , nbytes_dst[P], RSL_RECVBUF ) ;
135 }
136 #endif
137 }
138
139 RSL_LITE_PACK_CYCLE ( int * Fcomm, char * buf , int * inout0 , int * typesize0 , int * xy0 , int * pu0 , char * memord , int * xstag0 ,
140 int *me0, int * np0 , int * np_x0 , int * np_y0 ,
141 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
142 int * ims0 , int * ime0 , int * jms0 , int * jme0 , int * kms0 , int * kme0 ,
143 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
144 {
145 int me, np, np_x, np_y, np_dim ;
146 int inout , typesize ;
147 int ids , ide , jds , jde , kds , kde ;
148 int ims , ime , jms , jme , kms , kme ;
149 int ips , ipe , jps , jpe , kps , kpe ;
150 int xstag ; /* 0 not stag, 1 stag */
151 int xy ; /* y = 0 , x = 1 */
152 int pu ; /* pack = 0 , unpack = 1 */
153 int i, ii, j, jj, m, n ;
154 int ps, pe, ops, ope ;
155 register int k, t ;
156 #ifdef crayx1
157 register int i2,i3,i4,i_offset;
158 #endif
159 char *p ;
160 int da_buf ;
161 int Px, Py, P, coords[2] ;
162 int ierr = 0 ;
163 register int *pi, *qi ;
164 float f ;
165 #ifndef STUBMPI
166 MPI_Comm *comm, dummy_comm ;
167
168 comm = &dummy_comm ;
169 *comm = MPI_Comm_f2c( *Fcomm ) ;
170
171 me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
172 xstag = *xstag0 ;
173 inout = *inout0 ; typesize = *typesize0 ;
174 ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
175 ims = *ims0-1 ; ime = *ime0-1 ; jms = *jms0-1 ; jme = *jme0-1 ; kms = *kms0-1 ; kme = *kme0-1 ;
176 ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
177 xy = *xy0 ;
178 pu = *pu0 ;
179
180 /* need to adapt for other memory orders */
181 #define IMAX(A) (((A)>ids)?(A):ids)
182 #define IMIN(A) (((A)<ide)?(A):ide)
183 #define JMAX(A) (((A)>jds)?(A):jds)
184 #define JMIN(A) (((A)<jde)?(A):jde)
185
186 da_buf = ( pu == 0 ) ? RSL_SENDBUF : RSL_RECVBUF ;
187
188 if ( xy == 1 ) { /* xy = 1, cycle in X, otherwise Y */
189 np_dim = np_x ;
190 ps = ips ;
191 pe = ipe ;
192 m = (ide-ids+1)/np_dim ;
193 n = (m*np_dim)/m ;
194 } else {
195 np_dim = np_y ;
196 ps = jps ;
197 pe = jpe ;
198 m = (jde-jds+1)/np_dim ;
199 n = (m*np_dim)/m ;
200 }
201
202 if ( np_x > 1 && xy == 1 ) {
203
204 for ( i = ips ; i <= MIN(ipe,m*np_dim-1) ; i++ ) {
205 if ( pu == 0 ) {
206 ii = (inout)?(i/n)+(i%n)*m:(i/m)+(i%m)*n ;
207 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
208 coords[1] = Px ; coords[0] = Py ;
209 MPI_Cart_rank( *comm, coords, &P ) ;
210 p = buffer_for_proc( P , 0 , da_buf ) ;
211 if ( typesize == sizeof(int) ) {
212 for ( j = jps ; j <= jpe ; j++ ) {
213 for ( k = kps ; k <= kpe ; k++ ) {
214 pi = (int *)(p+x_curs_src[P]) ;
215 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
216 (k-kms) + (j-jms)*(kme-kms+1))))) ;
217 *pi++ = *qi++ ;
218 x_curs_src[P] += typesize ;
219 }
220 }
221 }
222 else {
223 for ( j = jps ; j <= jpe ; j++ ) {
224 for ( k = kps ; k <= kpe ; k++ ) {
225 for ( t = 0 ; t < typesize ; t++ ) {
226 *(p+x_curs_src[P]) =
227 *(buf + t + typesize*(
228 (i-ims) + (ime-ims+1)*(
229 (k-kms) + (j-jms)*(kme-kms+1))) ) ;
230 x_curs_src[P]++ ;
231 }
232 }
233 }
234 }
235 } else {
236 ii = (inout)?(i/m)+(i%m)*n:(i/n)+(i%n)*m ;
237 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
238 coords[1] = Px ; coords[0] = Py ;
239 MPI_Cart_rank( *comm, coords, &P ) ;
240 p = buffer_for_proc( P , 0 , da_buf ) ;
241 if ( typesize == sizeof(int) ) {
242 for ( j = jps ; j <= jpe ; j++ ) {
243 for ( k = kps ; k <= kpe ; k++ ) {
244 pi = (int *)(p+x_curs_dst[P]) ;
245 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
246 (k-kms) + (j-jms)*(kme-kms+1))))) ;
247 *qi++ = *pi++ ;
248 x_curs_dst[P] += typesize ;
249 }
250 }
251 }
252 else {
253 for ( j = jps ; j <= jpe ; j++ ) {
254 for ( k = kps ; k <= kpe ; k++ ) {
255 for ( t = 0 ; t < typesize ; t++ ) {
256 *(buf + t + typesize*(
257 (i-ims) + (ime-ims+1)*(
258 (k-kms) + (j-jms)*(kme-kms+1))) ) =
259 *(p+x_curs_dst[P]) ;
260 x_curs_dst[P]++ ;
261 }
262 }
263 }
264 }
265 }
266 }
267 } else if ( np_y > 1 && xy == 0 ) {
268 for ( j = jps ; j <= MIN(jpe,m*np_dim-1) ; j++ ) {
269 if ( pu == 0 ) {
270 jj = (inout)?(j/n) + (j%n)*m:(j/m) + (j%m)*n ;
271 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
272 coords[1] = Px ; coords[0] = Py ;
273 MPI_Cart_rank( *comm, coords, &P ) ;
274 p = buffer_for_proc( P , 0 , da_buf ) ;
275 if ( typesize == sizeof(int) ) {
276 for ( i = ips ; i <= ipe ; i++ ) {
277 for ( k = kps ; k <= kpe ; k++ ) {
278 pi = (int *)(p+x_curs_src[P]) ;
279 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
280 (k-kms) + (j-jms)*(kme-kms+1))))) ;
281 *pi++ = *qi++ ;
282 x_curs_src[P] += typesize ;
283 }
284 }
285 }
286 else {
287 for ( i = ips ; i <= ipe ; i++ ) {
288 for ( k = kps ; k <= kpe ; k++ ) {
289 for ( t = 0 ; t < typesize ; t++ ) {
290 *(p+x_curs_src[P]) =
291 *(buf + t + typesize*(
292 (i-ims) + (ime-ims+1)*(
293 (k-kms) + (j-jms)*(kme-kms+1))) ) ;
294 x_curs_src[P]++ ;
295 }
296 }
297 }
298 }
299 } else {
300 jj = (inout)?(j/m) + (j%m)*n:(j/n) + (j%n)*m ;
301 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
302 coords[1] = Px ; coords[0] = Py ;
303 MPI_Cart_rank( *comm, coords, &P ) ;
304 p = buffer_for_proc( P , 0 , da_buf ) ;
305 if ( typesize == sizeof(int) ) {
306 for ( i = ips ; i <= ipe ; i++ ) {
307 for ( k = kps ; k <= kpe ; k++ ) {
308 pi = (int *)(p+x_curs_dst[P]) ;
309 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
310 (k-kms) + (j-jms)*(kme-kms+1))))) ;
311 *qi++ = *pi++ ;
312 x_curs_dst[P] += typesize ;
313 }
314 }
315 }
316 else {
317 for ( i = ips ; i <= ipe ; i++ ) {
318 for ( k = kps ; k <= kpe ; k++ ) {
319 for ( t = 0 ; t < typesize ; t++ ) {
320 *(buf + t + typesize*(
321 (i-ims) + (ime-ims+1)*(
322 (k-kms) + (j-jms)*(kme-kms+1))) ) =
323 *(p+x_curs_dst[P]) ;
324 x_curs_dst[P]++ ;
325 }
326 }
327 }
328 }
329 }
330 }
331 }
332 #endif
333 }
334
335 RSL_LITE_CYCLE ( int * Fcomm0, int *me0, int * np0 , int * np_x0 , int * np_y0 )
336 {
337 int me, np, np_x, np_y ;
338 int yp, ym, xp, xm, nb ;
339 MPI_Status stat ;
340 #ifndef STUBMPI
341 MPI_Comm comm, *comm0, dummy_comm ;
342 int i, P ;
343
344 comm0 = &dummy_comm ;
345 *comm0 = MPI_Comm_f2c( *Fcomm0 ) ;
346
347 comm = *comm0 ; me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
348
349 for ( P = 0 ; P < np ; P++ ) {
350 nb = buffer_size_for_proc( P, RSL_RECVBUF ) ;
351 MPI_Irecv ( buffer_for_proc( P, 0, RSL_RECVBUF ), nb, MPI_CHAR, P, me, comm, &(x_recv[P]) ) ;
352 MPI_Isend ( buffer_for_proc( P, 0, RSL_SENDBUF ), x_curs_src[P], MPI_CHAR, P, P, comm, &(x_send[P]) ) ;
353 }
354 for ( P = 0 ; P < np ; P++ ) {
355 MPI_Wait( &x_recv[P], &stat ) ;
356 MPI_Wait( &x_send[P], &stat ) ;
357 }
358 for ( i = 0 ; i < np ; i++ ) { x_curs_src[i] = 0 ; x_curs_dst[i] ; }
359 #endif
360 }
361