cycle.c

References to this file elsewhere.
#ifndef MS_SUA
# include <stdio.h>
#endif
#include <fcntl.h>
#include <string.h>

#define STANDARD_ERROR 2

#define STANDARD_OUTPUT 1

#ifndef STUBMPI
#  include "mpi.h"
#endif
#include "rsl_lite.h"
14 
/* Integer rounding helpers.  Each one moves its argument by at most one
   so that the result has the requested parity; a value that already has
   that parity is returned unchanged.  abs() comes from <stdlib.h>.
   MIN_/MAX-style macros evaluate their arguments more than once, so do
   not pass expressions with side effects. */
#define  UP_EVEN(v)   ( (v) + abs( (v) % 2 ) )
#define  DOWN_EVEN(v) ( (v) - abs( (v) % 2 ) )
#define  UP_ODD(v)    ( (v) + abs( ( (v) + 1 ) % 2 ) )
#define  DOWN_ODD(v)  ( (v) - abs( ( (v) + 1 ) % 2 ) )
#define  MIN(a,b)     ( (b) > (a) ? (a) : (b) )
#define  MAX(a,b)     ( (b) < (a) ? (a) : (b) )
21 
22 static int *y_curs_src = NULL ;
23 static int *x_curs_src = NULL ;
24 static int *y_curs_dst = NULL ;
25 static int *x_curs_dst = NULL ;
26 static int *x_peermask_src = NULL ;
27 static int *x_peermask_dst = NULL ;
28 static int *nbytes_src = NULL ; 
29 static int *nbytes_dst = NULL ; 
30 
31 #ifndef STUBMPI
32 static MPI_Request *x_recv = NULL ,  *x_send = NULL ;
33 #endif
34 
35 RSL_LITE_INIT_CYCLE (  int * Fcomm ,
36                 int * xy0 , int * inout0 ,
37                 int * n3dR0, int *n2dR0, int * typesizeR0 , 
38                 int * n3dI0, int *n2dI0, int * typesizeI0 , 
39                 int * n3dD0, int *n2dD0, int * typesizeD0 , 
40                 int * n3dL0, int *n2dL0, int * typesizeL0 , 
41                 int * me0, int * np0 , int * np_x0 , int * np_y0 ,
42                 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
43                 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
44 {
45   int n3dR, n2dR, typesizeR ;
46   int n3dI, n2dI, typesizeI ;
47   int n3dD, n2dD, typesizeD ;
48   int n3dL, n2dL, typesizeL ;
49   int xy, inout ;
50   int me, np, np_x, np_y, np_dim ;
51   int ids , ide , jds , jde , kds , kde ;
52   int ips , ipe , jps , jpe , kps , kpe ;
53   int ips_send , ipe_send ;
54   int npts, i, ii, j, jj, m, n, ps, pe, ops, ope ;
55   int Px, Py, P, Q, swap, coords[2] ;
56 #ifndef STUBMPI
57   MPI_Comm *comm, dummy_comm ;
58 
59   comm = &dummy_comm ;
60   *comm = MPI_Comm_f2c( *Fcomm ) ;
61 
62   xy = *xy0 ;
63   inout = *inout0 ;     /* 1 is in (uncycled to cycled) 0 is out */
64   n3dR = *n3dR0 ; n2dR = *n2dR0 ; typesizeR = *typesizeR0 ;
65   n3dI = *n3dI0 ; n2dI = *n2dI0 ; typesizeI = *typesizeI0 ;
66   n3dD = *n3dD0 ; n2dD = *n2dD0 ; typesizeD = *typesizeD0 ;
67   n3dL = *n3dL0 ; n2dL = *n2dL0 ; typesizeL = *typesizeL0 ;
68   me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
69   ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
70   ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
71 
72   if ( nbytes_src == NULL ) nbytes_src = RSL_MALLOC ( int , np ) ;
73   if ( nbytes_dst == NULL ) nbytes_dst = RSL_MALLOC ( int , np ) ;
74   if ( x_curs_src == NULL ) x_curs_src = RSL_MALLOC ( int , np ) ;
75   if ( x_curs_dst == NULL ) x_curs_dst = RSL_MALLOC ( int , np ) ;
76   if ( x_peermask_src == NULL ) x_peermask_src = RSL_MALLOC ( int , np ) ;
77   if ( x_peermask_dst == NULL ) x_peermask_dst = RSL_MALLOC ( int , np ) ;
78   if ( x_recv == NULL ) x_recv = RSL_MALLOC ( MPI_Request , np ) ;
79   if ( x_send == NULL ) x_send = RSL_MALLOC ( MPI_Request , np ) ;
80   for ( i = 0 ; i < np ; i++ ) { nbytes_src[i] = 0 ; x_curs_src[i] = 0 ; x_peermask_src[i] = 0 ; }
81   for ( i = 0 ; i < np ; i++ ) { nbytes_dst[i] = 0 ; x_curs_dst[i] = 0 ; x_peermask_dst[i] = 0 ; }
82 
83   if ( xy == 1 ) {   /* xy = 1, cycle in X, otherwise Y */
84     np_dim = np_x ;
85     ps = ips ;
86     pe = ipe ;
87     ops = jps ;
88     ope = jpe ;
89     m = (ide-ids+1)/np_dim ;
90     n = (m*np_dim)/m ;
91   } else {
92     np_dim = np_y ;
93     ps = jps ;
94     pe = jpe ;
95     ops = ips ;
96     ope = ipe ;
97     m = (jde-jds+1)/np_dim ;
98     n = (m*np_dim)/m ;
99   }
100 
101   for ( i = ps ; i <= MIN(pe,m*np_dim) ; i++ ) {
102     ii = (i/n) + (i%n)*m ;
103     jj = (i/m) + (i%m)*n ;
104     if ( xy == 1 ) {
105       TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
106       coords[1] = Px ; coords[0] = Py ;
107       MPI_Cart_rank( *comm, coords, &P ) ;
108       TASK_FOR_POINT ( &jj , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
109       coords[1] = Px ; coords[0] = Py ;
110       MPI_Cart_rank( *comm, coords, &Q ) ;
111     } else {
112       TASK_FOR_POINT ( &ips , &ii , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
113       coords[1] = Px ; coords[0] = Py ;
114       MPI_Cart_rank( *comm, coords, &P ) ;
115       TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
116       coords[1] = Px ; coords[0] = Py ;
117       MPI_Cart_rank( *comm, coords, &Q ) ;
118     }
119     if ( inout == 0 ) { swap = P ; P = Q ; Q = swap ; }
120 
121     nbytes_src[P] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
122                      typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
123                      typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
124                      typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
125 
126     nbytes_dst[Q] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
127                      typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
128                      typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
129                      typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
130   }
131 
132   for ( P = 0 ; P < np ; P++ ) {
133        buffer_for_proc ( P , nbytes_src[P], RSL_SENDBUF ) ;
134        buffer_for_proc ( P , nbytes_dst[P], RSL_RECVBUF ) ;
135   }
136 #endif
137 }
138 
139 RSL_LITE_PACK_CYCLE ( int * Fcomm, char * buf , int * inout0 , int * typesize0 , int * xy0 , int * pu0 , char * memord , int * xstag0 ,
140            int *me0, int * np0 , int * np_x0 , int * np_y0 , 
141            int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
142            int * ims0 , int * ime0 , int * jms0 , int * jme0 , int * kms0 , int * kme0 ,
143            int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
144 {
145   int me, np, np_x, np_y, np_dim ;
146   int inout , typesize ;
147   int ids , ide , jds , jde , kds , kde ;
148   int ims , ime , jms , jme , kms , kme ;
149   int ips , ipe , jps , jpe , kps , kpe ;
150   int xstag ;  /* 0 not stag, 1 stag */
151   int xy ;   /* y = 0 , x = 1 */
152   int pu ;   /* pack = 0 , unpack = 1 */
153   int i, ii, j, jj, m, n  ;
154   int ps, pe, ops, ope ;
155   register int k, t ;
156 #ifdef crayx1
157   register int i2,i3,i4,i_offset;
158 #endif
159   char *p ;
160   int da_buf ;
161   int Px, Py, P, coords[2] ;
162   int ierr = 0 ;
163   register int *pi, *qi ;
164   float f ;
165 #ifndef STUBMPI
166   MPI_Comm *comm, dummy_comm ;
167 
168   comm = &dummy_comm ;
169   *comm = MPI_Comm_f2c( *Fcomm ) ;
170 
171   me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
172   xstag = *xstag0 ;
173   inout = *inout0 ; typesize = *typesize0 ;
174   ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
175   ims = *ims0-1 ; ime = *ime0-1 ; jms = *jms0-1 ; jme = *jme0-1 ; kms = *kms0-1 ; kme = *kme0-1 ;
176   ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
177   xy = *xy0 ;
178   pu = *pu0 ;
179 
180 /* need to adapt for other memory orders */
181 #define IMAX(A) (((A)>ids)?(A):ids)
182 #define IMIN(A) (((A)<ide)?(A):ide)
183 #define JMAX(A) (((A)>jds)?(A):jds)
184 #define JMIN(A) (((A)<jde)?(A):jde)
185 
186   da_buf = ( pu == 0 ) ? RSL_SENDBUF : RSL_RECVBUF ;
187 
188   if ( xy == 1 ) {   /* xy = 1, cycle in X, otherwise Y */
189     np_dim = np_x ;
190     ps = ips ;
191     pe = ipe ;
192     m = (ide-ids+1)/np_dim ;
193     n = (m*np_dim)/m ;
194   } else {
195     np_dim = np_y ;
196     ps = jps ;
197     pe = jpe ;
198     m = (jde-jds+1)/np_dim ;
199     n = (m*np_dim)/m ;
200   }
201 
202   if ( np_x > 1 && xy == 1 ) {
203 
204     for ( i = ips ; i <= MIN(ipe,m*np_dim-1) ; i++ ) {
205       if ( pu == 0 ) {
206         ii = (inout)?(i/n)+(i%n)*m:(i/m)+(i%m)*n  ;
207         TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
208         coords[1] = Px ; coords[0] = Py ;
209         MPI_Cart_rank( *comm, coords, &P ) ;
210         p = buffer_for_proc( P , 0 , da_buf ) ;
211 	if ( typesize == sizeof(int) ) {
212           for ( j = jps ; j <= jpe ; j++ ) {
213             for ( k = kps ; k <= kpe ; k++ ) {
214 	      pi = (int *)(p+x_curs_src[P]) ;
215 	      qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
216                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
217 	      *pi++ = *qi++ ;
218 	      x_curs_src[P] += typesize ;
219 	    }
220 	  }
221 	}
222 	else {
223           for ( j = jps ; j <= jpe ; j++ ) {
224             for ( k = kps ; k <= kpe ; k++ ) {
225               for ( t = 0 ; t < typesize ; t++ ) {
226                 *(p+x_curs_src[P]) = 
227                                *(buf + t + typesize*(
228                                       (i-ims) + (ime-ims+1)*(
229                                       (k-kms) + (j-jms)*(kme-kms+1))) ) ;
230                 x_curs_src[P]++ ;
231               }
232             }
233           }
234 	}
235       } else {
236         ii = (inout)?(i/m)+(i%m)*n:(i/n)+(i%n)*m  ;
237         TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
238         coords[1] = Px ; coords[0] = Py ;
239         MPI_Cart_rank( *comm, coords, &P ) ;
240         p = buffer_for_proc( P , 0 , da_buf ) ;
241 	if ( typesize == sizeof(int) ) {
242           for ( j = jps ; j <= jpe ; j++ ) {
243             for ( k = kps ; k <= kpe ; k++ ) {
244 	      pi = (int *)(p+x_curs_dst[P]) ;
245 	      qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
246                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
247 	      *qi++ = *pi++ ;
248 	      x_curs_dst[P] += typesize ;
249 	    }
250 	  }
251 	}
252 	else {
253           for ( j = jps ; j <= jpe ; j++ ) {
254             for ( k = kps ; k <= kpe ; k++ ) {
255               for ( t = 0 ; t < typesize ; t++ ) {
256                                *(buf + t + typesize*(
257                                       (i-ims) + (ime-ims+1)*(
258                                       (k-kms) + (j-jms)*(kme-kms+1))) ) =
259                 *(p+x_curs_dst[P]) ;
260                 x_curs_dst[P]++ ;
261               }
262             }
263           }
264         }
265       }
266     }
267   } else if ( np_y > 1 && xy == 0 ) {
268     for ( j = jps ; j <= MIN(jpe,m*np_dim-1) ; j++ ) {
269       if ( pu == 0 ) {
270         jj = (inout)?(j/n) + (j%n)*m:(j/m) + (j%m)*n ;
271         TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
272         coords[1] = Px ; coords[0] = Py ;
273         MPI_Cart_rank( *comm, coords, &P ) ;
274         p = buffer_for_proc( P , 0 , da_buf ) ;
275 	if ( typesize == sizeof(int) ) {
276           for ( i = ips ; i <= ipe ; i++ ) {
277             for ( k = kps ; k <= kpe ; k++ ) {
278 	      pi = (int *)(p+x_curs_src[P]) ;
279 	      qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
280                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
281 	      *pi++ = *qi++ ;
282 	      x_curs_src[P] += typesize ;
283 	    }
284 	  }
285 	}
286 	else {
287           for ( i = ips ; i <= ipe ; i++ ) {
288             for ( k = kps ; k <= kpe ; k++ ) {
289               for ( t = 0 ; t < typesize ; t++ ) {
290                 *(p+x_curs_src[P]) = 
291                                *(buf + t + typesize*(
292                                       (i-ims) + (ime-ims+1)*(
293                                       (k-kms) + (j-jms)*(kme-kms+1))) ) ;
294                 x_curs_src[P]++ ;
295               }
296             }
297           }
298 	}
299       } else {
300         jj = (inout)?(j/m) + (j%m)*n:(j/n) + (j%n)*m ;
301         TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py ) ;
302         coords[1] = Px ; coords[0] = Py ;
303         MPI_Cart_rank( *comm, coords, &P ) ;
304         p = buffer_for_proc( P , 0 , da_buf ) ;
305 	if ( typesize == sizeof(int) ) {
306           for ( i = ips ; i <= ipe ; i++ ) {
307             for ( k = kps ; k <= kpe ; k++ ) {
308 	      pi = (int *)(p+x_curs_dst[P]) ;
309 	      qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
310                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
311 	      *qi++ = *pi++ ;
312 	      x_curs_dst[P] += typesize ;
313 	    }
314 	  }
315 	}
316 	else {
317           for ( i = ips ; i <= ipe ; i++ ) {
318             for ( k = kps ; k <= kpe ; k++ ) {
319               for ( t = 0 ; t < typesize ; t++ ) {
320                                *(buf + t + typesize*(
321                                       (i-ims) + (ime-ims+1)*(
322                                       (k-kms) + (j-jms)*(kme-kms+1))) ) =
323                 *(p+x_curs_dst[P]) ;
324                 x_curs_dst[P]++ ;
325               }
326             }
327           }
328         }
329       }
330     }
331   }
332 #endif
333 }
334 
335 RSL_LITE_CYCLE ( int * Fcomm0, int *me0, int * np0 , int * np_x0 , int * np_y0 )
336 {
337   int me, np, np_x, np_y ;
338   int yp, ym, xp, xm, nb ;
339   MPI_Status stat ;
340 #ifndef STUBMPI
341   MPI_Comm comm, *comm0, dummy_comm ;
342   int i, P ;
343 
344   comm0 = &dummy_comm ;
345   *comm0 = MPI_Comm_f2c( *Fcomm0 ) ;
346 
347   comm = *comm0 ; me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
348 
349   for ( P = 0 ; P < np ; P++ ) {
350       nb = buffer_size_for_proc( P, RSL_RECVBUF ) ;
351       MPI_Irecv ( buffer_for_proc( P, 0, RSL_RECVBUF ), nb, MPI_CHAR, P, me, comm, &(x_recv[P]) ) ;
352       MPI_Isend ( buffer_for_proc( P, 0, RSL_SENDBUF ), x_curs_src[P], MPI_CHAR, P, P, comm, &(x_send[P]) ) ;
353   }
354   for ( P = 0 ; P < np ; P++ ) {
355       MPI_Wait( &x_recv[P], &stat ) ; 
356       MPI_Wait( &x_send[P], &stat ) ; 
357   }
358   for ( i = 0 ; i < np ; i++ ) {  x_curs_src[i] = 0 ; x_curs_dst[i] ; }
359 #endif
360 }
361