#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas.h>

/*
 * Compile-time constants.
 * NOTE(review): IDEBUG, JDEBUG and MKX are not referenced anywhere else in
 * the visible part of this file.  Presumably IDEBUG/JDEBUG select a debug
 * column and MKX is a vertical level count -- confirm before relying on them.
 */
#define IDEBUG 12
#define JDEBUG 0

#define MKX 28

/*
 * Select the external symbol names of the two entry points defined below.
 *
 *   CRAY defined          : no mapping; the functions keep the literal
 *                           upper-case names WSM5_HOST / WSM5_INIT
 *   NOUNDERSCORE defined  : wsm5_host   / wsm5_init
 *   F2CSTYLE defined      : wsm5_host__ / wsm5_init__   (two underscores)
 *   otherwise             : wsm5_host_  / wsm5_init_    (one underscore)
 *
 * NOTE(review): these look like Fortran compiler name-mangling conventions,
 * i.e. the entry points are presumably called from Fortran -- confirm
 * against the build configuration.
 */
#ifndef CRAY
# ifdef NOUNDERSCORE
#      define WSM5_HOST wsm5_host
#      define WSM5_INIT wsm5_init
# else
#   ifdef F2CSTYLE
#      define WSM5_HOST wsm5_host__
#      define WSM5_INIT wsm5_init__
#   else
#      define WSM5_HOST wsm5_host_
#      define WSM5_INIT wsm5_init_
#   endif
# endif
#endif


/*
 * Timed host<->device helpers.  All three are multi-statement macros that
 * expect int variables s1 and e1 to exist in the calling scope and must be
 * used as a plain statement (never as the unbraced body of an if/for).
 *
 *   TODEV(A,s)   declares float *A_d in the calling scope, allocates s floats
 *                on the device and copies the host array A into it.
 *   FROMDEV(A,s) copies s floats from A_d back into the host array A.
 *   CLNUP(A)     frees A_d.
 *
 * Each macro prints its elapsed rsl_internal_microclock_() interval to
 * stderr.  A CUDA call that fails is reported on stderr as well; A_d starts
 * out as 0 so that a failed allocation never leaves an indeterminate pointer
 * behind for the later copy/launch/free calls.
 */
#define TODEV(A,s) \
      s1 = rsl_internal_microclock_() ; \
      float *A##_d = 0 ; \
      if ( cudaMalloc((void**)&A##_d,(s)*sizeof(float)) != cudaSuccess || \
           cudaMemcpy(A##_d,A,(s)*sizeof(float),cudaMemcpyHostToDevice) != cudaSuccess ) { \
         fprintf(stderr,"TODEV(%s) failed: %s\n",#A,cudaGetErrorString(cudaGetLastError())) ; \
      } \
      e1 = rsl_internal_microclock_() ; \
      fprintf(stderr,"TODEV %d\n",e1-s1)

#define FROMDEV(A,s) \
      s1 = rsl_internal_microclock_() ; \
      if ( cudaMemcpy(A,A##_d,(s)*sizeof(float),cudaMemcpyDeviceToHost) != cudaSuccess ) { \
         fprintf(stderr,"FROMDEV(%s) failed: %s\n",#A,cudaGetErrorString(cudaGetLastError())) ; \
      } \
      e1 = rsl_internal_microclock_() ; \
      fprintf(stderr,"FROMDEV %d\n",e1-s1)

#define CLNUP(A) \
      s1 = rsl_internal_microclock_() ; \
      if ( cudaFree(A##_d) != cudaSuccess ) { \
         fprintf(stderr,"CLNUP(%s) failed: %s\n",#A,cudaGetErrorString(cudaGetLastError())) ; \
      } \
      e1 = rsl_internal_microclock_() ; \
      fprintf(stderr,"Free %d\n",e1-s1)

/*
 * External timer, defined outside this file.  It is called with no arguments
 * throughout this file and differences of its return values are printed as
 * microseconds.
 */
extern int rsl_internal_microclock_() ;


/*
 * Prototype of the device kernel launched by WSM5_HOST.  Its definition is
 * not visible in this file.
 *
 * The pointer arguments receive the device buffers that WSM5_HOST allocates;
 * delt and all index bounds are passed by value.  The "#if 0" variant is a
 * disabled single-argument form of the same kernel.
 *
 * NOTE(review): the trailing "//_def_ arg ..." comments look like
 * machine-readable annotations for an external tool -- they are left
 * untouched; confirm before editing them.
 */
#if 0
extern __global__ void wsm5_gpu ( float* retvals ) ;
#endif
extern __global__ void wsm5_gpu (
                    float *th, float *pii                   //_def_ arg ikj:th,pii
                   ,float *q                                //_def_ arg ikj:q
                   ,float *qc,float *qi,float *qr,float *qs //_def_ arg ikj:qc,qi,qr,qs
                   ,float *den, float *p, float *delz       //_def_ arg ikj:den,p,delz
                   ,float *rain,float *rainncv              //_def_ arg ij:rain,rainncv
                   ,float *sr                               //_def_ arg ij:sr
                   ,float *snow,float *snowncv              //_def_ arg ij:snow,snowncv
                   ,float delt
,float* retvals
                   ,int ids, int ide,  int jds, int jde,  int kds, int kde
                   ,int ims, int ime,  int jms, int jme,  int kms, int kme
                   ,int ips, int ipe,  int jps, int jpe,  int kps, int kpe
                         ) ;

/*
 * WSM5_INIT -- allocate one float on the device, copy one float into it,
 * free it again, and print the elapsed rsl_internal_microclock_() interval
 * to stderr as "wsm5_init: <n>".
 * (Presumably this exists to pay the CUDA start-up cost before any timed
 * work -- confirm with the caller.)
 *
 * Returns 0 on success and 1 if any of the CUDA calls failed; the failure is
 * also reported on stderr.
 */
int
WSM5_INIT ( )
{
   float x = 0.0f ;          /* defined value: never copy indeterminate host memory */
   float *x_d = 0 ;
   int s, e ;
   cudaError_t err, ferr ;

   s = rsl_internal_microclock_() ;
   err = cudaMalloc((void **)&x_d,sizeof(float)) ;
   if ( err == cudaSuccess ) {
      err = cudaMemcpy(x_d,&x,sizeof(float),cudaMemcpyHostToDevice) ;
   } else {
      x_d = 0 ;              /* make sure the cudaFree below sees a null pointer */
   }
   ferr = cudaFree(x_d) ;    /* cudaFree(0) performs no operation */
   if ( err == cudaSuccess ) {
      err = ferr ;
   }
   e = rsl_internal_microclock_() ;
   fprintf(stderr,"wsm5_init: %d\n",e-s) ;

   if ( err != cudaSuccess ) {
      fprintf(stderr,"wsm5_init: CUDA error: %s\n",cudaGetErrorString(err)) ;
      return(1) ;
   }
   return(0) ;
}

/*
 * Allocate a device buffer of n floats, copy n floats from host into it and
 * store the device pointer in *dev (0 if the allocation failed).
 * Prints the elapsed time as "TODEV <n>".  Returns 0 on success, 1 on error.
 */
static int
wsm5h_upload ( float **dev, const float *host, size_t n )
{
      int t0, t1 ;
      cudaError_t err ;

      *dev = 0 ;
      t0 = rsl_internal_microclock_() ;
      err = cudaMalloc((void **)dev, n*sizeof(float)) ;
      if ( err == cudaSuccess ) {
         err = cudaMemcpy(*dev, host, n*sizeof(float), cudaMemcpyHostToDevice) ;
      } else {
         *dev = 0 ;
      }
      t1 = rsl_internal_microclock_() ;
      fprintf(stderr,"TODEV %d\n",t1-t0) ;

      if ( err != cudaSuccess ) {
         fprintf(stderr,"wsm5_host: host-to-device transfer failed: %s\n",cudaGetErrorString(err)) ;
         return 1 ;
      }
      return 0 ;
}

/*
 * Copy n floats from the device buffer dev back into host.
 * Prints the elapsed time as "FROMDEV <n>".  Returns 0 on success, 1 on error.
 */
static int
wsm5h_download ( float *host, const float *dev, size_t n )
{
      int t0, t1 ;
      cudaError_t err ;

      t0 = rsl_internal_microclock_() ;
      err = cudaMemcpy(host, dev, n*sizeof(float), cudaMemcpyDeviceToHost) ;
      t1 = rsl_internal_microclock_() ;
      fprintf(stderr,"FROMDEV %d\n",t1-t0) ;

      if ( err != cudaSuccess ) {
         fprintf(stderr,"wsm5_host: device-to-host transfer failed: %s\n",cudaGetErrorString(err)) ;
         return 1 ;
      }
      return 0 ;
}

/*
 * Free a device buffer (a null pointer is accepted) and print the elapsed
 * time as "Free <n>".
 */
static void
wsm5h_release ( float *dev )
{
      int t0, t1 ;
      cudaError_t err ;

      t0 = rsl_internal_microclock_() ;
      err = cudaFree(dev) ;
      t1 = rsl_internal_microclock_() ;
      fprintf(stderr,"Free %d\n",t1-t0) ;

      if ( err != cudaSuccess ) {
         fprintf(stderr,"wsm5_host: cudaFree failed: %s\n",cudaGetErrorString(err)) ;
      }
}

/*
 * WSM5_HOST -- host driver for the wsm5_gpu kernel.
 *
 * Copies the input arrays to the device, launches wsm5_gpu with 8x4 thread
 * blocks covering the patch (ips..ipe, jps..jpe), copies the results back
 * and frees all device memory.  Timing and diagnostic lines are written to
 * stderr.
 *
 * Arguments (every scalar is passed by pointer):
 *   th, pii, q, qc, qi, qr, qs   arrays of (ime-ims+1)*(jme-jms+1)*(kme-kms+1)
 *                                floats; uploaded and copied back
 *   den, p, delz                 arrays of the same size; uploaded only,
 *                                never copied back
 *   delt                         pointer to a single float, passed to the
 *                                kernel by value
 *   rain, rainncv, sr,
 *   snow, snowncv                arrays of (ime-ims+1)*(jme-jms+1) floats;
 *                                uploaded and copied back
 *   ids..kde, ims..kme, ips..kpe index bounds, forwarded to the kernel
 *
 * Returns 0 on success.  Returns 1, after reporting on stderr, if the memory
 * dimensions are empty, a host allocation fails, or any CUDA call or the
 * kernel fails; on a failure before the copy-back stage the host arrays are
 * left unmodified.
 */
int
WSM5_HOST (
                    float *th, float *pii
                   ,float *q
                   ,float *qc, float *qi, float *qr, float *qs
                   ,float *den, float *p, float *delz
                   ,float *delt
                   ,float *rain,float *rainncv
                   ,float *sr
                   ,float *snow,float *snowncv
                   ,int *ids, int *ide,  int *jds, int *jde,  int *kds, int *kde
                   ,int *ims, int *ime,  int *jms, int *jme,  int *kms, int *kme
                   ,int *ips, int *ipe,  int *jps, int *jpe,  int *kps, int *kpe
          )
{
      int s, e, s2, e2 ;
      int k ;
      int rc = 0 ;
      int nx = *ime-*ims+1 ;
      int ny = *jme-*jms+1 ;
      int nz = *kme-*kms+1 ;
      int d3 = nx * ny * nz ;
      int d2 = nx * ny ;
      int nret ;
      float *retvals = 0 ;
      float *th_d = 0, *pii_d = 0, *q_d = 0 ;
      float *qc_d = 0, *qi_d = 0, *qr_d = 0, *qs_d = 0 ;
      float *den_d = 0, *p_d = 0, *delz_d = 0 ;
      float *rain_d = 0, *rainncv_d = 0, *sr_d = 0 ;
      float *snow_d = 0, *snowncv_d = 0 ;
      float *retvals_d = 0 ;
      cudaError_t err ;

      fprintf(stderr,"ims %d ime %d jms %d jme %d kms %d kme %d\n",*ims,*ime,*jms,*jme,*kms,*kme) ;
      fprintf(stderr,"d3 %d \n",d3 ) ;
      fprintf(stderr,"d2 %d \n",d2 ) ;

      if ( nx <= 0 || ny <= 0 || nz <= 0 ) {
         fprintf(stderr,"wsm5_host: empty memory dimensions, nothing done\n") ;
         return(1) ;
      }

      /* One debug value per level.  Sized from the actual level count (a   */
      /* fixed 100-element array overflowed for taller columns) and never   */
      /* smaller than the 5 entries that are printed at the end.            */
      nret = nz > 5 ? nz : 5 ;
      retvals = (float *) malloc((size_t)nret * sizeof(float)) ;
      if ( retvals == 0 ) {
         fprintf(stderr,"wsm5_host: cannot allocate %d retvals\n",nret) ;
         return(1) ;
      }
      for ( k = 0 ; k < nret ; k++ ) { retvals[k] = 0.0f ; }

      s = rsl_internal_microclock_() ;

      rc |= wsm5h_upload(&th_d,      th,      d3) ;
      rc |= wsm5h_upload(&pii_d,     pii,     d3) ;
      rc |= wsm5h_upload(&q_d,       q,       d3) ;
      rc |= wsm5h_upload(&qc_d,      qc,      d3) ;
      rc |= wsm5h_upload(&qi_d,      qi,      d3) ;
      rc |= wsm5h_upload(&qr_d,      qr,      d3) ;
      rc |= wsm5h_upload(&qs_d,      qs,      d3) ;
      rc |= wsm5h_upload(&den_d,     den,     d3) ;
      rc |= wsm5h_upload(&p_d,       p,       d3) ;
      rc |= wsm5h_upload(&delz_d,    delz,    d3) ;
      rc |= wsm5h_upload(&rain_d,    rain,    d2) ;
      rc |= wsm5h_upload(&rainncv_d, rainncv, d2) ;
      rc |= wsm5h_upload(&sr_d,      sr,      d2) ;
      rc |= wsm5h_upload(&snow_d,    snow,    d2) ;
      rc |= wsm5h_upload(&snowncv_d, snowncv, d2) ;
      rc |= wsm5h_upload(&retvals_d, retvals, nz) ;

      if ( rc == 0 ) {
         /* one extra block in a direction whose patch extent is not a */
         /* multiple of the block edge                                 */
         int remx = (*ipe-*ips+1) % 8 != 0 ? 1 : 0 ;
         int remy = (*jpe-*jps+1) % 4 != 0 ? 1 : 0 ;

         dim3 dimBlock( 8 , 4 ) ;
         fprintf(stderr,"ipe ips remx jpe jps remy %d %d %d %d %d %d\n",*ipe,*ips,remx,*jpe,*jps,remy) ;
         dim3 dimGrid ( (*ipe-*ips+1) / 8 + remx , (*jpe-*jps+1) / 4 + remy ) ;

         /* dim3 members are unsigned, hence %u */
         fprintf(stderr,"Call to wsm5_gpu: block dims %u %u\n",dimBlock.x,dimBlock.y) ;
         fprintf(stderr,"Call to wsm5_gpu: grid  dims %u %u\n",dimGrid.x,dimGrid.y) ;

         s2 = rsl_internal_microclock_() ;
         wsm5_gpu <<< dimGrid, dimBlock >>> (
                    th_d, pii_d, q_d, qc_d, qi_d, qr_d, qs_d, den_d, p_d, delz_d
                   ,rain_d,rainncv_d
                   ,sr_d
                   ,snow_d,snowncv_d
                   ,*delt
                   ,retvals_d
                   ,*ids, *ide,  *jds, *jde,  *kds, *kde
                   ,*ims, *ime,  *jms, *jme,  *kms, *kme
                   ,*ips, *ipe,  *jps, *jpe,  *kps, *kpe
                         ) ;
         /* A bad launch configuration is only visible through             */
         /* cudaGetLastError(); faults during execution come back from the */
         /* synchronizing call.  cudaThreadSynchronize() is kept as in the */
         /* original; newer toolkits name it cudaDeviceSynchronize().      */
         err = cudaGetLastError() ;
         if ( err == cudaSuccess ) {
            err = cudaThreadSynchronize() ;
         }
         e2 = rsl_internal_microclock_() ;
         fprintf(stderr,"Call to wsm5_gpu (not including data xfer): %d microseconds\n",e2-s2) ;

         if ( err != cudaSuccess ) {
            fprintf(stderr,"wsm5_host: wsm5_gpu kernel failed: %s\n",cudaGetErrorString(err)) ;
            rc = 1 ;
         }
      }

      if ( rc == 0 ) {
         /* den, p and delz are deliberately not copied back */
         rc |= wsm5h_download(th,      th_d,      d3) ;
         rc |= wsm5h_download(pii,     pii_d,     d3) ;
         rc |= wsm5h_download(q,       q_d,       d3) ;
         rc |= wsm5h_download(qc,      qc_d,      d3) ;
         rc |= wsm5h_download(qi,      qi_d,      d3) ;
         rc |= wsm5h_download(qr,      qr_d,      d3) ;
         rc |= wsm5h_download(qs,      qs_d,      d3) ;
         rc |= wsm5h_download(rain,    rain_d,    d2) ;
         rc |= wsm5h_download(rainncv, rainncv_d, d2) ;
         rc |= wsm5h_download(sr,      sr_d,      d2) ;
         rc |= wsm5h_download(snow,    snow_d,    d2) ;
         rc |= wsm5h_download(snowncv, snowncv_d, d2) ;
         /* the reported total excludes the retvals debug transfer below */
         e = rsl_internal_microclock_() ;
         fprintf(stderr,"retrieving retvals %d\n",nz) ;
         rc |= wsm5h_download(retvals, retvals_d, nz) ;
         fprintf(stderr,"Call to wsm5_gpu (including data xfer): %d microseconds\n",e-s) ;

         /* entries at index >= nz were never sent to the device and stay 0 */
         for ( k = 0 ; k < 5 ; k++ ) { fprintf(stderr,"retvals %d %f\n",k,retvals[k]) ; }
      }

      wsm5h_release(th_d) ;
      wsm5h_release(pii_d) ;
      wsm5h_release(q_d) ;
      wsm5h_release(qc_d) ;
      wsm5h_release(qi_d) ;
      wsm5h_release(qr_d) ;
      wsm5h_release(qs_d) ;
      wsm5h_release(den_d) ;
      wsm5h_release(p_d) ;
      wsm5h_release(delz_d) ;
      wsm5h_release(rain_d) ;
      wsm5h_release(rainncv_d) ;
      wsm5h_release(sr_d) ;
      wsm5h_release(snow_d) ;
      wsm5h_release(snowncv_d) ;
      wsm5h_release(retvals_d) ;
      free(retvals) ;

      return(rc) ;
}

/*
 * Disabled stand-alone driver (compiled out by "#if 0").
 * NOTE(review): as written it cannot simply be re-enabled:
 *   - every pointer below is declared but never set before being passed on;
 *   - the call passes delt after snowncv, whereas the WSM5_HOST signature
 *     expects delt between delz and rain.  All of these are float*, so the
 *     compiler would not flag the mismatch.
 */
#if 0
main( int argc, char **argv ) 
{
                   float *th ; float *pii ; float *q ;
                   float *qc; float *qi; float *qr; float *qs ;
                   float *den; float *p; float *delz ;
                   float *delt ;
                   float *rain;float *rainncv ;
                   float *sr ;
                   float *snow;float *snowncv ;
                   int *ids; int *ide;  int *jds; int *jde;  int *kds; int *kde ;
                   int *ims; int *ime;  int *jms; int *jme;  int *kms; int *kme ;
                   int *ips; int *ipe;  int *jps; int *jpe;  int *kps; int *kpe     ;
     WSM5_HOST (
                    th, pii, q, qc, qi, qr, qs, den, p, delz
                   ,rain,rainncv
                   ,sr
                   ,snow,snowncv
                   ,delt
                   ,ids, ide,  jds, jde,  kds, kde
                   ,ims, ime,  jms, jme,  kms, kme
                   ,ips, ipe,  jps, jpe,  kps, kpe
                         )  ;
}
#endif
