// wsm5_gpu.cu gets preprocessed by spt.pl, which handles the _def_ directives before it is compiled

#ifndef PREPASS
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas.h>
#endif

// Debug coordinates derived from build-supplied values: the i and j values
// are lowered by two, the k value is used unchanged.
// NOTE(review): where the underlying symbols come from and why the offset is
// two are not visible in this file; confirm against the debug macro file.
#define IDEBUG ((DEBUG_I)-2)
#define JDEBUG ((DEBUG_J)-2)
#define KDEBUG (DEBUG_K)

// NOTE(review): presumably the number of vertical levels used to size the
// per-column scratch storage; it is not referenced in the visible code.
#define MKX 28

// Function-like maximum and minimum. Each argument may be evaluated twice,
// so arguments must be free of side effects.
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))

// this is an M4 include
include(debug.m4)

#include "spt.h"

// Kernel entry point for the WSM5 microphysics step on the GPU.
//
// This text passes through the spt.pl and M4 preprocessors before it reaches
// the CUDA compiler, so the directive comments and the M4 line inside the
// kernel are functional and must be kept exactly as written.
//
// Every thread whose (ig, jg) lies inside the domain extents
// (ide-ids+1 by jde-jds+1) loops over k = kps-1 .. kpe-1 and performs:
//   1. t = th * pii
//   2. negative qc, qi, qr and qs values are clamped to zero
//   3. cpm and xl are computed from q and t
//   4. the time step delt is split into loops sub-steps of length dtcld and
//      the pulled-in body from body_inline.h is executed once per sub-step
//   5. th = t / pii is written back
//
// NOTE(review): ig and jg, the per-k scratch arrays such as t, cpm and xl,
// and the single-subscript addressing of the field arguments are not
// declared in the visible source; presumably spt.pl and spt.h generate them
// from the directive comments. Confirm against the preprocessor output.
//
// retvals[0] is a debugging side channel written only by the thread with
// ig == 0 and jg == 0: first the sentinel 99, then the sub-step count.
__global__ void wsm5_gpu ( 
                    float *th, float *pii                   //_def_ arg ikj:th,pii
                   ,float *q                                //_def_ arg ikj:q
                   ,float *qc,float *qi,float *qr,float *qs //_def_ arg ikj:qc,qi,qr,qs
                   ,float *den, float *p, float *delz       //_def_ arg ikj:den,p,delz
                   ,float *rain,float *rainncv              //_def_ arg ij:rain,rainncv
                   ,float *sr                               //_def_ arg ij:sr
                   ,float *snow,float *snowncv              //_def_ arg ij:snow,snowncv
                   ,float delt
,float* retvals
                   ,int ids, int ide,  int jds, int jde,  int kds, int kde          
                   ,int ims, int ime,  int jms, int jme,  int kms, int kme          
                   ,int ips, int ipe,  int jps, int jpe,  int kps, int kpe          
                         )
{

   // Scalar work variables. They are not referenced in the visible code;
   // presumably the pulled-in body uses them.
   float xlf, xmi, vt2i, vt2s, diameter, acrfac, supice ;
   float roqi0, xni0, qimax, value, source, factor, xlwork2 ;

   // Alternate spellings for constants. xls, xlv0 and cpv are presumably
   // provided by wsm5_constants.h.
#define hsub   xls
#define hvap   xlv0
#define cvap   cpv
     float ttp ;
     float dldt ;
     float xa ;
     float xb ;
     float dldti ;
     float xai ;
     float xbi ;

     //_def_ local k:qs1,qs2,rh1,rh2


// Guard: threads outside the horizontal domain extents do nothing.
if ( ig < ide-ids+1 && jg < jde-jds+1 ) {

   int k ;

   // Block-shared scratch buffer. It is not referenced directly in the
   // visible code; presumably the generated copy code uses it together
   // with isize.
   __shared__ float sm[SM_SIZE] ;

   int isize = 0 ;

#include "wsm5_constants.h"

   //_def_ local k:t
   //_def_ local k:prevp,psdep,praut,psaut,pracw,psaci,psacw,pigen,pidep,pcond,psmlt,psevp
   //_def_ local k:fall1,fall2,falk1,falk2,fallc,falkc,xni
   //_def_ local k:rsloper,rslopebr,rslope2r,rslope3r
   //_def_ local k:rslopes,rslopebs,rslope2s,rslope3s
   //_def_ local k:denfac
   //_def_ local k:n0sfac
   //_def_ local k:w1,w2,w3


   //_def_ copy_up_mem ikj:qs,qr,qi,qc

   // Convert th to t by multiplying with pii at every level.
   for ( k = kps-1 ; k <= kpe-1 ; k++ ) {
     t[k] = th[k] * pii[k] ;
   }

#if 1
   // Clamp negative hydrometeor values to zero.
   for( k=kps-1 ;k<=kpe-1;k++) { if ( qc[k] < 0. ) { qc[k] = 0. ; } 
                                 if ( qi[k] < 0. ) { qi[k] = 0. ; } 
                                 if ( qr[k] < 0. ) { qr[k] = 0. ; } 
                                 if ( qs[k] < 0. ) { qs[k] = 0. ; } }

// 564 !----------------------------------------------------------------
// 565 !     latent heat for phase changes and heat capacity. neglect the
// 566 !     changes during microphysical process calculation
// 567 !     emanuel(1994)

   // The unsuffixed literal 1. is a double, so CPMCAL is evaluated in double
   // precision before the result is stored to a float.
#define CPMCAL(x) (cpd*(1.-MAX(x,qmin))+MAX(x,qmin)*cpv)
#define XLCAL(x)  (xlv0-xlv1*((x)-t0c))

   // Debug sentinel, written by a single thread only.
   if (ig==0&&jg==0) {
retvals[0] = 99. ; 
#if defined(DEVICEEMU) && defined(DEBUGOUTPUT)
fprintf(stderr,"setting retvals %f\n",retvals[0]);
#endif
}

   //_def_ local k:cpm,xl
   for ( k = kps-1 ; k <= kpe-1 ; k++ ) {
DIAGOUTPUT1(q)
     cpm[k] = CPMCAL(q[k]) ;
     xl[k] = XLCAL(t[k]) ;
   }

// 576 !----------------------------------------------------------------
// 577 !     compute the minor time steps.

   // Sub-steps are at most about dtcldcr seconds long: loops is delt/dtcldcr
   // rounded to nearest and never less than one.
   float dtcldcr = 120. ;
   int loops = delt/dtcldcr+.5 ;

   loops = MAX(loops,1) ;
   float dtcld = delt/loops ;
   if ( delt <= dtcldcr) dtcld = delt ;

   // Declared here for the pulled-in body; only loop is used in the
   // visible code.
   int loop ;
   int mstep ;
   int numdt, n ;
   float tr, ltr, tt, pp, qq ;
   float w, rmstep ; 
   float dtcldden, rdelz, coeres ;
   float fallsum, fallsum_qsi, rdtcld ;

   // Run the inlined body once per minor time step. The trip count has to be
   // loops because dtcld was computed above as delt/loops; a fixed count
   // would integrate over a total time different from delt.
   for ( loop = 1 ; loop <= loops ; loop++ ) {

if (ig==0&&jg==0) { retvals[0] = loops ; }

include(body_inline.h)

   }

   //_def_ copy_down_mem ikj:qs,qr,qi,qc
#endif
   // Convert the updated t back to th.
   for ( k = kps-1 ; k <= kpe-1 ; k++ ) {
     th[k] = t[k] / pii[k] ;
   }
 } // guard 
}


