/*
 * rk_scalar_tend_part1 -- advective fluxes of a scalar for one (ti,tj) column.
 *
 * For the column addressed by the externally defined indices ti/tj (ig/jg are
 * the matching indices that get compared against the domain bounds), the
 * kernel fills
 *
 *   fqx , fqy , fqz    velocity * interpolated scalar, minus the upwind flux
 *   fqxl, fqyl, fqzl   the upwind flux itself (touched only when pd == 1)
 *
 * All six arrays are addressed through P3IJK(ti,k,tj).  Input fields are read
 * through TexRef2D/TexRef3D: mut, msftx, msfty (2D) and ru, rv, ww, scalar,
 * scalar_old (3D).
 *
 * Parameters
 *   fqx, fqy, fqz     out  flux arrays, zeroed first for levels 0..kde-kds
 *   fqxl, fqyl, fqzl  out  upwind flux arrays, written only when pd == 1
 *   rdx, rdy          in   combined with the map-factor averages to form
 *                          dx and dy (presumably inverse grid spacing --
 *                          TODO confirm)
 *   dt                in   time step used in the Courant number vel*dt/d/mu
 *   rdzw              in   read at [k] and [k-1] to form dz (pd == 1 only)
 *   fzm, fzp          in   weights of the two-point vertical interpolation
 *                          used at the levels next to the column ends
 *   im                in   not referenced in this kernel body
 *   pd                in   1 selects the upwind-flux path; any other value
 *                          leaves the upwind flux at zero
 *   ids..kde          in   domain bounds; select which columns/faces are
 *                          computed and the number of levels (kde-kds+1)
 *   ims..kme, ips..jpe     not referenced directly here; macros such as
 *                          P3IJK may expand to them -- do not remove
 *   kps, kpe          in   vertical range of the z-flux section
 *
 * NOTE(review): the horizontal sections index levels 0..kde-kds whereas the
 * vertical section indexes kps..kpe.  These describe the same levels only if
 * kps is zero based -- confirm against the launch code.
 *
 * NOTE(review): the unsuffixed literals (0.5, 2., 7./12., ...) are double
 * precision, so parts of the arithmetic are evaluated in double.  They are
 * left untouched so that rounding does not change.
 *
 * Behaviour change versus the previous revision: the vertical fluxes are now
 * produced by one loop over kps+1..kpf.  For kpf >= kps+2 the results are
 * identical.  For shallower columns the old straight-line code also wrote
 * levels outside kps+1..kpf (overwriting the zero flux stored at kps/kpe);
 * those writes no longer happen.
 */
__global__ void rk_scalar_tend_part1 (
                    float * fqx, float * fqy, float * fqz
                   ,float * fqxl, float * fqyl, float *fqzl
                   ,float rdx, float rdy, float dt
                   ,float * rdzw, float * fzm, float * fzp
                   ,int im, int pd
                   ,int ids, int ide,  int jds, int jde,  int kds, int kde
                   ,int ims, int ime,  int jms, int jme,  int kms, int kme
                   ,int ips, int ipe,  int jps, int jpe,  int kps, int kpe
                         )
{
 int k ;
 int i3 ;
 float mu ;
 float vel ;
 float cr ;
 float dz ;
 float upwind = 0.;

// Interpolation of q to a cell face from neighbouring values.  FLUX4/FLUX6 are
// the symmetric forms; FLUX3/FLUX5 add a term whose sign follows SIGN(1.,ua).
// The macro bodies are kept exactly as before.
#define FLUX4(q_im2,q_im1,q_i,q_ip1,ua) ((7./12.)*((q_i)+(q_im1))-(1./12.)*((q_ip1)+(q_im2)))
#define FLUX3(q_im2,q_im1,q_i,q_ip1,ua) (FLUX4((q_im2),(q_im1),(q_i),(q_ip1),(ua))+\
                                         SIGN(1.,(ua))*(1./12.)*(((q_ip1)-(q_im2))-3.*((q_i)-(q_im1))))
#define FLUX6(q_im3,q_im2,q_im1,q_i,q_ip1,q_ip2,ua) \
            ((37./60.)*((q_i)+(q_im1))-(2./15.)*((q_ip1)+(q_im2))+(1./60.)*((q_ip2)+(q_im3)))
#define FLUX5(q_im3,q_im2,q_im1,q_i,q_ip1,q_ip2,ua) \
           (FLUX6(q_im3,q_im2,q_im1,q_i,q_ip1,q_ip2,ua)- \
            SIGN(1.,(ua))*(1./60.)*(((q_ip2)-(q_im3))-5.*((q_ip1)-(q_im2))+10.*((q_i)-(q_im1))))
// Upwind value: weights q_im1 when cr > 0 and q_i when cr < 0, with the weight
// magnitude capped at 0.5.
#define FLUX_UPWIND(q_im1,q_i,cr) (0.5*min(1.0,((cr)+abs(cr)))*(q_im1)+0.5*max(-1.0,((cr)-abs(cr)))*(q_i))

 // Columns outside [ids,ide-1] x [jds,jde-1] have nothing to compute.
 if ( !( ig >= ids && ig <= ide-1 && jg >= jds && jg <= jde-1 ) ) return ;

 // Distances built from the averaged map factors of the two cells sharing a face.
 const float dx = 2./((TexRef2D(msfty,ti,tj)+TexRef2D(msfty,ti-1,tj))*rdx) ;
 const float dy = 2./((TexRef2D(msftx,ti,tj)+TexRef2D(msftx,ti,tj-1))*rdy) ;

 const int nlev = kde-kds+1 ;

 // Start from zero everywhere so that faces skipped below keep a zero flux.
 for ( k = 0 ; k < nlev ; k++ ) {
   i3 = P3IJK(ti,k,tj) ; // ijk
   fqx[i3] = 0. ;
   fqy[i3] = 0. ;
   fqz[i3] = 0. ;
   if ( pd == 1 ) {
     fqxl[i3] = 0. ;
     fqyl[i3] = 0. ;
     fqzl[i3] = 0. ;
   }
 }

//-------------------- y flux
 if ( jg >= jds+1 && jg <= jde-1 ) {
   // mu on the face between (ti,tj-1) and (ti,tj); does not depend on k.
   mu = 0.5*(TexRef2D(mut,ti,tj)+TexRef2D(mut,ti,tj-1)) ;
   const float mudyodt = mu*(dy/dt) ;

   for ( k = 0 ; k < nlev ; k++ ) {
     i3 = P3IJK(ti,k,tj) ; // ijk
     vel = TexRef3D(rv,ti,tj,k) ;
     upwind = 0. ;
     if ( pd == 1 ) {
       cr = vel*dt/dy/mu ;
       upwind = mudyodt*FLUX_UPWIND(TexRef3D(scalar_old,ti,tj-1,k),TexRef3D(scalar_old,ti,tj,k),cr) ;
       fqyl[i3] = upwind ;
     }
     if ( jg >= jds+3 && jg <= jde-3 ) {
       fqy[i3] = vel*FLUX5( TexRef3D(scalar,ti,tj-3,k) ,
                            TexRef3D(scalar,ti,tj-2,k) ,
                            TexRef3D(scalar,ti,tj-1,k) ,
                            TexRef3D(scalar,ti,tj  ,k) ,
                            TexRef3D(scalar,ti,tj+1,k) ,
                            TexRef3D(scalar,ti,tj+2,k) , vel ) - upwind ;
     } else if ( jg == jds+2 || jg == jde-2 ) {  // 3rd or 4th order flux 2 in from N or S boundary
       fqy[i3] = vel*FLUX3( TexRef3D(scalar,ti,tj-2,k) ,
                            TexRef3D(scalar,ti,tj-1,k) ,
                            TexRef3D(scalar,ti,tj  ,k) ,
                            TexRef3D(scalar,ti,tj+1,k) , vel ) - upwind ;
     } else if ( jg == jds+1 || jg == jde-1 ) {  // 2nd order flux next to N or S boundary
       fqy[i3] = vel*0.5*(TexRef3D(scalar,ti,tj ,k )+TexRef3D(scalar,ti,tj-1,k)) -  upwind ;
     }
   }
 }

//-------------------- x flux
 if ( ig >= ids+1 && ig <= ide-1 ) {
   // mu on the face between (ti-1,tj) and (ti,tj); does not depend on k.
   mu = 0.5*(TexRef2D(mut,ti,tj)+TexRef2D(mut,ti-1,tj)) ;
   const float mudxodt = mu*(dx/dt) ;

   for ( k = 0 ; k < nlev ; k++ ) {
     i3 = P3IJK(ti,k,tj) ; // ijk
     vel = TexRef3D(ru,ti,tj,k) ;
     upwind = 0 ;
     if ( pd == 1 ) {
       cr = vel*dt/dx/mu ;
       upwind = mudxodt*FLUX_UPWIND(TexRef3D(scalar_old,ti-1,tj,k),TexRef3D(scalar_old,ti,tj,k),cr) ;
       fqxl[i3] = upwind ;
     }
     if ( ig >= ids+3 && ig <= ide-3 ) {
       fqx[i3] = vel*FLUX5( TexRef3D(scalar,ti-3,tj,k) ,
                            TexRef3D(scalar,ti-2,tj,k) ,
                            TexRef3D(scalar,ti-1,tj,k) ,
                            TexRef3D(scalar,ti  ,tj,k) ,
                            TexRef3D(scalar,ti+1,tj,k) ,
                            TexRef3D(scalar,ti+2,tj,k) , vel ) - upwind ;
     } else if ( ig == ids+2 || ig == ide-2 ) {  // 3rd or 4th order flux 2 in from E or W boundary
       fqx[i3] = vel*FLUX3( TexRef3D(scalar,ti-2,tj,k) ,
                            TexRef3D(scalar,ti-1,tj,k) ,
                            TexRef3D(scalar,ti  ,tj,k) ,
                            TexRef3D(scalar,ti+1,tj,k) , vel ) - upwind ;
     } else if ( ig == ids+1 || ig == ide-1 ) {  // 2nd order flux next to E or W boundary
       fqx[i3] = vel*0.5*(TexRef3D(scalar,ti,tj  ,k)+TexRef3D(scalar,ti-1,tj,k)) -  upwind ;
     }
   }
 }

//-------------------- vertical advection

 mu = TexRef2D(mut,ti,tj) ;
 const int kpf = kpe - 1 ;

 // No flux through the first and last level of the column.
 fqz[P3IJK(ti,kps,tj)] = 0. ;
 fqz[P3IJK(ti,kpe,tj)] = 0. ;
 if ( pd == 1 ) {
   fqzl[P3IJK(ti,kps,tj)] = 0. ;
   fqzl[P3IJK(ti,kpe,tj)] = 0. ;
 }

 for ( k = kps+1 ; k <= kpf ; k++ ) {
   i3 = P3IJK(ti,k,tj) ; // ijk
   vel = TexRef3D(ww,ti,tj,k) ;
   upwind = 0. ;
   if ( pd == 1 ) {
     dz = 2./(rdzw[k]+rdzw[k-1]) ;
     cr = vel*dt/dz/mu ;
     upwind = mu*(dz/dt)*FLUX_UPWIND(TexRef3D(scalar_old,ti,tj,k-1),TexRef3D(scalar_old,ti,tj,k),cr) ;
     fqzl[i3] = upwind ;
   }

   // The order of these tests reproduces which assignment used to win when
   // the special levels coincide in a shallow column: kpf, then kpf-1,
   // then kps+2, then kps+1.  k is the same for every thread, so the branch
   // is uniform.
   if ( k == kpf ) {
     // two-point interpolation next to the last level
     fqz[i3] = vel*(fzm[k]*TexRef3D(scalar,ti,tj,k)+fzp[k]*TexRef3D(scalar,ti,tj,k-1)) - upwind ;
   } else if ( k == kpf-1 || k == kps+2 ) {
     // FLUX3 two levels in from either end; note the sign argument is -vel
     fqz[i3] = vel*FLUX3( TexRef3D(scalar,ti,tj,k-2) ,
                          TexRef3D(scalar,ti,tj,k-1) ,
                          TexRef3D(scalar,ti,tj,k  ) ,
                          TexRef3D(scalar,ti,tj,k+1) , -vel ) - upwind ;
   } else if ( k == kps+1 ) {
     // two-point interpolation next to the first level
     fqz[i3] = vel*(fzm[k]*TexRef3D(scalar,ti,tj,k)+fzp[k]*TexRef3D(scalar,ti,tj,k-1)) - upwind ;
   } else {
     // interior levels kps+3 .. kpf-2
     fqz[i3] = vel*FLUX5( TexRef3D(scalar,ti,tj,k-3) ,
                          TexRef3D(scalar,ti,tj,k-2) ,
                          TexRef3D(scalar,ti,tj,k-1) ,
                          TexRef3D(scalar,ti,tj,k  ) ,
                          TexRef3D(scalar,ti,tj,k+1) ,
                          TexRef3D(scalar,ti,tj,k+2) , -vel ) - upwind ;
   }
 }
}
