# Shell that make uses to execute every recipe line.
SHELL = csh
##################### intel compiler #############
# Compiler drivers:
#   ECPP - compiles the C++ file that nvcc generates from the CUDA source
#   CC   - C compiler
#   FC   - Fortran compiler; LD reuses it as the link driver
ECPP = icc
CC = icc
FC = ifort
LD = $(FC)
# Compile flags. CFLAGS already contains -c, so it is only usable for compile steps.
CFLAGS = -w -O3 -ip -c
FCFLAGS = -align all -FR -convert big_endian -fno-alias -fno-fnalias
# Optimization level for the optimized Fortran and C++ compiles;
# switch to the commented line for a debug build.
OPT = -O3
#OPT = -O0 -g
# Extra options for the link step (none by default).
LDOPT =
# GPU architecture, handed to nvcc through -arch.
ARCH = sm_10
# these block dims turned out to be optimal for large chemistry case on gtx280s
# (passed to the CUDA build as -DXXX / -DYYY)
XXX = 16
YYY = 11
##################### gcc/gfortran ###############
#ECPP    = g++
#CC      = gcc
#FC      = gfortran 
#FCFLAGS = -fconvert=big-endian -frecord-marker=4 -ffree-form
#OPT     =  -O3
#OPT     =  -O0 -g
#LD      = $(FC)
#LDOPT   = 
#ARCH    = sm_10
#XXX = 16
#YYY = 8
##################################################
# Default precision: FLOAT replaces every "float" in the CUDA source and
# RWORDSIZE is passed to the Fortran preprocessing steps as -DRWORDSIZE.
# do not change these definitions here, change the ones further down
FLOAT = float
RWORDSIZE = 4
##################### cuda location ##############
# Directory holding the CUDA libraries; enable the line that matches your site.
# eces-shell
#CUDALIBPATH = ~/emu/cuda/lib
# ncsa
CUDALIBPATH = /usr/local/cuda/lib
# cuBLAS shared library, and the emulation variant that can be swapped in
# from the user-editable section.
LIBCUBLAS = $(CUDALIBPATH)/libcublas.so
LIBCUBLASEMU = $(CUDALIBPATH)/libcublasemu.so


########################  THIS SECTION YOU CAN CHANGE ##################
#
# Hard coded number of levels  (35 for conus, 28 for jan00)
MKX = 35

# Optional switches handed to the preprocessors as -D flags;
# comment a line out (or uncomment it) to toggle the switch.
PINNING = -DPINNING
#XPOSE_INPUT = -DXPOSE_INPUT

# uncomment this to use FLOAT4 data type (optimization)
#FLOAT4 = -DFLOAT_4=4

# These must always be defined, but they have no effect unless
# other settings are enabled.
DEBUG_I = 71
DEBUG_J = 3
DEBUG_K = 1
# FLOAT4 rides along in DEBUGOUTPUT so that it reaches every step that
# receives DEBUGOUTPUT.
DEBUGOUTPUT = -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to run on emulator instead of the device
#DEVICEEMU = -DDEVICEEMU
#DEVICEEMU_NVCC = -deviceemu $(DEVICEEMU)
#LIBCUBLAS = $(LIBCUBLASEMU)

# uncomment to output detailed debug data output
# (requires the DEVICEEMU settings above to be uncommented as well)
#DEBUGOUTPUT = -DDEBUGOUTPUT -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to allow settings to force closer agreement
#DEBUGDEBUG = -DDEBUGDEBUG

# uncomment to promote to 8 byte floats
# note: if you do this without DEVICEEMU above, the compiler will complain
# that it does not have enough shared mem
# NOTE(review): the FCFLAGS line below uses gfortran-style options (compare
# the commented gcc/gfortran section); it presumably needs adapting when
# ifort is the active compiler - confirm before enabling.
#PROMOTE = -DPROMOTE
#FLOAT = double
#FCFLAGS = -fconvert=big-endian -frecord-marker=4 -fdefault-real-8
#RWORDSIZE = 8
########################  END OF SECTION YOU CAN CHANGE ################

# Include path and link line for building against a full ../WRFV3 tree.
# NOTE: each variable is reset to empty right after its WRF definition, so
# the WRF settings are inactive; the later (empty) assignment wins.
INCLUDE_MODULES = -I../WRFV3/main \
                  -I../WRFV3/frame \
                  -I../WRFV3/dyn_em \
                  -I../WRFV3/share
INCLUDE_MODULES =

VANILLA_LIBS = -L../WRFV3/main -lwrflib \
               -L../WRFV3/external/fftpack/fftpack5 -lfftpack \
               -L../WRFV3/external/io_grib1 -lio_grib1 \
               -L../WRFV3/external/io_grib_share -lio_grib_share \
               -L../WRFV3/external/esmf_time_f90 -lesmf_time \
               ../WRFV3/frame/module_internal_header_util.o \
               ../WRFV3/frame/pack_utils.o \
               ../WRFV3/external/io_netcdf/libwrfio_nf.a \
               -L$(NETCDF)/lib -lnetcdf \
               -L../WRFV3/external/io_int -lwrfio_int
VANILLA_LIBS =

# nvcc driver; the active line additionally passes -ccbin /usr/bin.
#NVCC    = nvcc -DCUDA
NVCC    = nvcc -DCUDA -ccbin /usr/bin
# Preprocessor flags shared by the cpp listing step and the nvcc compile.
# (-DZTIME_ON is a disabled extra switch; append it here to turn it on.)
CCCCC   = -I. $(DEVICEEMU_NVCC) $(PROMOTE) $(DEBUGDEBUG) $(DEBUGOUTPUT) -DMKX=$(MKX) -DXXX=$(XXX) -DYYY=$(YYY) -DSTANDALONE $(PINNING) $(XPOSE_INPUT)
# REGCNT is not defined anywhere in this file; supply it on the command line
# or in the environment (e.g. `make REGCNT=<n>`). When it is unset the
# -maxrregcount option is omitted entirely instead of being emitted with an
# empty value.
NVOPT   = $(if $(strip $(REGCNT)),-maxrregcount=$(REGCNT)) $(CCCCC) -arch $(ARCH) --host-compilation 'C++'

# Clear the suffix list so that no built-in suffix rules apply; every build
# step is written out explicitly below.
.SUFFIXES :

# These goals are command names, not files that the recipes create; declare
# them phony so a stray file called "all", "vanilla" or "chocolate" cannot
# make them look up to date.
.PHONY : all vanilla chocolate

# vanilla and chocolate both regenerate module_em.f90 and module_em.o in the
# current directory, so their recipes must never run concurrently (make -j).
.NOTPARALLEL :

# Default goal: build both drivers. The commented variants select other sets.
all : vanilla chocolate
#all : vanilla chocolate compare_snaps
#all : chocolate 
#all : compare_snaps

# Build the CUDA object in stages:
#   1. run m4 over the .cu source, filter it through spt.pl and replace every
#      "float" with $(FLOAT); the result is written to the scratch file y.cu
#   2. run cpp over y.cu to leave a preprocessed listing (rk_scalar_tend.E);
#      no other rule in this file reads that listing
#   3. let nvcc translate y.cu (-cuda) and rename the resulting y.cu.cpp
#   4. compile the renamed C++ file with the host compiler; the compiler's
#      default output name is expected to be the target
rk_scalar_tend.cu.o : rk_scalar_tend.cu parts.inc part1.inc part3.inc diffusion.inc update_scalar.inc
	m4 $< | spt.pl | sed "s/float/$(FLOAT)/g" > y.cu
	cpp -I. -C -P $(CCCCC) -DCUDA y.cu > $(<:.cu=.E)
	$(NVCC) -cuda $(NVOPT) y.cu
	/bin/mv y.cu.cpp $(@:.o=.cpp)
	$(ECPP) $(CFLAGS) $(OPT) -c $(@:.o=.cpp)

# Preprocess the .F source into a .f90 file, then compile that file with
# optimization disabled (-O0).
libmassv.o : libmassv.F
	/lib/cpp -C -P $< > $(<:.F=.f90)
	$(FC) -c -O0 $(FCFLAGS) $(<:.F=.f90)

# C helper object; MKX is passed in as a preprocessor define.
microclock.o : microclock.c
	$(CC) -c $(CFLAGS) -DMKX=$(MKX) $<

# Preprocessor options shared by both Fortran sources of the vanilla driver.
VANILLA_FPPFLAGS = -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) -DTEST_ON_GPU_RK -DSTANDALONE

# Driver built without -DRUN_ON_GPU and without the CUDA object.
# Each .F file is preprocessed to .f90 and compiled; module_advect_em goes
# first (presumably module_em.F uses its module - confirm). The link step
# writes rk_scalar_tend_driver_vanilla, so no file named "vanilla" is created
# and this recipe runs on every invocation.
vanilla : module_em.F module_advect_em.F advect_scalar.inc advect_scalar_pd_00.inc libmassv.o microclock.o
	/lib/cpp $(VANILLA_FPPFLAGS) module_advect_em.F > module_advect_em.f90
	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) module_advect_em.f90
	/lib/cpp $(VANILLA_FPPFLAGS) module_em.F > module_em.f90
	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) module_em.f90
	$(LD) -o rk_scalar_tend_driver_vanilla $(LDOPT) module_em.o module_advect_em.o $(filter %.o,$^) $(VANILLA_LIBS)

# Preprocessor options for the GPU driver's Fortran source.
CHOCOLATE_FPPFLAGS = -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) -DTEST_ON_GPU_RK -DRUN_ON_GPU -DSTANDALONE $(PINNING) $(XPOSE_INPUT)

# Driver built with -DRUN_ON_GPU and linked against the CUDA object and
# cuBLAS. module_em.F is preprocessed to module_em.f90 and compiled at -O0.
# The link step writes rk_scalar_tend_driver_chocolate, so no file named
# "chocolate" is created and this recipe runs on every invocation.
chocolate : module_em.F rk_scalar_tend.cu.o libmassv.o microclock.o
	/lib/cpp $(CHOCOLATE_FPPFLAGS) $< > module_em.f90
	$(FC) -c -O0 $(FCFLAGS) $(INCLUDE_MODULES) module_em.f90
	$(LD) -o rk_scalar_tend_driver_chocolate $(LDOPT) module_em.o $(filter %.o,$^) $(LIBCUBLAS) $(VANILLA_LIBS)

#	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) module_em.f90


# Build the compare_snaps executable. The .F source is copied (not
# preprocessed) to a temporary .f90 file for the compiler, and the copy is
# removed once the executable has been linked.
compare_snaps : compare_snaps.F
	\cp $< $@.f90
	$(FC) -o $@ $(FCFLAGS) $@.f90
	\rm $@.f90

# Remove generated files. "clena" is accepted as an alias for "clean".
# Both are command names rather than files, hence the .PHONY declaration.
.PHONY : clean clena
# The rk_scalar_tend_driver_* executables and the rk_scalar_tend.E listing
# are produced by this Makefile and are removed here; the wsm5_driver_*
# pattern is kept for backward compatibility.
# The leading '-' ignores a failing rm: under csh the command aborts with
# "No match" when none of the glob patterns matches, which would otherwise
# make `make clean` fail on an already-clean tree.
clean clena :
	-\rm -f *.o *.cu.c x.cu y.* rk_scalar_tend_driver_* wsm5_driver_* rk_scalar_tend.E *.mod *.f90 x.ptx *.cpp

