# Recipes in this file use csh syntax (for example the ">!" forced-overwrite
# redirection), so every recipe line is executed by csh.
SHELL = csh
# C compiler and flags used for microclock.c and the nvcc-generated C files
CC      = cc
CFLAGS  = -ffast-math
# Fortran compiler and flags (no trailing blanks: they would become part of
# the value)
FC      = gfortran
FCFLAGS = -fconvert=big-endian -frecord-marker=4
# optimization flags added to the libmassv and module_mp_wsm5 compiles
OPT     = -O3
# the Fortran compiler also performs the final link
LD      = $(FC)
LDOPT   =
# do not change these definitions; change the ones further down
# FLOAT replaces every "float" in the CUDA sources, RWORDSIZE is handed to
# cpp as -DRWORDSIZE, LIBCUBLAS is the library linked into the chocolate driver
FLOAT   = float
RWORDSIZE = 4
LIBCUBLAS = libcublas.so

########################  THIS SECTION YOU CAN CHANGE ##################
# these must always be defined, but they have no effect unless
# other settings are set
# Debug indices; each one is forwarded to the preprocessors/compilers as a
# -D definition of the same name through DEBUGOUTPUT.
DEBUG_I = 59
DEBUG_J = 45
DEBUG_K = 1
DEBUGOUTPUT  = -DDEBUG_I=$(DEBUG_I)
DEBUGOUTPUT += -DDEBUG_J=$(DEBUG_J)
DEBUGOUTPUT += -DDEBUG_K=$(DEBUG_K)

# uncomment to run on emulator instead of the device
#DEVICEEMU       = -DDEVICEEMU
#DEVICEEMU_NVCC  = -deviceemu $(DEVICEEMU)
#LIBCUBLAS = libcublasemu.so

# uncomment to produce detailed debug output
# (requires the DEVICEEMU settings above to be uncommented as well)
#DEBUGOUTPUT = -DDEBUGOUTPUT -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K)

# uncomment to allow settings to force closer agreement
#DEBUGDEBUG = -DDEBUGDEBUG

# uncomment to promote to 8-byte floats
# note: if you do this without the DEVICEEMU settings above, the compiler will complain that it does not have enough shared memory
#PROMOTE = -DPROMOTE
#FLOAT = double
#FCFLAGS = -fconvert=big-endian -frecord-marker=4 -fdefault-real-8
#RWORDSIZE = 8
########################  END OF SECTION YOU CAN CHANGE ################

# CUDA compiler driver; -DCUDA is defined for every file it processes.
NVCC    = nvcc -DCUDA
# Extra nvcc switches collected from the configurable section above.
# DEVICEEMU_NVCC, PROMOTE and DEBUGDEBUG expand to nothing unless they are
# uncommented there; DEBUGOUTPUT is always defined.
NVOPT   = $(DEVICEEMU_NVCC) $(PROMOTE)
NVOPT  += $(DEBUGDEBUG) $(DEBUGOUTPUT)

# Clear the suffix list so no built-in suffix rules apply; the objects in this
# file are all built by explicit rules.
.SUFFIXES :

# These targets name actions rather than files created by their recipes
# (the drivers are written to wsm5_driver_vanilla / wsm5_driver_chocolate),
# so a stray file called "vanilla", "clean", ... must not suppress them.
.PHONY : all vanilla chocolate clean

# vanilla and chocolate both regenerate module_mp_wsm5.f90 and
# module_mp_wsm5.o from the same source with different preprocessor flags,
# so their recipes must never run at the same time, even under "make -j".
.NOTPARALLEL :

# Default goal: build both drivers and the snapshot comparison tool.
all : vanilla chocolate compare_snaps

# Build wsm5.cu.o from wsm5.cu:
#   1. expand the source with m4 and replace every "float" by $(FLOAT),
#      writing y.cu
#   2. run nvcc -cuda on y.cu; the following step reads its result from y.cu.c
#   3. sed rewrites every identifier that surrounds wsm5_init_, wsm5_host_ or
#      rsl_internal_microclock_ with extra identifier characters (immediately
#      before an opening parenthesis) to the bare name, giving wsm5.cu.c
#      NOTE(review): presumably this strips decoration added by nvcc so the
#      symbols match what the Fortran code calls -- confirm.
#   4. compile wsm5.cu.c with the C compiler
# The scratch file is named after the file it becomes.  A name written as
# foo followed by two dollar signs reaches the shell as the fixed name "foo$"
# (make collapses the doubled dollar), and a fixed name would be shared with
# the wsm5_gpu.cu.o rule.
wsm5.cu.o : wsm5.cu
	m4 $< | sed "s/float/$(FLOAT)/g" > y.cu
	$(NVCC) -cuda $(NVOPT) y.cu
	sed \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*wsm5_init_[A-Za-z0-9_][A-Za-z0-9_]*(/wsm5_init_(/g' \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*wsm5_host_[A-Za-z0-9_][A-Za-z0-9_]*(/wsm5_host_(/g' \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*rsl_internal_microclock_[A-Za-z0-9_][A-Za-z0-9_]*(/rsl_internal_microclock_(/g' y.cu.c >! wsm5.cu.c.tmp
	/bin/mv wsm5.cu.c.tmp wsm5.cu.c
	$(CC) $(CFLAGS) -c wsm5.cu.c -o $@

# Build wsm5_gpu.cu.o from wsm5_gpu.cu:
#   1. expand the source with m4, filter it through spt.pl (found via PATH;
#      its effect is not visible from this file) and replace every "float"
#      by $(FLOAT), writing x.cu
#   2. run nvcc -cuda on x.cu; the following step reads its result from x.cu.c
#   3. sed rewrites every identifier that surrounds wsm5_init_, wsm5_host_ or
#      rsl_internal_microclock_ with extra identifier characters (immediately
#      before an opening parenthesis) to the bare name, giving wsm5_gpu.cu.c
#      NOTE(review): presumably this strips decoration added by nvcc so the
#      symbols match what the Fortran code calls -- confirm.
#   4. compile wsm5_gpu.cu.c with the C compiler
# The scratch file is named after the file it becomes.  A name written as
# foo followed by two dollar signs reaches the shell as the fixed name "foo$"
# (make collapses the doubled dollar), and a fixed name would be shared with
# the wsm5.cu.o rule.
wsm5_gpu.cu.o : wsm5_gpu.cu
	m4 $< | spt.pl | sed "s/float/$(FLOAT)/g" > x.cu
	$(NVCC) -cuda $(NVOPT) x.cu
	sed \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*wsm5_init_[A-Za-z0-9_][A-Za-z0-9_]*(/wsm5_init_(/g' \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*wsm5_host_[A-Za-z0-9_][A-Za-z0-9_]*(/wsm5_host_(/g' \
	     -e 's/[A-Za-z0-9_][A-Za-z0-9_]*rsl_internal_microclock_[A-Za-z0-9_][A-Za-z0-9_]*(/rsl_internal_microclock_(/g' x.cu.c >! wsm5_gpu.cu.c.tmp
	\mv wsm5_gpu.cu.c.tmp wsm5_gpu.cu.c
	$(CC) $(CFLAGS) -c wsm5_gpu.cu.c -o $@

# Run libmassv.F through the C preprocessor (-C keeps comments, -P omits line
# markers) to obtain libmassv.f90, then compile that file.
libmassv.o : libmassv.F
	/lib/cpp -C -P $< > $(<:.F=.f90)
	$(FC) $(OPT) $(FCFLAGS) -c $(<:.F=.f90)

# Plain C compile of the timer helper.
microclock.o : microclock.c
	$(CC) $(CFLAGS) -c $<

# Driver built without -DRUN_ON_GPU.  module_mp_wsm5.F is preprocessed with
# the debug/emulation switches and the real word size, compiled, and linked
# with the CUDA-derived objects into wsm5_driver_vanilla.
# NOTE(review): this link line always names libcublasemu.so, whereas the
# chocolate driver links $(LIBCUBLAS) -- confirm that this is intentional.
vanilla : module_mp_wsm5.F wsm5.cu.o wsm5_gpu.cu.o libmassv.o microclock.o
	/lib/cpp -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) $< > $(<:.F=.f90)
	$(FC) $(OPT) $(FCFLAGS) -c $(<:.F=.f90)
	$(LD) -g $(LDOPT) -o wsm5_driver_vanilla \
	    $(<:.F=.o) wsm5.cu.o wsm5_gpu.cu.o libmassv.o microclock.o \
	    ~/emu/cuda/lib/libcublasemu.so

# Driver built with -DRUN_ON_GPU.  module_mp_wsm5.F is preprocessed with the
# debug/emulation switches and the real word size, compiled, and linked with
# the CUDA-derived objects and $(LIBCUBLAS) into wsm5_driver_chocolate.
chocolate : module_mp_wsm5.F wsm5.cu.o wsm5_gpu.cu.o libmassv.o microclock.o
	/lib/cpp -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRUN_ON_GPU -DRWORDSIZE=$(RWORDSIZE) $< > $(<:.F=.f90)
	$(FC) $(OPT) $(FCFLAGS) -c $(<:.F=.f90)
	$(LD) -g $(LDOPT) -o wsm5_driver_chocolate \
	    $(<:.F=.o) wsm5.cu.o wsm5_gpu.cu.o libmassv.o microclock.o \
	    ~/emu/cuda/lib/$(LIBCUBLAS)


# Build the comparison tool: the source is copied to a .f90 name (no
# preprocessing), compiled and linked in one step, and the copy is removed.
compare_snaps : compare_snaps.F
	\cp $< $@.f90
	$(FC) $(FCFLAGS) -o $@ $@.f90
	\rm $@.f90

# Remove objects, generated C/CUDA/Fortran intermediates (including y.cu from
# the wsm5.cu.o rule), module files and both drivers.
# csh refuses to run a command when none of its wildcards match anything
# ("No match"), which would make this fail on an already clean tree;
# nonomatch hands unmatched patterns to rm unchanged and rm -f ignores them.
clean:
	set nonomatch; \rm -f *.o *.cu.c x.cu y.cu wsm5_driver_* *.mod *.f90

