# Makefile

# Mandelbrot benchmark makefile
# Usage: make [icc] target [target]
 
# GCC/MPI config
# Compiler drivers: plain C and the MPI wrapper.
CC      := gcc
MPICC   := mpicc
# Instruction-set flags; the build rules pick one of them per source file.
CFSSE   := -msse
CFAVX   := -mavx
CFAVX2  := -mavx2
CF512   := -mavx512f
CFNAT   := -march=native
# OpenMP switch and the libraries appended to the compile/link command.
CFOMP   := -fopenmp
LIBS    := -lm

# CUDA config
NVCC    = nvcc
# Optional kernel launch geometry, forwarded to nvcc as preprocessor defines.
# DFLAGS is only set when NBLOCKS is given (e.g. make NBLOCKS=... NTHREADS=...).
# -DNTHREADS is emitted only when NTHREADS is non-empty, so that passing
# NBLOCKS alone does not produce an empty "-DNTHREADS=" definition.
ifdef NBLOCKS
DFLAGS  = -DNBLOCKS=$(NBLOCKS) $(if $(NTHREADS),-DNTHREADS=$(NTHREADS))
endif

# HIP config (the -w option is always passed along with the compiler driver)
HIPCC   := hipcc -w

# MinGW config: 32-bit and 64-bit Windows cross compilers
MINGW32 := i686-w64-mingw32-gcc
MINGW64 := x86_64-w64-mingw32-gcc

# Octave config: interpreter used to generate the mbdata/mbsdata files
OCTAVE  := octave

# Targets
# One list per main goal; ALL collects every file this makefile can produce
# and is what "usage" prints and "clean" removes.
OMP     := cpuid.x \
           mandelbrot-real-sse-omp32.x \
           mandelbrot-real-sse-omp.x \
           mandelbrot-real-avx-omp.x \
           mandelbrot-real-fma-omp.x \
           mandelbrot-real-fma512-omp.x

OMPWIN  := cpuid.exe \
           mandelbrot-real-sse-omp32.exe \
           mandelbrot-real-sse-omp.exe \
           mandelbrot-real-avx-omp.exe \
           mandelbrot-real-fma-omp.exe \
           mandelbrot-real-fma512-omp.exe

MPI     := mandelbrot-real-sse-mpi-dump.x \
           mandelbrot-real-avx-mpi-dump.x \
           mandelbrot-real-fma-mpi-mem4-dump.x \
           mandelbrot-real-fma-mpi-dump-mic.x \
           mandelbrot-ridge-real-fma-zmm-mpi.x

CUDA    := mandelbrot-real-fma-ptx-f16-dump.x \
           mandelbrot-real-fma-ptx-f16x2-dump.x \
           mandelbrot-real-fma-ptx-f32-dump.x \
           mandelbrot-real-fma-ptx-dump.x \
           stream_gpu_flops.x

RDNA    := mandelbrot-real-fma-rdna-f32-dump.x \
           mandelbrot-real-fma-rdna-f64-dump.x \
           accumulator-mfma-cdna-f32.x

WMMA    := mandelbrot-real-wmma-ptx-f16-dump.x \
           mandelbrot-real-wmma-ptx-f64-dump.x

ARM     := mandelbrot-real-fma-sve-f64-omp.x \
           jansik-real-fmla-neon-f64-omp.x \
           mandelbrot-real-neon-f64-omp.x

POWER   := mandelbrot-real-fma-power-f64-omp.x \
           accumulator-ger-power-f64-omp.x

ALL     := $(OMP) $(OMPWIN) $(MPI) $(CUDA) $(WMMA) $(RDNA) $(ARM) $(POWER) \
           mbdata.o mbsdata.o mbdata mbsdata

# General rules

# Goals that are commands rather than files.
.PHONY: clean icc usage

# Empty suffix list: no suffix rules are in effect, only the pattern rules
# written in this file.
.SUFFIXES:

# Default goal (first target in the file): print a short help text.
# The fixed help strings are single-quoted so the shell cannot treat "[icc]"
# or "[target]" as glob patterns and replace them with matching one-letter
# file names from the current directory. The printed text is unchanged.
usage:
	@echo 'Usage: make [icc] target [target [target] ... ]'
	@echo
	@echo 'Select targets to build.'
	@echo 'Main targets: omp ompwin mpi cudafma cudawmma rdna arm power'
	@echo 'Build targets:'
	@echo $(ALL)

# "all" builds nothing by itself; it only triggers the help text above.
# It is a command, not a file, hence phony.
.PHONY: all
all: usage
# Pseudo-target that switches the build to the Intel compiler and Intel MPI.
# Put it in front of the real goals on the command line, e.g. "make icc omp".
# Each $(eval ...) below reassigns a make variable at the moment this recipe
# is expanded:
#   CC -> icc, MPICC -> mpiicc, CFOMP -> -qopenmp
#   CFAVX, CFAVX2, CF512, CFSSE and LIBS are emptied
# NOTE(review): this only affects goals that make processes after "icc";
# presumably it is not reliable with parallel builds (-j) -- confirm.
icc: 
	$(eval CC = icc)
	$(eval MPICC = mpiicc)
	$(eval CFAVX =)
	$(eval CFAVX2 =)
	$(eval CF512 =)
	$(eval CFSSE =)
	$(eval CFOMP = -qopenmp)
	$(eval LIBS =)
	@echo Using icc...

# Aggregate goals: each one only pulls in the corresponding list of binaries
# and has no recipe of its own. They are commands, not files, so they are
# declared phony: make then never treats a same-named file or directory as the
# target and skips implicit-rule search for them.
.PHONY: omp ompwin mpi cudafma cudawmma rdna arm power

omp: $(OMP)

ompwin: $(OMPWIN)

mpi: $(MPI)

cudafma: $(CUDA)

cudawmma: $(WMMA)

rdna: $(RDNA)

arm: $(ARM)

power: $(POWER)

# Build rules

# cpuid has explicit rules: the native binary is built with $(CC) and no
# extra flags, the Windows binary with the 32-bit MinGW compiler as a static
# 32-bit executable.
cpuid.x: cpuid.c
	$(CC) -o $@ $<

cpuid.exe: cpuid.c
	$(MINGW32) -m32 -static -o $@ $<

# Generic C build rule: compiles NAME.c into NAME.x.
# Compiler and flags are chosen from substrings of the source file name; the
# checks run top to bottom, so when several match the last assignment wins:
#   omp              -> add $(CFOMP)
#   mpi              -> compile with $(MPICC) instead of $(CC)
#   sse / avx / fma  -> $(CFSSE) / $(CFAVX) / $(CFAVX2)
#   512 / mic / zmm  -> $(CF512)
#   sve              -> $(CFNAT)
#   neon / power     -> no architecture flag
# CFARCH and CFOMP are reset to empty first: they are shell variables here,
# and without the reset a value exported from the calling environment would
# end up on the command line of sources that match none of the checks.
# The assignments are quoted so multi-word settings (e.g. CC="gcc -O2") work.
# The assembled command is echoed and then executed through eval.
%.x: %.c
	@CC="$(CC)"; CFARCH=; CFOMP=; \
	if [ "$(findstring omp,$<)" = omp ]; then CFOMP="$(CFOMP)"; fi; \
	if [ "$(findstring mpi,$<)" = mpi ]; then CC="$(MPICC)"; fi; \
	if [ "$(findstring sse,$<)" = sse ]; then CFARCH="$(CFSSE)"; fi; \
	if [ "$(findstring avx,$<)" = avx ]; then CFARCH="$(CFAVX)"; fi; \
	if [ "$(findstring fma,$<)" = fma ]; then CFARCH="$(CFAVX2)"; fi; \
	if [ "$(findstring 512,$<)" = 512 ]; then CFARCH="$(CF512)"; fi; \
	if [ "$(findstring mic,$<)" = mic ]; then CFARCH="$(CF512)"; fi; \
	if [ "$(findstring zmm,$<)" = zmm ]; then CFARCH="$(CF512)"; fi; \
	if [ "$(findstring sve,$<)" = sve ]; then CFARCH="$(CFNAT)"; fi; \
	if [ "$(findstring neon,$<)" = neon ]; then CFARCH=; fi; \
	if [ "$(findstring power,$<)" = power ]; then CFARCH=; fi; \
	EXEC="$$CC $$CFARCH $$CFOMP $< -o $@ $(LIBS)"; echo $$EXEC; \
	eval $$EXEC

# Windows cross-build rule: compiles mandelbrotNAME.c into a statically linked
# mandelbrotNAME.exe with MinGW.
# The 64-bit compiler is the default; a source name containing "32" selects
# the 32-bit one. The architecture flag follows the source name (sse, avx,
# fma, 512 -- the last match wins) and $(CFOMP) is always passed.
# CFARCH is reset to empty first so that a value exported from the calling
# environment cannot leak into the command when no substring matches.
# The assembled command is echoed and then executed through eval.
mandelbrot%.exe: mandelbrot%.c
	@CC="$(MINGW64)"; CFARCH=; \
	if [ "$(findstring 32,$<)" = 32 ]; then CC="$(MINGW32)"; fi; \
	if [ "$(findstring sse,$<)" = sse ]; then CFARCH="$(CFSSE)"; fi; \
	if [ "$(findstring avx,$<)" = avx ]; then CFARCH="$(CFAVX)"; fi; \
	if [ "$(findstring fma,$<)" = fma ]; then CFARCH="$(CFAVX2)"; fi; \
	if [ "$(findstring 512,$<)" = 512 ]; then CFARCH="$(CF512)"; fi; \
	EXEC="$$CC -static $$CFARCH $(CFOMP) $< -o $@"; echo $$EXEC; \
	eval $$EXEC


# CUDA rules. NVARCH is assigned per target (sm_53, sm_70 or sm_80) and handed
# to nvcc as the GPU architecture; $(DFLAGS), when set, carries the optional
# NBLOCKS/NTHREADS defines.

# FMA/PTX kernels: one .cu source per binary.
mandelbrot-real-fma-ptx%.x: mandelbrot-real-fma-ptx%.cu
	$(NVCC) $(DFLAGS) --gpu-architecture $(NVARCH) -o $@ $<
mandelbrot-real-fma-ptx%.x: NVARCH = sm_53

# WMMA kernels: the .cu source is built together with an object file made
# from the matching data file (mbdata.o or mbsdata.o).
mandelbrot-real-wmma-ptx-f16-dump.x: mandelbrot-real-wmma-ptx-f16-dump.cu mbdata.o
	$(NVCC) $(DFLAGS) --gpu-architecture $(NVARCH) -o $@ $^
mandelbrot-real-wmma-ptx-f16-dump.x: NVARCH = sm_70

mandelbrot-real-wmma-ptx-f64-dump.x: mandelbrot-real-wmma-ptx-f64-dump.cu mbsdata.o
	$(NVCC) $(DFLAGS) --gpu-architecture $(NVARCH) -o $@ $^
mandelbrot-real-wmma-ptx-f64-dump.x: NVARCH = sm_80

# stream_gpu_flops is built without $(DFLAGS).
stream_gpu_flops.x: stream_gpu_flops.cu
	$(NVCC) --gpu-architecture $(NVARCH) -o $@ $<
stream_gpu_flops.x: NVARCH = sm_53

# rdna, cdna rocm hip targets
# Any NAME.cpp is compiled to NAME.x with $(HIPCC).
# The target is deliberately the plain pattern "%.x": prefixing it with a
# $(wildcard ...) expansion would make the rule depend on which files happen
# to exist in the current directory, and any match would alter or break it.
%.x: %.cpp
	$(HIPCC) $< -o $@

# Input data files, generated by Octave scripts passed via --eval.
#   mbdata : NBLOCKS blocks of mbmatrix(16), written as "float"
#   mbsdata: NBLOCKS*NTHREADS/32 blocks of mbsmatrix(8), written as "double"
# NBLOCKS/NTHREADS are assigned per target below.
# NOTE(review): mbmatrix and mbsmatrix are presumably Octave functions found
# on Octave's load path -- confirm.
# The output name is taken from $@. If Octave fails, the partially written
# file is removed and the recipe fails, so a later run regenerates the data
# instead of treating a truncated file as up to date.
mbdata: NBLOCKS=1296
mbdata:
	$(OCTAVE) -q -W --eval 'fd=fopen("$@","wb"); for i=1:$(NBLOCKS); fwrite(fd,mbmatrix(16),"float"); end; fclose(fd);' \
	  || { rm -f $@; exit 1; }

mbsdata: NBLOCKS=216
mbsdata: NTHREADS=512
mbsdata:
	$(OCTAVE) -q -W --eval 'fd=fopen("$@","wb"); for i=1:($(NBLOCKS)*$(NTHREADS)/32); fwrite(fd,mbsmatrix(8),"double"); end; fclose(fd);' \
	  || { rm -f $@; exit 1; }

# Turn a data file (mbdata, mbsdata) into a relocatable object file: ld reads
# the input in "binary" format (-b binary) and writes a relocatable output (-r).
mb%.o: mb%
	ld -r -b binary $< -o $@

# Clean rule
# Removes every file listed in $(ALL): all binaries plus the generated data
# files and their objects. rm -f does not complain about files that do not exist.
clean:
	rm -f $(ALL)

# EOF