From 0ef147e13587e25fd806953fafa60d336df72172 Mon Sep 17 00:00:00 2001
From: Jakub Homola <jakub.homola@vsb.cz>
Date: Mon, 10 Jan 2022 19:01:37 +0100
Subject: [PATCH] openmp readmes, makefile changes

---
 12_omp_offload_vadd/README.md | 14 ++++++++++++++
 13_omp_offload_pi/Makefile    | 12 ++++++------
 13_omp_offload_pi/README.md   |  7 +++++++
 3 files changed, 27 insertions(+), 6 deletions(-)
 create mode 100644 12_omp_offload_vadd/README.md
 create mode 100644 13_omp_offload_pi/README.md

diff --git a/12_omp_offload_vadd/README.md b/12_omp_offload_vadd/README.md
new file mode 100644
index 0000000..d82e941
--- /dev/null
+++ b/12_omp_offload_vadd/README.md
@@ -0,0 +1,14 @@
+
+OpenMP offloading on AMD GPUs
+=============================
+
+This example demonstrates how to use AOMP, which can compile programs that use OpenMP offloading.
+
+The `vadd.cpp` source file contains a simple vector add source code. On line 35 there begins a loop performing the vector addition, which is annotated by several OpenMP constructs. The `target` construct makes the code execute on the GPU, `map` informs OpenMP about what data transfers should be done. The `teams` construct creates a league of teams, and `distribute` splits the for loop iterations between all teams, a lot like dividing work between threadblocks in CUDA/HIP. `parallel for` then creates several threads, which together work on the team's loop iterations, just like threads in a threadblock.
+
+The code can be compiled using
+```
+aompcc vadd.cpp -o vadd.x
+```
+
+On machines with a GPU other than the default (Vega, gfx900), one would either `export AOMP_GPU=gfx908` or compile using `aompcc --offload-arch gfx908 vadd.cpp -o vadd.x` (for AMD Instinct MI100).
diff --git a/13_omp_offload_pi/Makefile b/13_omp_offload_pi/Makefile
index 422296d..efaac82 100644
--- a/13_omp_offload_pi/Makefile
+++ b/13_omp_offload_pi/Makefile
@@ -2,7 +2,7 @@
 CLANG=/opt/rocm/llvm/bin/clang++
 
 
-.PHONY: compile clean run pi_seq.x pi_omp.x pi_gpu_clang.x pi_gpu_aomp.x pi_hip.x
+.PHONY: compile clean run
 
 
 
@@ -12,11 +12,11 @@ clean:
 	rm -rf *.x
 
 run: compile
-	@echo "Sequential"                &&   ./pi_seq.x       1000000000    &&   echo
-	@echo "OpenMP"                    &&   ./pi_omp.x       10000000000   &&   echo
-	@echo "OpenMP offloading Clang"   &&   ./pi_gpu_clang.x 10000000000   &&   echo
-	@echo "OpenMP offloading AOMP"    &&   ./pi_gpu_aomp.x  10000000000   &&   echo
-	@echo "Classic HIP"               &&   ./pi_hip.x       10000000000   &&   echo
+	@echo "\nSequential, 10x less points" &&   ./pi_seq.x       1000000000
+	@echo "\nOpenMP"                      &&   ./pi_omp.x       10000000000
+	@echo "\nOpenMP offloading Clang"     &&   ./pi_gpu_clang.x 10000000000
+	@echo "\nOpenMP offloading AOMP"      &&   ./pi_gpu_aomp.x  10000000000
+	@echo "\nHIP"                         &&   ./pi_hip.x       10000000000
 
 
 
diff --git a/13_omp_offload_pi/README.md b/13_omp_offload_pi/README.md
new file mode 100644
index 0000000..214b718
--- /dev/null
+++ b/13_omp_offload_pi/README.md
@@ -0,0 +1,7 @@
+
+OpenMP offloading, comparison
+=============================
+
+This example compares different OpenMP parallelization techniques of a simple algorithm calculating $\pi$ based on numerical integration and the fact that $\pi = \int_0^1 \frac{4}{1+x^2} \; \mathrm{d} x$.
+
+The `pi_seq.cpp` source file contains the sequential code of this algorithm, `pi_omp.cpp` is parallelized using OpenMP, `pi_omp_offload.cpp` uses OpenMP offloading, and `pi_hip.hip.cpp` is the same algorithm written in HIP. Compile the sources with `make` and run them all with `make run`. Notice in how many different ways the code was compiled, what commands were used for the compilation, and compare the differences in computation time.
-- 
GitLab