//by Branislav Jansik, @IT4Innovations, 2024
//
// Single-precision (f32) FMA throughput benchmark.
//
// Every OpenMP thread runs the same hand-written AVX loop of vfmadd213ps
// instructions, each computing z = z*z + c per lane (the Mandelbrot recurrence
// restricted to real numbers) on ten independent 8-lane accumulators. The
// program reports, per run, the min/avg/max/sum of the per-thread FMA rate and
// the min/max of the per-thread timestamp-counter rate.
//
// Usage: prog [niter]     niter = number of timed runs (default 1)

#include <limits.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define REPEAT10(x) x x x x x x x x x x

#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)

/* Trip count of the asm loop. Must stay a plain decimal literal that fits a
 * signed 32-bit immediate, because it is pasted into the cmpq instruction. */
#define LOOP_ITERS 40000000
/* FMA instructions per trip: REPEAT10(REPEAT10(...)) around 10 instructions. */
#define FMA_PER_ITER 1000
/* f32 lanes in one 256-bit ymm register. */
#define F32_LANES 8
/* Floats written back by the kernel: 10 accumulators x 8 lanes. */
#define OUT_FLOATS 80

/* 32 values of c, evenly spaced from -2.0 to 0.25. */
#define AR_ROW \
    -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452, \
    -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645, \
    -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839, \
    -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032, \
     0.03226,  0.10484,  0.17742,  0.25000

/*
 * Read the x86 timestamp counter with rdtsc and return it as one 64-bit value
 * (EDX holds the high half, EAX the low half).
 */
static __inline__ unsigned long long int get_cycles(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long int)lo) | (((unsigned long long int)hi) << 32);
}

/*
 * Entry point.
 *
 * argv[1] (optional): number of timed runs, a positive decimal integer.
 * Returns 0 on success, EXIT_FAILURE when argv[1] is not a valid run count.
 */
int main(int argc, char **argv)
{
    /* Kernel input: the 32-value row repeated four times (512 bytes). The
     * kernel reads the first 320 bytes; vmovaps needs 32-byte alignment. */
    float ar[128] __attribute__((aligned(32))) = {
        AR_ROW, AR_ROW, AR_ROW, AR_ROW
    };
    /* Kernel result as published by the master thread. Zero-initialised so the
     * final dump never reads indeterminate values. */
    float br[128] __attribute__((aligned(32))) = {0};
    int niter = 1;

    if (argc > 1) {
        char *end;
        long v = strtol(argv[1], &end, 10);
        /* v < INT_MAX (strict) keeps the run counter below from overflowing. */
        if (end == argv[1] || *end != '\0' || v < 1 || v >= INT_MAX) {
            fprintf(stderr, "usage: %s [niter >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
        niter = (int)v;
    }

    int size = omp_get_max_threads();

    printf("FLIPS: Single Precision (f32) FMA Instructions Per Second\nRun %d times on %d cores\n", niter, size);

    for (int run = 1; run <= niter; run++) {
        double rsum = 0.0;
        double rmax = -INFINITY;
        double rmin = INFINITY;
        double fqmax = -INFINITY;
        double fqmin = INFINITY;
        int nthreads = 0;   /* threads that actually reported this run */

#pragma omp parallel default(shared)
        {
            /* Thread-private result buffer, so threads do not all store into
             * the shared br at once. */
            float tr[OUT_FLOATS] __attribute__((aligned(32))) = {0};
            double t0, t, freq, flips;
            unsigned long long int c0, c;

            /* Line the threads up so they start timing together. */
#pragma omp barrier

            t0 = omp_get_wtime();
            c0 = get_cycles();

            __asm__ __volatile__(
                /* init: all accumulators z = 0, six c vectors kept in registers */
                "vzeroall\n\t"

                "vmovaps (%0), %%ymm0\n\t"
                "vmovaps 32(%0), %%ymm2\n\t"
                "vmovaps 64(%0), %%ymm4\n\t"
                "vmovaps 96(%0), %%ymm6\n\t"
                "vmovaps 128(%0), %%ymm8\n\t"
                "vmovaps 160(%0), %%ymm10\n\t"

                "xorq %%rcx, %%rcx\n\t"     /* zero the loop counter */

                /* Local numeric label: safe if the compiler ever emits this
                 * asm statement more than once. */
                "1:\n\t"
                "incq %%rcx\n\t"

REPEAT10(
REPEAT10(
                "vfmadd213ps %%ymm0, %%ymm1, %%ymm1\n\t"
                "vfmadd213ps %%ymm2, %%ymm3, %%ymm3\n\t"
                "vfmadd213ps %%ymm4, %%ymm5, %%ymm5\n\t"
                "vfmadd213ps %%ymm6, %%ymm7, %%ymm7\n\t"
                "vfmadd213ps %%ymm8, %%ymm9, %%ymm9\n\t"
                "vfmadd213ps %%ymm10, %%ymm11, %%ymm11\n\t"
                "vfmadd213ps 192(%0), %%ymm13, %%ymm13\n\t"
                "vfmadd213ps 224(%0), %%ymm15, %%ymm15\n\t"
                "vfmadd213ps 256(%0), %%ymm14, %%ymm14\n\t"
                "vfmadd213ps 288(%0), %%ymm12, %%ymm12\n\t"
)
)

                /* end loop */
                "cmpq $" STRINGIFY(LOOP_ITERS) ", %%rcx\n\t"
                "jb 1b\n\t"

                /* offload the ten accumulators */
                "vmovaps %%ymm1, (%1)\n\t"
                "vmovaps %%ymm3, 32(%1)\n\t"
                "vmovaps %%ymm5, 64(%1)\n\t"
                "vmovaps %%ymm7, 96(%1)\n\t"
                "vmovaps %%ymm9, 128(%1)\n\t"
                "vmovaps %%ymm11, 160(%1)\n\t"
                "vmovaps %%ymm13, 192(%1)\n\t"
                "vmovaps %%ymm15, 224(%1)\n\t"
                "vmovaps %%ymm14, 256(%1)\n\t"
                "vmovaps %%ymm12, 288(%1)\n\t"

                /* no outputs; inputs; clobbers (the xmm names cover the
                 * corresponding ymm registers) */
                :
                : "r" (ar), "r" (tr)
                : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
                  "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
                  "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
                  "rcx", "cc", "memory");

            c = get_cycles() - c0;
            t = omp_get_wtime() - t0;

            /* Giga lane-FMAs per second, derived from the same constants that
             * drive the loop so the two cannot drift apart. */
            flips = (double)LOOP_ITERS * FMA_PER_ITER * F32_LANES / 1e9 / t;
            /* Timestamp-counter ticks per nanosecond.
             * NOTE(review): on CPUs with an invariant TSC this is the reference
             * clock, not necessarily the current core clock - confirm. */
            freq = c / t / 1e9;

#pragma omp critical
            {
                rsum += flips;
                rmax = fmax(flips, rmax);
                rmin = fmin(flips, rmin);
                fqmax = fmax(freq, fqmax);
                fqmin = fmin(freq, fqmin);
                nthreads++;
            }

            /* All threads compute identical values; publish the master's. */
#pragma omp master
            for (int k = 0; k < OUT_FLOATS; k++) {
                br[k] = tr[k];
            }
        }

        printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
               run, rmin, rsum / nthreads, rmax, rsum, fqmin, fqmax);
    }

    /* Dump the kernel result so the computation has an observable output. */
    printf("Outputs:\n");
    for (int i = 0; i < OUT_FLOATS; i += 4) {
        printf("br: %d: %f %f %f %f\n", i, br[i + 0], br[i + 1], br[i + 2], br[i + 3]);
    }

    return 0;
}