diff --git a/mandelbrot-real-fma-f32-omp.c b/mandelbrot-real-fma-f32-omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d04758eb001ea0147c406213446110abb3b314a
--- /dev/null
+++ b/mandelbrot-real-fma-f32-omp.c
@@ -0,0 +1,169 @@
+//by Branislav Jansik, @IT4Innovations, 2024
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include <math.h>
+
+#define REPEAT10(x) x x x x x x x x x x
+
+/* Return the 64-bit value read by RDTSC (low 32 bits arrive in EAX, high 32 bits in EDX). */
+static __inline__ unsigned long long int get_cycles(void)
+{
+  unsigned tsc_lo, tsc_hi;
+  __asm__ __volatile__ ("rdtsc" : "=a"(tsc_lo), "=d"(tsc_hi));
+  return ( ((unsigned long long int)tsc_hi) << 32 ) | ( (unsigned long long int)tsc_lo );
+}
+
+/* Benchmark driver: each OpenMP thread runs a fixed-length AVX FMA kernel; per-thread
+ * throughput and TSC rate are summarised once per repetition (count from argv[1], default 1). */
+int main(int argc, char **argv) {
+  /* Kernel shape: LOOP_ITERS passes over FMA_PER_ITER vfmadd213ps instructions (the
+   * REPEAT10(REPEAT10(...)) of a 10-instruction group), each working on F32_LANES floats;
+   * the kernel stores NOUT floats (ten ymm registers) as its result. */
+  enum { LOOP_ITERS = 40000000, FMA_PER_ITER = 1000, F32_LANES = 8, NOUT = 80 };
+
+  /* FMA addends: the same 32-value ramp from -2.0 to 0.25, repeated four times. */
+  float ar[128] __attribute__((aligned(32))) =
+  { -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
+    -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
+    -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
+    -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
+     0.03226,  0.10484,  0.17742,    0.25000,
+    -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
+    -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
+    -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
+    -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
+     0.03226,  0.10484,  0.17742,    0.25000,
+    -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
+    -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
+    -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
+    -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
+     0.03226,  0.10484,  0.17742,    0.25000,
+    -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
+    -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
+    -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
+    -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
+     0.03226,  0.10484,  0.17742,    0.25000
+  };
+
+  /* Published results; zero-initialised so the final dump is defined even when niter < 1. */
+  float br[128] __attribute__((aligned(32))) = {0};
+
+  double rsum, rmax, rmin, fqmax, fqmin;
+  int i, size, niter = 1;
+
+  if (argc > 1) niter = atoi(argv[1]);
+
+  size = omp_get_max_threads();
+
+  printf("FLIPS: Single Precision (f32) FMA Instructions Per Second\nRun %d times on %d cores\n", niter, size);
+
+  for (i = 1; i <= niter; i++) {
+
+    rsum = 0.0;
+    rmax = -INFINITY;
+    rmin = INFINITY;
+    fqmax = -INFINITY;
+    fqmin = INFINITY;
+
+#pragma omp parallel default(shared)
+    {
+      double t0, t, freq, flips;
+      int j;
+      unsigned long long int c0, c;
+      /* Thread-private result buffer: letting every thread store into the shared br
+       * would be a data race. */
+      float tr[NOUT] __attribute__((aligned(32)));
+
+      /* Line the threads up before they start timing. */
+#pragma omp barrier
+
+      t0 = omp_get_wtime();
+      c0 = get_cycles();
+
+      __asm__ __volatile__(
+        //init
+        "vzeroall\n\t"
+
+        "vmovaps (%0),    %%ymm0\n\t"
+        "vmovaps 32(%0),  %%ymm2\n\t"
+        "vmovaps 64(%0),  %%ymm4\n\t"
+        "vmovaps 96(%0),  %%ymm6\n\t"
+        "vmovaps 128(%0), %%ymm8\n\t"
+        "vmovaps 160(%0), %%ymm10\n\t"
+
+        "xorq %%rcx, %%rcx\n\t"            //zero out loop counter
+
+        //loop: numeric local label, so the asm still assembles if the compiler emits it twice
+        "1:\n\t"
+        "incq %%rcx\n\t"
+
+REPEAT10(
+REPEAT10(
+        "vfmadd213ps %%ymm0, %%ymm1, %%ymm1\n\t"
+        "vfmadd213ps %%ymm2, %%ymm3, %%ymm3\n\t"
+        "vfmadd213ps %%ymm4, %%ymm5, %%ymm5\n\t"
+        "vfmadd213ps %%ymm6, %%ymm7, %%ymm7\n\t"
+        "vfmadd213ps %%ymm8, %%ymm9, %%ymm9\n\t"
+        "vfmadd213ps %%ymm10, %%ymm11, %%ymm11\n\t"
+        "vfmadd213ps 192(%0), %%ymm13, %%ymm13\n\t"
+        "vfmadd213ps 224(%0), %%ymm15, %%ymm15\n\t"
+        "vfmadd213ps 256(%0), %%ymm14, %%ymm14\n\t"
+        "vfmadd213ps 288(%0), %%ymm12, %%ymm12\n\t"
+)
+)
+
+        //end loop (operand 2 is the immediate LOOP_ITERS)
+        "cmpq %2, %%rcx\n\t"
+        "jb 1b\n\t"
+
+        //offload
+        "vmovaps %%ymm1,     (%1)\n\t"
+        "vmovaps %%ymm3,   32(%1)\n\t"
+        "vmovaps %%ymm5,   64(%1)\n\t"
+        "vmovaps %%ymm7,   96(%1)\n\t"
+        "vmovaps %%ymm9,  128(%1)\n\t"
+        "vmovaps %%ymm11, 160(%1)\n\t"
+        "vmovaps %%ymm13, 192(%1)\n\t"
+        "vmovaps %%ymm15, 224(%1)\n\t"
+        "vmovaps %%ymm14, 256(%1)\n\t"
+        "vmovaps %%ymm12, 288(%1)\n\t"
+
+        //inputs, outputs and clobbers
+        :  : "r" (ar), "r" (tr), "i" (LOOP_ITERS) :
+        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+        "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
+        "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+        "rcx", "cc", "memory");
+
+      c = get_cycles() - c0;
+      t = omp_get_wtime() - t0;
+
+      /* Giga f32 FMA lane-operations per second executed by this thread. */
+      flips = F32_LANES * ((double)LOOP_ITERS * FMA_PER_ITER / 1e9) / t;
+      freq  = c/t/1e9;
+
+#pragma omp critical
+      {
+        rsum += flips;
+        rmax = fmax(flips, rmax);
+        rmin = fmin(flips, rmin);
+        fqmax = fmax(freq, fqmax);
+        fqmin = fmin(freq, fqmin);
+      }
+
+      /* Only the master thread publishes its kernel results into the shared br. */
+#pragma omp master
+      for (j = 0; j < NOUT; j++) br[j] = tr[j];
+    }
+
+    printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
+           i, rmin, rsum/size, rmax, rsum, fqmin, fqmax);
+  }
+
+  printf("Outputs:\n");
+  for (i = 0; i < NOUT; i += 4)
+    printf("br: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
+
+  return 0;
+}