Skip to content
Snippets Groups Projects
Commit 27cb03ef authored by Branislav Jansik's avatar Branislav Jansik
Browse files

Mandelbrot OMP single precision FMA version

parent 8d9a9879
No related branches found
No related tags found
No related merge requests found
//by Branislav Jansik, @IT4Innovations, 2024
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
/* Expands to ten back-to-back copies of x. main() nests two levels of it
 * around a group of FMA instruction strings to unroll the asm loop body. */
#define REPEAT10(x) x x x x x x x x x x
/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers the counter in two 32-bit halves (EAX = low, EDX = high);
 * they are stitched together here into one 64-bit tick count.
 */
static __inline__ unsigned long long int get_cycles(void)
{
    unsigned int tsc_low;
    unsigned int tsc_high;
    unsigned long long int ticks;

    __asm__ __volatile__ ("rdtsc" : "=a"(tsc_low), "=d"(tsc_high));

    ticks = tsc_high;   /* widen before shifting so no bits are lost */
    ticks <<= 32;
    ticks |= tsc_low;
    return ticks;
}
/*
 * Benchmark geometry. The loop trip count is pasted into the asm text and is
 * also used to compute the amount of work done, so the two cannot drift apart.
 */
#define FMA_LOOP_TRIPS 40000000 /* trips through the asm loop                 */
#define FMA_PER_TRIP   1000     /* REPEAT10(REPEAT10(10 instructions))        */
#define F32_LANES      8        /* floats in one 256-bit ymm register         */
#define OUT_FLOATS     80       /* 10 accumulator registers * 8 floats each   */
#define FMA_STR_(x) #x
#define FMA_STR(x)  FMA_STR_(x)

/*
 * Single-precision FMA throughput benchmark.
 *
 * Every OpenMP thread runs a long, fully unrolled chain of vfmadd213ps
 * instructions of the form z = z*z + c (ten independent accumulators, all
 * starting from zero, with c taken from ar[]), times it with omp_get_wtime()
 * and the time-stamp counter, and contributes its rate to a per-run summary.
 *
 * argv[1] (optional): number of benchmark repetitions, an integer >= 1
 *                     (default 1).
 *
 * Returns 0 on success, EXIT_FAILURE when argv[1] is not a valid count.
 */
int main(int argc, char **argv) {
    /* The constants c: four identical rows of 32 values spanning [-2, 0.25]. */
    float ar[128] __attribute__((aligned(32))) =
    { -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452,
      -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645,
      -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839,
      -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032,
       0.03226,  0.10484,  0.17742,  0.25000,
      -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452,
      -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645,
      -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839,
      -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032,
       0.03226,  0.10484,  0.17742,  0.25000,
      -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452,
      -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645,
      -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839,
      -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032,
       0.03226,  0.10484,  0.17742,  0.25000,
      -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452,
      -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645,
      -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839,
      -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032,
       0.03226,  0.10484,  0.17742,  0.25000
    };
    /* Results published by the master thread; zeroed so it is always defined. */
    float br[128] __attribute__((aligned(32))) = {0};
    /* Lane-FMAs executed by one thread per run, in units of 1e9. */
    const double gfma_per_thread =
        (double)F32_LANES * FMA_PER_TRIP * FMA_LOOP_TRIPS / 1e9;
    double rsum, rmax, rmin, fqmax, fqmin;
    int i, size, nthreads, niter = 1;

    if (argc > 1) {
        char *end;
        long requested = strtol(argv[1], &end, 10);

        niter = (int)requested;
        /* Reject non-numeric text, trailing junk, counts < 1 and values
         * that do not survive the narrowing to int. */
        if (end == argv[1] || *end != '\0' || requested < 1 ||
            (long)niter != requested) {
            fprintf(stderr, "usage: %s [repetitions >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
    }

    size = omp_get_max_threads();
    printf("FLIPS: Single Precision (f32) FMA Instructions Per Second\nRun %d times on %d cores\n", niter, size);

    for (i = 1; i <= niter; i++) {
        rsum = 0.0;
        rmax = -INFINITY;
        rmin = INFINITY;
        fqmax = -INFINITY;
        fqmin = INFINITY;
        nthreads = 0;

        #pragma omp parallel default(shared)
        {
            double t0, t, freq, flips;
            unsigned long long int c0, c;
            /* Thread-private result buffer: every thread stores its own
             * accumulators here, so no two threads write the same memory. */
            float tr[OUT_FLOATS] __attribute__((aligned(32)));

            /* Line the threads up so they all start timing together. */
            #pragma omp barrier
            t0 = omp_get_wtime();
            c0 = get_cycles();

            __asm__ __volatile__(
                /* init: zero every ymm register, then preload six of the
                 * ten constant vectors; the other four are read from memory
                 * by the FMA instructions themselves. */
                "vzeroall\n\t"
                "vmovaps (%0), %%ymm0\n\t"
                "vmovaps 32(%0), %%ymm2\n\t"
                "vmovaps 64(%0), %%ymm4\n\t"
                "vmovaps 96(%0), %%ymm6\n\t"
                "vmovaps 128(%0), %%ymm8\n\t"
                "vmovaps 160(%0), %%ymm10\n\t"
                "xorq %%rcx, %%rcx\n\t"     /* zero out loop counter */
                /* loop: numeric local label, so the assembler accepts the
                 * block even if the compiler emits it more than once. */
                "1:\n\t"
                "incq %%rcx\n\t"
                /* AT&T operand order: "vfmadd213ps c, z, z" is z = z*z + c. */
                REPEAT10(
                REPEAT10(
                "vfmadd213ps %%ymm0, %%ymm1, %%ymm1\n\t"
                "vfmadd213ps %%ymm2, %%ymm3, %%ymm3\n\t"
                "vfmadd213ps %%ymm4, %%ymm5, %%ymm5\n\t"
                "vfmadd213ps %%ymm6, %%ymm7, %%ymm7\n\t"
                "vfmadd213ps %%ymm8, %%ymm9, %%ymm9\n\t"
                "vfmadd213ps %%ymm10, %%ymm11, %%ymm11\n\t"
                "vfmadd213ps 192(%0), %%ymm13, %%ymm13\n\t"
                "vfmadd213ps 224(%0), %%ymm15, %%ymm15\n\t"
                "vfmadd213ps 256(%0), %%ymm14, %%ymm14\n\t"
                "vfmadd213ps 288(%0), %%ymm12, %%ymm12\n\t"
                )
                )
                /* end loop */
                "cmpq $" FMA_STR(FMA_LOOP_TRIPS) ", %%rcx\n\t"
                "jb 1b\n\t"
                /* offload the ten accumulators */
                "vmovaps %%ymm1, (%1)\n\t"
                "vmovaps %%ymm3, 32(%1)\n\t"
                "vmovaps %%ymm5, 64(%1)\n\t"
                "vmovaps %%ymm7, 96(%1)\n\t"
                "vmovaps %%ymm9, 128(%1)\n\t"
                "vmovaps %%ymm11, 160(%1)\n\t"
                "vmovaps %%ymm13, 192(%1)\n\t"
                "vmovaps %%ymm15, 224(%1)\n\t"
                "vmovaps %%ymm14, 256(%1)\n\t"
                "vmovaps %%ymm12, 288(%1)\n\t"
                /* no outputs; inputs: %0 = constants, %1 = result buffer.
                 * The vector registers are listed by their xmm names;
                 * "memory" covers the stores to tr and the loads from ar,
                 * "cc" the flags changed by xorq/incq/cmpq. */
                : : "r" (ar), "r" (tr) :
                "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
                "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
                "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
                "rcx", "cc", "memory");

            c = get_cycles() - c0;
            t = omp_get_wtime() - t0;

            flips = gfma_per_thread / t;    /* G lane-FMAs per second */
            freq = c / t / 1e9;             /* TSC ticks per nanosecond */

            #pragma omp critical
            {
                nthreads++;                 /* threads that actually ran */
                rsum += flips;
                rmax = fmax(flips, rmax);
                rmin = fmin(flips, rmin);
                fqmax = fmax(freq, fqmax);
                fqmin = fmin(freq, fqmin);
            }

            /* Only one thread publishes its results for the final dump. */
            #pragma omp master
            {
                int k;
                for (k = 0; k < OUT_FLOATS; k++)
                    br[k] = tr[k];
            }
        }

        printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
               i, rmin, rsum / nthreads, rmax, rsum, fqmin, fqmax);
    }

    printf("Outputs:\n");
    for (i = 0; i < OUT_FLOATS; i += 4)
        printf("br: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);

    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment