// mandelbrot-real-fma-power-f64-omp.c
//by Branislav Jansik, @IT4Innovations, 2024
//Power ISA for Power7 to Power10 and beyond.
//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
// Paste the argument x ten times in a row; nesting two REPEAT10 calls
// therefore yields 100 copies of the innermost argument.
#define REPEAT10(x) x x x x x x x x x x
// Number of doubles in each of the ar/br work arrays
// (32 VSX registers x 2 doubles per 128-bit register).
#define NC 64
/* Cycle measurement not implemented for Power ISA yet (the disabled code below is the x86 rdtsc version)
static __inline__ unsigned long long int get_cycles(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long int)lo)|( ((unsigned long long int)hi)<<32 );
return 1;
}
*/
int main(int argc, char **argv) {
double *ar, *br;
double rsum, rmax, rmin;
double ainc;
double to, to0;
int i, size, niter = 1;
//inputs
if (argc>1) niter = atoi(argv[1]);
//find out threads
size=omp_get_max_threads();
//print info
printf("FLIPS: Double Precision Power ISA VSX FMA (128bit) Instructions Per Second\nRun %d times on %d cores\n", niter, size);
// allocate
posix_memalign((void *) &ar, (size_t)64, (size_t) sizeof(double)*NC);
posix_memalign((void *) &br, (size_t)64, (size_t) sizeof(double)*NC);
// initialize c constants within the mandelbrot set
ainc = 2.25/((double) NC);
for (i=0;i<NC; i++)
ar[i] = -2.0 + ainc*i;
// iterations
for (i=1; i<=niter; i++) {
rsum=0.0;
rmax=-INFINITY;
rmin=INFINITY;
//fqmax=-INFINITY;
//fqmin=INFINITY;
to0 = omp_get_wtime();
#pragma omp parallel default(shared) reduction(+:rsum) reduction(max:rmax) reduction (min:rmin)
{
double t0,t,flips;
//cycle measurement not implemented for POWER ISA yet
//unsigned long long int c0, c;
#pragma omp barrier
t0 = omp_get_wtime();
//c0 = get_cycles();
__asm__(
//load c constants into 128 bit VSX vector registers.
//addi shifts load address by 16 bytes
"lxvd2x 0, 0, %0\n\t li 7, 16\n\t"
"lxvd2x 2, 7, %0\n\t addi 7, 7, 16\n\t"
"lxvd2x 4, 7, %0\n\t addi 7, 7, 16\n\t"
"lxvd2x 6, 7, %0\n\t addi 7, 7, 16\n\t"
"lxvd2x 8, 7, %0\n\t addi 7, 7, 16\n\t"
"lxvd2x 10, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 12, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 14, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 16, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 18, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 20, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 22, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 24, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 26, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 28, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 30, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 32, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 34, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 36, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 38, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 40, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 42, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 44, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 46, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 48, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 50, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 52, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 54, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 56, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 58, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 60, 7, %0\n\taddi 7, 7, 16\n\t"
"lxvd2x 62, 7, %0\n\t"
//zero out registers
"xxlxor 1, 1, 1\n\t"
"xxlxor 3, 3, 3\n\t"
"xxlxor 5, 5, 5\n\t"
"xxlxor 7, 7, 7\n\t"
"xxlxor 9, 9, 9\n\t"
"xxlxor 11, 11, 11\n\t"
"xxlxor 13, 13, 13\n\t"
"xxlxor 15, 15, 15\n\t"
"xxlxor 17, 17, 17\n\t"
"xxlxor 19, 19, 19\n\t"
"xxlxor 21, 21, 21\n\t"
"xxlxor 23, 23, 23\n\t"
"xxlxor 25, 25, 25\n\t"
"xxlxor 27, 27, 27\n\t"
"xxlxor 29, 29, 29\n\t"
"xxlxor 31, 31, 31\n\t"
"xxlxor 33, 33, 33\n\t"
"xxlxor 35, 35, 35\n\t"
"xxlxor 37, 37, 37\n\t"
"xxlxor 39, 39, 39\n\t"
"xxlxor 41, 41, 41\n\t"
"xxlxor 43, 43, 43\n\t"
"xxlxor 45, 45, 45\n\t"
"xxlxor 47, 47, 47\n\t"
"xxlxor 49, 49, 49\n\t"
"xxlxor 51, 51, 51\n\t"
"xxlxor 53, 53, 53\n\t"
"xxlxor 55, 55, 55\n\t"
"xxlxor 57, 57, 57\n\t"
"xxlxor 59, 59, 59\n\t"
"xxlxor 61, 61, 61\n\t"
"xxlxor 63, 63, 63\n\t"
//zero out loop counter
"li 8, 0\n\t"
//set loop limit to 16000000 = 4000*4000
"li 9, 4000\n\t"
"mullw 9, 9, 9\n\t"
//loop
"loop:\n\t"
"addi 8, 8, 1\n\t"
REPEAT10(
REPEAT10(
"xvmaddmdp 1, 1, 0\n\t"
"xvmaddmdp 3, 3, 2\n\t"
"xvmaddmdp 5, 5, 4\n\t"
"xvmaddmdp 7, 7, 6\n\t"
"xvmaddmdp 9, 9, 8\n\t"
"xvmaddmdp 11, 11, 10\n\t"
"xvmaddmdp 13, 13, 12\n\t"
"xvmaddmdp 15, 15, 14\n\t"
"xvmaddmdp 17, 17, 16\n\t"
"xvmaddmdp 19, 19, 18\n\t"
"xvmaddmdp 21, 21, 20\n\t"
"xvmaddmdp 23, 23, 22\n\t"
"xvmaddmdp 25, 25, 24\n\t"
"xvmaddmdp 27, 27, 26\n\t"
"xvmaddmdp 29, 29, 28\n\t"
"xvmaddmdp 31, 31, 30\n\t"
"xvmaddmdp 33, 33, 32\n\t"
"xvmaddmdp 35, 35, 34\n\t"
"xvmaddmdp 37, 37, 36\n\t"
"xvmaddmdp 39, 39, 38\n\t"
"xvmaddmdp 41, 41, 40\n\t"
"xvmaddmdp 43, 43, 42\n\t"
"xvmaddmdp 45, 45, 44\n\t"
"xvmaddmdp 47, 47, 46\n\t"
"xvmaddmdp 49, 49, 48\n\t"
"xvmaddmdp 51, 51, 50\n\t"
"xvmaddmdp 53, 53, 52\n\t"
"xvmaddmdp 55, 55, 54\n\t"
"xvmaddmdp 57, 57, 56\n\t"
"xvmaddmdp 59, 59, 58\n\t"
"xvmaddmdp 61, 61, 60\n\t"
"xvmaddmdp 63, 63, 62\n\t"
)
)
//end loop
"cmpw 8, 9\n\t"
"blt loop\n\t"
//offload
//offload to br array from multiple threads creates race condition
//we ignore it for now.
"stxvd2x 1, 0, %1\n\t li 7, 16\n\t"
"stxvd2x 3, 7, %1\n\t addi 7, 7, 16\n\t"
"stxvd2x 5, 7, %1\n\t addi 7, 7, 16\n\t"
"stxvd2x 7, 7, %1\n\t addi 7, 7, 16\n\t"
"stxvd2x 9, 7, %1\n\t addi 7, 7, 16\n\t"
"stxvd2x 11, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 13, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 15, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 17, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 19, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 21, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 23, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 25, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 27, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 29, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 31, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 33, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 35, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 37, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 39, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 41, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 43, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 45, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 47, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 49, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 51, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 53, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 55, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 57, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 59, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 61, 7, %1\n\taddi 7, 7, 16\n\t"
"stxvd2x 63, 7, %1\n\t"
//inputs, outputs and clobbers
: : "r" (ar) , "r" (br) :
"r7","r8","r9",
"vs0","vs1","vs2","vs3","vs4","vs5","vs6","vs7",
"vs8","vs9","vs10","vs11","vs12","vs13","vs14",
"vs15","vs16","vs17","vs18","vs19","vs20","vs21",
"vs22","vs23","vs24","vs25","vs26","vs27","vs28",
"vs29","vs30","vs31","vs32","vs33","vs34","vs35",
"vs36","vs37","vs38","vs39","vs40","vs41","vs42",
"vs43","vs44","vs45","vs46","vs47","vs48","vs49",
"vs50","vs51","vs52","vs53","vs54","vs55","vs56",
"vs57","vs58","vs59","vs60","vs61","vs62","vs63",
"memory");
//c = get_cycles() - c0;
t = omp_get_wtime() - t0;
//count flips: 16000000*100*32/1e9 = 51.2
#define FC 51.2
flips = FC/t;
//freq = c/t/1e9;
rsum=flips;
rmax=flips;
rmin=flips;
//fqmax=freq;
//fqmin=freq;
}
to = omp_get_wtime() - to0;
printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG GrandSum: %6.4fG\n",
i, rmin, rsum/size, rmax, rsum, FC*size/to);
}
printf("Outputs:\n");
for(i=0; i<NC; i+=4)
printf("zr: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
return 0;
}