// accumulator-ger-power-f64-omp.c

//by Branislav Jansik, @IT4Innovations, 2024
//Power ISA for Power10 and beyond. 
//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>

#define REPEAT10(x) x x x x x x x x x x

#define NC 64

/* Cycle measurement not implemented for POWER yet
static __inline__ unsigned long long int get_cycles(void)
{
  //Read register PMC6
  unsigned long long int x;
  __asm__ __volatile__ ("mfspr 6, 776\n\t stw 6, 0(%0)\n\t" : : "r"(&x) : "r6", "memory");
  return ( x );
}
*/


/*
 * FLIPS micro-benchmark: measures the issue rate of double-precision
 * Power ISA GER (outer-product accumulate) instructions on every OpenMP
 * thread.
 *
 * Usage: prog [niter]
 *   niter - number of timed repetitions (integer >= 1, default 1).
 *
 * For each repetition every thread runs the same inline-assembly kernel
 * and reports its own rate; min / average / max / sum over the threads
 * are printed, together with a "GrandSum" rate derived from the wall
 * time of the whole parallel region.  After the last repetition the
 * accumulator contents stored by thread 0 are printed; the kernel issues
 * equally many xvf64gerpp and xvf64gernp operations per accumulator, so
 * the values are expected to be 0.0.
 *
 * Returns 0 on success, EXIT_FAILURE on a bad argument or allocation
 * failure.
 */
int main(int argc, char **argv) {

    double *ar = NULL, *br = NULL;
    void *mem = NULL;
    double rsum, rmax, rmin;
    double ainc;
    double to, to0;
    int i, size, niter = 1;

    //inputs: accept only a complete, positive decimal integer that fits in int
    if (argc > 1) {
        char *end;
        long v = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || v < 1 || v != (long)(int)v) {
            fprintf(stderr, "usage: %s [niter >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
        niter = (int)v;
    }

    //find out threads
    size = omp_get_max_threads();

    //print info
    printf("FLIPS: Double Precision Power ISA VSX GER (128bit) Instructions Per Second\nRun %d times on %d cores\n", niter, size);

    // allocate: one input vector of NC doubles shared (read-only) by all
    // threads, and one NC-double output slot per thread so that the final
    // stores of different threads never touch the same memory
    if (posix_memalign(&mem, (size_t)64, sizeof(double) * NC) != 0) {
        fprintf(stderr, "posix_memalign failed for input buffer\n");
        return EXIT_FAILURE;
    }
    ar = mem;
    if (posix_memalign(&mem, (size_t)64, sizeof(double) * NC * (size_t)size) != 0) {
        fprintf(stderr, "posix_memalign failed for output buffer\n");
        free(ar);
        return EXIT_FAILURE;
    }
    br = mem;

    // initialize the input operands: NC evenly spaced values in [0, 2.25)
    ainc = 2.25 / ((double)NC);
    for (i = 0; i < NC; i++)
        ar[i] = i * ainc;

    // iterations
    for (i = 1; i <= niter; i++) {

        rsum = 0.0;
        rmax = -INFINITY;
        rmin = INFINITY;

        to0 = omp_get_wtime();

        // num_threads(size) bounds the team size by the number of output
        // slots allocated above.
#pragma omp parallel default(shared) num_threads(size) reduction(+:rsum) reduction(max:rmax) reduction(min:rmin)
        {
            double t0, t, flips;
            // this thread's private NC-double slice of the output buffer
            double *br_mine = br + (size_t)omp_get_thread_num() * NC;
            //cycle measurement not implemented for POWER ISA yet

            // start all threads' timed sections together
#pragma omp barrier

            t0 = omp_get_wtime();

            __asm__(
                //load input operands into 128 bit VSX vector registers vs32..vs55.
                //r7 holds the byte offset; addi advances it by 16 bytes per load
                "lxvd2x 32, 0, %0\n\t li  7, 16\n\t"
                "lxvd2x 33, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 34, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 35, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 36, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 37, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 38, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 39, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 40, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 41, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 42, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 43, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 44, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 45, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 46, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 47, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 48, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 49, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 50, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 51, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 52, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 53, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 54, 7, %0\n\taddi 7, 7, 16\n\t"
                "lxvd2x 55, 7, %0\n\t"

                //zero out accumulators
                "xxsetaccz 0\n\t"
                "xxsetaccz 1\n\t"
                "xxsetaccz 2\n\t"
                "xxsetaccz 3\n\t"
                "xxsetaccz 4\n\t"
                "xxsetaccz 5\n\t"
                "xxsetaccz 6\n\t"
                "xxsetaccz 7\n\t"

                //zero out loop counter
                "li 8, 0\n\t"

                //set loop limit to 8000000 = 4000*4000/2
                "li 9, 4000\n\t"
                "mullw 9, 9, 9\n\t"
                "srawi 9, 9, 1\n\t"

                //loop; a numeric local label is used so the asm stays valid
                //even if the compiler emits this body more than once
                "1:\n\t"
                "addi 8, 8, 1\n\t"
                //each pass issues 10 * (80 + 160 + 80) = 3200 GER instructions;
                //per accumulator that is 200 gerpp and 200 gernp, so the
                //accumulated products cancel
REPEAT10(
REPEAT10(
                "xvf64gerpp 0, 32, 48\n\t"
                "xvf64gerpp 1, 34, 49\n\t"
                "xvf64gerpp 2, 36, 50\n\t"
                "xvf64gerpp 3, 38, 51\n\t"
                "xvf64gerpp 4, 40, 52\n\t"
                "xvf64gerpp 5, 42, 53\n\t"
                "xvf64gerpp 6, 44, 54\n\t"
                "xvf64gerpp 7, 46, 55\n\t"
)
REPEAT10(
                "xvf64gernp 0, 32, 48\n\t"
                "xvf64gernp 1, 34, 49\n\t"
                "xvf64gernp 2, 36, 50\n\t"
                "xvf64gernp 3, 38, 51\n\t"
                "xvf64gernp 4, 40, 52\n\t"
                "xvf64gernp 5, 42, 53\n\t"
                "xvf64gernp 6, 44, 54\n\t"
                "xvf64gernp 7, 46, 55\n\t"

                "xvf64gernp 0, 32, 48\n\t"
                "xvf64gernp 1, 34, 49\n\t"
                "xvf64gernp 2, 36, 50\n\t"
                "xvf64gernp 3, 38, 51\n\t"
                "xvf64gernp 4, 40, 52\n\t"
                "xvf64gernp 5, 42, 53\n\t"
                "xvf64gernp 6, 44, 54\n\t"
                "xvf64gernp 7, 46, 55\n\t"
)
REPEAT10(
                "xvf64gerpp 0, 32, 48\n\t"
                "xvf64gerpp 1, 34, 49\n\t"
                "xvf64gerpp 2, 36, 50\n\t"
                "xvf64gerpp 3, 38, 51\n\t"
                "xvf64gerpp 4, 40, 52\n\t"
                "xvf64gerpp 5, 42, 53\n\t"
                "xvf64gerpp 6, 44, 54\n\t"
                "xvf64gerpp 7, 46, 55\n\t"
))
                //end loop
                "cmpw 8, 9\n\t"
                "blt 1b\n\t"

                //offload: move the accumulators back into vs0..vs31 and
                //store them to this thread's own output slice
                "xxmfacc 0\n\t"
                "xxmfacc 1\n\t"
                "xxmfacc 2\n\t"
                "xxmfacc 3\n\t"
                "xxmfacc 4\n\t"
                "xxmfacc 5\n\t"
                "xxmfacc 6\n\t"
                "xxmfacc 7\n\t"

                "stxvd2x  0, 0, %1\n\t li 7, 16\n\t"
                "stxvd2x  1, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  2, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  3, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  4, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  5, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  6, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  7, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  8, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x  9, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 10, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 11, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 12, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 13, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 14, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 15, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 16, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 17, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 18, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 19, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 20, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 21, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 22, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 23, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 24, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 25, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 26, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 27, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 28, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 29, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 30, 7, %1\n\taddi 7, 7, 16\n\t"
                "stxvd2x 31, 7, %1\n\t"

                //inputs, outputs and clobbers
                //cr0 is written by cmpw and xer (carry bit) by srawi
                :  : "r" (ar) , "r" (br_mine) :
                "r7","r8","r9",
                "cr0","xer",
                "vs0","vs1","vs2","vs3","vs4","vs5","vs6","vs7",
                "vs8","vs9","vs10","vs11","vs12","vs13","vs14",
                "vs15","vs16","vs17","vs18","vs19","vs20","vs21",
                "vs22","vs23","vs24","vs25","vs26","vs27","vs28",
                "vs29","vs30","vs31","vs32","vs33","vs34","vs35",
                "vs36","vs37","vs38","vs39","vs40","vs41","vs42",
                "vs43","vs44","vs45","vs46","vs47","vs48","vs49",
                "vs50","vs51","vs52","vs53","vs54","vs55",
                "memory");

            t = omp_get_wtime() - t0;

//count flips: 8000000 loop passes * 3200 GER instructions / 1e9 = 25.6
#define FC 25.6
            flips = FC / t;

            // each thread contributes its own rate to the three reductions
            rsum = flips;
            rmax = flips;
            rmin = flips;
        }

        to = omp_get_wtime() - to0;

        printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG GrandSum: %6.4fG\n",
               i, rmin, rsum/size, rmax, rsum, FC*size/to);

    }

    // show the accumulator contents stored by thread 0 in the last repetition
    printf("Outputs: (should be all 0.0)\n");
    for (i = 0; i < NC; i += 4)
        printf("zr: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);

    free(br);
    free(ar);

    return 0;
}