// File: mandelbrot-real-sse-mpi-dump.c (4.63 KiB)
// Authored by Roman Sliva
//by Branislav Jansik, @IT4Innovations, 2014
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// Expands its argument ten times back to back. Used on adjacent string
// literals (which the compiler concatenates) to unroll the inline-assembly
// loop body; nesting it twice yields 100 copies.
#define REPEAT10(x) x x x x x x x x x x
/*
 * Read the processor time-stamp counter with the `rdtsc` instruction.
 *
 * The instruction delivers the counter in two 32-bit halves (EAX = low,
 * EDX = high); they are stitched together into one 64-bit value here.
 *
 * Returns: the current time-stamp counter value.
 */
static __inline__ unsigned long long int get_cycles(void)
{
    unsigned low_word, high_word;
    unsigned long long int stamp;

    __asm__ __volatile__ ("rdtsc" : "=a"(low_word), "=d"(high_word));

    /* Upper half first, then shift it into place and merge the lower half. */
    stamp = (unsigned long long int)high_word;
    stamp <<= 32;
    stamp |= (unsigned long long int)low_word;
    return stamp;
}
/*
 * SSE single-precision instruction-throughput benchmark, run on every MPI rank.
 *
 * Each timed run iterates the recurrence x = x*x + c on eight independent
 * 4-float SSE registers (mulps followed by addps). One run executes
 * 100 unrolled copies * 16 instructions * 20,000,000 loop trips = 32e9
 * SSE instructions, so 32/t is the rate in giga-instructions per second.
 *
 * Usage: prog [niter]   (niter = number of timed runs, default 1, must be >= 1)
 *
 * Rank 0 prints a per-run summary (min/avg/max/sum of the rate and min/max of
 * the time-stamp-counter frequency), prints the final register contents, and
 * dumps the raw per-rank samples as binary doubles into "flips.out" and
 * "freqs.out" (niter * size values each, run-major order).
 *
 * Returns 0 on success, EXIT_FAILURE on an invalid iteration count or when
 * the output files could not be written completely.
 */
int main(int argc, char **argv) {
    /* The additive constants c: four floats per even-numbered xmm register. */
    float ar[32] __attribute__((aligned(32))) =
    { -2.00000, -1.92742, -1.85484, -1.78226, -1.70968, -1.63710, -1.56452,
      -1.49194, -1.41935, -1.34677, -1.27419, -1.20161, -1.12903, -1.05645,
      -0.98387, -0.91129, -0.83871, -0.76613, -0.69355, -0.62097, -0.54839,
      -0.47581, -0.40323, -0.33065, -0.25806, -0.18548, -0.11290, -0.04032,
       0.03226,  0.10484,  0.17742,  0.25000 };
    /* Receives the final x values from the odd-numbered xmm registers. */
    float br[32] __attribute__((aligned(32))) = { 0 };
    double t0, t, flips, rsum, rmax, rmin, freq, fmax, fmin;
    /* Sample buffers exist on rank 0 only; NULL everywhere else. */
    double *aflips = NULL, *afreq = NULL;
    double *flips_dst = NULL, *freq_dst = NULL;
    int rank, size, i, niter = 1;
    int status = 0;
    size_t nsamples;
    unsigned long long int c0, c;

    if (argc > 1) niter = atoi(argv[1]);

    // Initiate MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* A non-positive count would give a meaningless allocation and dump size. */
    if (niter < 1) {
        if (rank == 0)
            fprintf(stderr, "Iteration count must be a positive integer\n");
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    /* Computed in size_t so that niter * size cannot overflow int. */
    nsamples = (size_t)niter * (size_t)size;

    if (rank == 0)
        printf("FLIPS: Single Precision SSE Instructions Per Second\nRun %d times\n", niter);

    if (rank == 0) {
        aflips = malloc(nsamples * sizeof *aflips);
        afreq  = malloc(nsamples * sizeof *afreq);
        if (aflips == NULL || afreq == NULL) {
            fprintf(stderr, "Cannot allocate sample buffers for %d runs on %d ranks\n",
                    niter, size);
            /* The other ranks are about to enter collectives; take them all down. */
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
    }

    for (i = 1; i <= niter; i++) {
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        c0 = get_cycles();

        __asm__ __volatile__(
            //init
            "movaps (%0), %%xmm0\n\t"
            "movaps 16(%0), %%xmm2\n\t"
            "movaps 32(%0), %%xmm4\n\t"
            "movaps 48(%0), %%xmm6\n\t"
            "movaps 64(%0), %%xmm8\n\t"
            "movaps 80(%0), %%xmm10\n\t"
            "movaps 96(%0), %%xmm12\n\t"
            "movaps 112(%0),%%xmm14\n\t"
            //zero out xmm registers
            "xorps %%xmm1, %%xmm1\n\t"
            "xorps %%xmm3, %%xmm3\n\t"
            "xorps %%xmm5, %%xmm5\n\t"
            "xorps %%xmm7, %%xmm7\n\t"
            "xorps %%xmm9, %%xmm9\n\t"
            "xorps %%xmm11,%%xmm11\n\t"
            "xorps %%xmm13,%%xmm13\n\t"
            "xorps %%xmm15,%%xmm15\n\t"
            //zero out loop counter
            "xorq %%rcx, %%rcx\n\t"
            //loop (local numeric label: stays unique if the compiler duplicates this asm)
            "1:\n\t"
            "incq %%rcx\n\t"
            REPEAT10(
            REPEAT10(
            "mulps %%xmm1, %%xmm1\n\t"
            "mulps %%xmm3, %%xmm3\n\t"
            "mulps %%xmm5, %%xmm5\n\t"
            "mulps %%xmm7, %%xmm7\n\t"
            "mulps %%xmm9, %%xmm9\n\t"
            "mulps %%xmm11, %%xmm11\n\t"
            "mulps %%xmm13, %%xmm13\n\t"
            "mulps %%xmm15, %%xmm15\n\t"
            "addps %%xmm0, %%xmm1\n\t"
            "addps %%xmm2, %%xmm3\n\t"
            "addps %%xmm4, %%xmm5\n\t"
            "addps %%xmm6, %%xmm7\n\t"
            "addps %%xmm8, %%xmm9\n\t"
            "addps %%xmm10, %%xmm11\n\t"
            "addps %%xmm12, %%xmm13\n\t"
            "addps %%xmm14, %%xmm15\n\t"
            )
            )
            //end loop
            "cmpq $20000000, %%rcx\n\t"
            "jb 1b\n\t"
            //offload
            "movaps %%xmm1, (%1)\n\t"
            "movaps %%xmm3, 16(%1)\n\t"
            "movaps %%xmm5, 32(%1)\n\t"
            "movaps %%xmm7, 48(%1)\n\t"
            "movaps %%xmm9, 64(%1)\n\t"
            "movaps %%xmm11, 80(%1)\n\t"
            "movaps %%xmm13, 96(%1)\n\t"
            "movaps %%xmm15, 112(%1)\n\t"
            //inputs, outputs and clobbers (every register the asm writes is listed)
            : : "r" (ar) , "r" (br) :
            "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
            "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
            "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
            "rcx", "cc", "memory");

        c = get_cycles() - c0;
        t = MPI_Wtime() - t0;

        flips = 2*16.0/t;   /* 32e9 instructions per run -> G instructions/s */
        freq = c/t/1e9;     /* time-stamp-counter ticks per second, in G */

        // Collect perf data across ranks
        MPI_Reduce(&flips, &rsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&flips, &rmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&flips, &rmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
        MPI_Reduce(&freq, &fmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&freq, &fmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);

        /* The receive buffer is only formed on the root, where the arrays exist. */
        if (rank == 0) {
            flips_dst = &aflips[(size_t)(i - 1) * size];
            freq_dst  = &afreq [(size_t)(i - 1) * size];
        }
        MPI_Gather(&flips, 1, MPI_DOUBLE, flips_dst, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Gather(&freq,  1, MPI_DOUBLE, freq_dst,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        if (rank == 0)
            printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
                   i, rmin, rsum/size, rmax, rsum, fmin, fmax);
    }

    // Finalize and exit
    MPI_Finalize();

    if (rank == 0) {
        printf("Outputs:\n");
        for (i = 0; i < 32; i += 4)
            printf("ar: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
    }

    if (rank == 0) {
        /* Raw doubles are written, so open the files in binary mode. */
        FILE *fp0 = fopen("flips.out", "wb");
        FILE *fp1 = fopen("freqs.out", "wb");
        int dumped = 1;

        if (fp0 == NULL || fwrite(aflips, sizeof *aflips, nsamples, fp0) != nsamples)
            dumped = 0;
        if (fp1 == NULL || fwrite(afreq, sizeof *afreq, nsamples, fp1) != nsamples)
            dumped = 0;
        /* fclose flushes buffered data, so its result matters for a written file. */
        if (fp0 != NULL && fclose(fp0) != 0)
            dumped = 0;
        if (fp1 != NULL && fclose(fp1) != 0)
            dumped = 0;

        if (dumped) {
            printf("Full data dump in flips.out and freqs.out\n");
        } else {
            fprintf(stderr, "Failed to write full data dump to flips.out and freqs.out\n");
            status = EXIT_FAILURE;
        }

        free(aflips);
        free(afreq);
    }

    return status;
}