Skip to content
Snippets Groups Projects
Select Git revision
  • master
1 result

mandelbrot-real-sse-mpi-dump.c

Blame
  • Roman Slíva's avatar
    Roman Sliva authored
    8dc7e0df
    History
    mandelbrot-real-sse-mpi-dump.c 4.63 KiB
    //by Branislav Jansik, @IT4Innovations, 2014
    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    
    /* Expand the argument ten times back to back (compile-time unrolling);
     * nesting REPEAT10(REPEAT10(...)) therefore yields 100 copies. */
    #define REPEAT10(body) \
      body body body body body \
      body body body body body
    
    /*
     * Read the x86 time-stamp counter with the RDTSC instruction.
     *
     * RDTSC delivers the counter in two 32-bit halves (EAX = low, EDX = high);
     * they are combined here into a single 64-bit value.
     */
    static __inline__ unsigned long long int get_cycles(void)
    {
      unsigned int low_word, high_word;

      __asm__ __volatile__ ("rdtsc" : "=a"(low_word), "=d"(high_word));

      unsigned long long int tsc = high_word;
      tsc <<= 32;
      tsc |= low_word;
      return tsc;
    }
    
    
    /*
     * Benchmark driver.
     *
     * Every MPI rank repeatedly runs a fixed SSE kernel (packed single-precision
     * x = x*x + c on eight independent register pairs), times it with
     * MPI_Wtime() and the time-stamp counter, and reports the achieved
     * instruction rate and the apparent clock frequency.
     *
     * Usage: prog [niter]   niter = number of timed runs (default 1, must be >= 1)
     *
     * Rank 0 prints a per-run summary, prints the final kernel output and dumps
     * all per-rank samples as raw doubles to "flips.out" and "freqs.out"
     * (run-major: sample of rank r in run i is at index (i-1)*size + r).
     *
     * Returns 0 on success, EXIT_FAILURE on a bad argument or a failed dump.
     */
    int main(int argc, char **argv) {

      /* 32 addend constants ("c" values) spanning -2.0 .. 0.25, loaded into the
       * even-numbered xmm registers, four floats per register. */
      float ar[32] __attribute__((aligned(32))) =
      { -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
        -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
        -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
        -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
         0.03226,  0.10484,  0.17742,    0.25000 };
      /* Receives the final accumulator values written back by the kernel. */
      float br[32] __attribute__((aligned(32))) = { 0 };

      double t0, t, flips, rsum, rmax, rmin, freq, fmax, fmin;
      double *aflips = NULL, *afreq = NULL;       /* rank 0 only: all samples   */
      double *flips_row = NULL, *freq_row = NULL; /* rank 0 only: current run   */
      size_t nsamples = 0;                        /* niter * size, rank 0 only  */
      int rank, size, i, niter = 1;
      int status = 0;
      unsigned long long int c0, c;

      // Initiate MPI
      MPI_Init(&argc, &argv);
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      /* Parse the command line only after MPI_Init has had a chance to process
       * argv. A count below 1 would give a negative allocation size below. */
      if (argc > 1) niter = atoi(argv[1]);
      if (niter < 1) {
        if (rank == 0)
          fprintf(stderr, "Invalid run count '%s': expected an integer >= 1\n", argv[1]);
        MPI_Finalize();
        return EXIT_FAILURE;
      }

      if (rank == 0)
        printf("FLIPS: Single Precision SSE Instructions Per Second\nRun %d times\n", niter);

      if (rank == 0) {
        nsamples = (size_t)niter * (size_t)size;
        if (nsamples <= (size_t)-1 / sizeof *aflips) {
          aflips = malloc(nsamples * sizeof *aflips);
          afreq  = malloc(nsamples * sizeof *afreq);
        }
        if (aflips == NULL || afreq == NULL) {
          fprintf(stderr, "Cannot allocate buffers for %d x %d samples\n", niter, size);
          /* The other ranks are about to enter collectives; abort the job. */
          MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
          return EXIT_FAILURE;
        }
      }

      for (i = 1; i <= niter; i++) {

        /* Start all ranks together so the runs overlap in time. */
        MPI_Barrier(MPI_COMM_WORLD);

        t0 = MPI_Wtime();
        c0 = get_cycles();

        __asm__ __volatile__(

            //init
            "movaps (%0),   %%xmm0\n\t"
            "movaps 16(%0), %%xmm2\n\t"
            "movaps 32(%0), %%xmm4\n\t"
            "movaps 48(%0), %%xmm6\n\t"
            "movaps 64(%0), %%xmm8\n\t"
            "movaps 80(%0), %%xmm10\n\t"
            "movaps 96(%0), %%xmm12\n\t"
            "movaps 112(%0),%%xmm14\n\t"

            //zero out xmm registers
            "xorps %%xmm1, %%xmm1\n\t"
            "xorps %%xmm3, %%xmm3\n\t"
            "xorps %%xmm5, %%xmm5\n\t"
            "xorps %%xmm7, %%xmm7\n\t"
            "xorps %%xmm9, %%xmm9\n\t"
            "xorps %%xmm11,%%xmm11\n\t"
            "xorps %%xmm13,%%xmm13\n\t"
            "xorps %%xmm15,%%xmm15\n\t"

            //zero out loop counter
            "xorq %%rcx, %%rcx\n\t"

            //loop (local numeric label: safe if the asm is ever duplicated)
            "1:\n\t"
            "incq %%rcx\n\t"

    REPEAT10(
    REPEAT10(
            "mulps %%xmm1,  %%xmm1\n\t"
            "mulps %%xmm3,  %%xmm3\n\t"
            "mulps %%xmm5,  %%xmm5\n\t"
            "mulps %%xmm7,  %%xmm7\n\t"
            "mulps %%xmm9,  %%xmm9\n\t"
            "mulps %%xmm11, %%xmm11\n\t"
            "mulps %%xmm13, %%xmm13\n\t"
            "mulps %%xmm15, %%xmm15\n\t"

            "addps %%xmm0,  %%xmm1\n\t"
            "addps %%xmm2,  %%xmm3\n\t"
            "addps %%xmm4,  %%xmm5\n\t"
            "addps %%xmm6,  %%xmm7\n\t"
            "addps %%xmm8,  %%xmm9\n\t"
            "addps %%xmm10, %%xmm11\n\t"
            "addps %%xmm12, %%xmm13\n\t"
            "addps %%xmm14, %%xmm15\n\t"
    )
    )

            //end loop
            "cmpq $20000000, %%rcx\n\t"
            "jb 1b\n\t"

            //offload
            "movaps %%xmm1,     (%1)\n\t"
            "movaps %%xmm3,   16(%1)\n\t"
            "movaps %%xmm5,   32(%1)\n\t"
            "movaps %%xmm7,   48(%1)\n\t"
            "movaps %%xmm9,   64(%1)\n\t"
            "movaps %%xmm11,  80(%1)\n\t"
            "movaps %%xmm13,  96(%1)\n\t"
            "movaps %%xmm15, 112(%1)\n\t"

            //inputs, outputs and clobbers
            //(every xmm register the kernel touches must be listed, xmm11 included)
            :  : "r" (ar) , "r" (br) :
            "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
            "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
            "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
            "rcx", "cc", "memory");

        c = get_cycles() - c0;
        t = MPI_Wtime() - t0;

        /* 20,000,000 iterations x 100 unrolled groups x 16 instructions
         * = 32e9 SSE instructions per run, hence 32/t giga-instructions/s. */
        flips = 2*16.0/t;
        /* Time-stamp-counter ticks per second, in GHz. */
        freq  = c/t/1e9;

        //  Collect perf data across ranks
        MPI_Reduce(&flips, &rsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&flips, &rmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&flips, &rmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
        MPI_Reduce(&freq,  &fmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&freq,  &fmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);

        /* The receive buffer only matters on the root; other ranks pass NULL
         * rather than doing arithmetic on a pointer they never allocated. */
        if (rank == 0) {
          size_t offset = (size_t)(i - 1) * (size_t)size;
          flips_row = aflips + offset;
          freq_row  = afreq  + offset;
        }
        MPI_Gather(&flips, 1, MPI_DOUBLE, flips_row, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Gather(&freq,  1, MPI_DOUBLE, freq_row,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        if (rank == 0)
          printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
                 i, rmin, rsum/size, rmax, rsum, fmin, fmax);

      }

      // Finalize and exit
      MPI_Finalize();

      if (rank == 0) {
        /* Print the kernel results from the last run (rank 0's copy of br). */
        printf("Outputs:\n");
        for (i = 0; i < 32; i += 4)
          printf("ar: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
      }

      if (rank == 0) {
        /* Raw binary dump of all samples; "wb" keeps the bytes untranslated. */
        FILE *fp0 = fopen("flips.out", "wb");
        if (fp0 == NULL) perror("flips.out");
        FILE *fp1 = fopen("freqs.out", "wb");
        if (fp1 == NULL) perror("freqs.out");

        if (fp0 == NULL || fwrite(aflips, sizeof *aflips, nsamples, fp0) != nsamples)
          status = EXIT_FAILURE;
        if (fp1 == NULL || fwrite(afreq, sizeof *afreq, nsamples, fp1) != nsamples)
          status = EXIT_FAILURE;

        /* fclose flushes buffered data, so its result matters for writes. */
        if (fp0 != NULL && fclose(fp0) != 0) status = EXIT_FAILURE;
        if (fp1 != NULL && fclose(fp1) != 0) status = EXIT_FAILURE;

        if (status == 0)
          printf("Full data dump in flips.out and freqs.out\n");
        else
          fprintf(stderr, "Failed to write flips.out and/or freqs.out\n");

        free(aflips);
        free(afreq);
      }

      return status;
    }