Skip to content
Snippets Groups Projects
mandelbrot-real-fma-f32-omp.c 4.82 KiB
Newer Older
  • Learn to ignore specific revisions
    //by Branislav Jansik, @IT4Innovations, 2024
    #include <errno.h>
    #include <limits.h>
    #include <math.h>
    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    /* Expand the token sequence x ten times back to back. main nests two of
       these around a group of asm instruction strings, so the group appears
       100 times in the assembled loop body. */
    #define REPEAT10(x) x x x x x x x x x x
    
    /*
     * Return the current value of the x86 time-stamp counter.
     *
     * The rdtsc instruction delivers the counter in two 32-bit halves
     * (EAX = low, EDX = high); they are merged into one 64-bit value here.
     */
    static __inline__ unsigned long long int get_cycles(void)
    {
      unsigned int tsc_low, tsc_high;
      unsigned long long int ticks;

      __asm__ __volatile__ ("rdtsc" : "=a"(tsc_low), "=d"(tsc_high));

      ticks = (unsigned long long int)tsc_high;
      ticks <<= 32;
      ticks |= (unsigned long long int)tsc_low;
      return ticks;
    }
    
    
    /*
     * Single-precision FMA throughput benchmark.
     *
     * Every OpenMP thread executes the same AVX2/FMA kernel: ten independent
     * 8-float accumulators are iterated as z = z*z + c (vfmadd213ps), 100 times
     * per loop trip, for LOOP_ITERS trips. Per thread the wall time and the
     * time-stamp-counter delta are measured; min/avg/max/sum of the resulting
     * rates are printed after each repetition, followed at the end by the
     * accumulator values produced by the master thread.
     *
     * argv[1] (optional): number of repetitions, a non-negative integer
     *                     (default 1).
     * Returns 0 on success, EXIT_FAILURE if argv[1] is not a valid count.
     *
     * Requires an x86-64 CPU with AVX2 and FMA.
     */
    int main(int argc, char **argv) {

        /* Amount of work one thread performs inside the asm kernel. The rate
           printed below is derived from these, so it stays consistent with the
           loop that is actually executed. */
        enum {
            LANES        = 8,          /* floats per 256-bit ymm register          */
            FMA_PER_ITER = 10 * 100,   /* 10 instructions x REPEAT10(REPEAT10())   */
            LOOP_ITERS   = 40000000,   /* trips of the asm loop                    */
            OUT_FLOATS   = 10 * 8      /* ten accumulators are stored after a run  */
        };

        /* The constants c of the iteration. Only the first 320 bytes (80 floats)
           are read by the kernel. */
        float ar[128] __attribute__((aligned(32))) =
        { -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
          -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
          -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
          -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
           0.03226,  0.10484,  0.17742,    0.25000,
          -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
          -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
          -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
          -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
           0.03226,  0.10484,  0.17742,    0.25000,
          -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
          -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
          -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
          -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
           0.03226,  0.10484,  0.17742,    0.25000,
          -2.00000, -1.92742, -1.85484,   -1.78226, -1.70968, -1.63710, -1.56452,
          -1.49194, -1.41935, -1.34677,   -1.27419, -1.20161, -1.12903, -1.05645,
          -0.98387, -0.91129, -0.83871,   -0.76613, -0.69355, -0.62097, -0.54839,
          -0.47581, -0.40323, -0.33065,   -0.25806, -0.18548, -0.11290, -0.04032,
           0.03226,  0.10484,  0.17742,    0.25000
        };

        /* Results published by the master thread. Zero-initialised so the final
           dump is well defined even when no repetition is run. */
        float br[128] __attribute__((aligned(32))) = {0};

        double rsum, rmax, rmin, fqmax, fqmin;
        int i, size, nthreads, niter = 1;

        if (argc > 1) {
            char *end = NULL;
            long parsed;

            errno = 0;
            parsed = strtol(argv[1], &end, 10);
            if (errno != 0 || end == argv[1] || *end != '\0' ||
                parsed < 0 || parsed > INT_MAX) {
                fprintf(stderr, "usage: %s [niter]  (niter: non-negative integer)\n", argv[0]);
                return EXIT_FAILURE;
            }
            niter = (int)parsed;
        }

        size = omp_get_max_threads();

        printf("FLIPS: Single Precision (f32) FMA Instructions Per Second\nRun %d times on %d cores\n", niter, size);

        for (i = 1; i <= niter; i++) {

            rsum = 0.0;
            rmax = -INFINITY;
            rmin = INFINITY;
            fqmax = -INFINITY;
            fqmin = INFINITY;
            nthreads = 0;   /* threads that actually reported a result */

            #pragma omp parallel default(shared)
            {
                double t0, t, freq, flips;
                unsigned long long int c0, c;
                int k;

                /* Per-thread output buffer: the kernel's final stores must not
                   land in memory shared between threads. */
                float tr[OUT_FLOATS] __attribute__((aligned(32)));

                /* Line the threads up so they all start measuring together. */
                #pragma omp barrier

                t0 = omp_get_wtime();
                c0 = get_cycles();

                __asm__ __volatile__(
                    /* init: clear every ymm register (accumulators start at 0),
                       then preload six of the ten constant vectors; the other
                       four are used as memory operands inside the loop */
                    "vzeroall\n\t"

                    "vmovaps (%0),    %%ymm0\n\t"
                    "vmovaps 32(%0),  %%ymm2\n\t"
                    "vmovaps 64(%0),  %%ymm4\n\t"
                    "vmovaps 96(%0),  %%ymm6\n\t"
                    "vmovaps 128(%0), %%ymm8\n\t"
                    "vmovaps 160(%0), %%ymm10\n\t"

                    "xorq %%rcx, %%rcx\n\t"            /* zero out loop counter */

                    /* loop: a local numeric label, so the statement can be
                       duplicated by the compiler without symbol clashes */
                    "1:\n\t"
                    "incq %%rcx\n\t"

    REPEAT10(
    REPEAT10(
                    "vfmadd213ps %%ymm0, %%ymm1, %%ymm1\n\t"
                    "vfmadd213ps %%ymm2, %%ymm3, %%ymm3\n\t"
                    "vfmadd213ps %%ymm4, %%ymm5, %%ymm5\n\t"
                    "vfmadd213ps %%ymm6, %%ymm7, %%ymm7\n\t"
                    "vfmadd213ps %%ymm8, %%ymm9, %%ymm9\n\t"
                    "vfmadd213ps %%ymm10, %%ymm11, %%ymm11\n\t"
                    "vfmadd213ps 192(%0), %%ymm13, %%ymm13\n\t"
                    "vfmadd213ps 224(%0), %%ymm15, %%ymm15\n\t"
                    "vfmadd213ps 256(%0), %%ymm14, %%ymm14\n\t"
                    "vfmadd213ps 288(%0), %%ymm12, %%ymm12\n\t"
    )
    )

                    /* end loop */
                    "cmpq %2, %%rcx\n\t"
                    "jb 1b\n\t"

                    /* offload the ten accumulators */
                    "vmovaps %%ymm1,     (%1)\n\t"
                    "vmovaps %%ymm3,   32(%1)\n\t"
                    "vmovaps %%ymm5,   64(%1)\n\t"
                    "vmovaps %%ymm7,   96(%1)\n\t"
                    "vmovaps %%ymm9,  128(%1)\n\t"
                    "vmovaps %%ymm11, 160(%1)\n\t"
                    "vmovaps %%ymm13, 192(%1)\n\t"
                    "vmovaps %%ymm15, 224(%1)\n\t"
                    "vmovaps %%ymm14, 256(%1)\n\t"
                    "vmovaps %%ymm12, 288(%1)\n\t"

                    /* no outputs; inputs; clobbers.
                       The vector registers are listed under their xmm names
                       (NOTE(review): presumably because some compilers do not
                       accept ymm names in a clobber list - confirm). */
                    :
                    : "r" (ar), "r" (tr), "i" (LOOP_ITERS)
                    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
                      "xmm5", "xmm6", "xmm7", "xmm8", "xmm9",
                      "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
                      "rcx", "cc", "memory");

                c = get_cycles() - c0;
                t = omp_get_wtime() - t0;

                /* Giga lane-FMAs per second achieved by this thread. */
                flips = (double)LANES * (double)FMA_PER_ITER * ((double)LOOP_ITERS / 1e9) / t;
                /* Time-stamp-counter ticks per nanosecond, i.e. GHz. */
                freq  = c/t/1e9;

                #pragma omp critical
                {
                    nthreads++;
                    rsum += flips;
                    rmax = fmax(flips, rmax);
                    rmin = fmin(flips, rmin);
                    fqmax = fmax(freq, fqmax);
                    fqmin = fmin(freq, fqmin);
                }

                /* Exactly one thread publishes its accumulators; the implicit
                   barrier closing the parallel region orders this before the
                   reads of br below. */
                #pragma omp master
                for (k = 0; k < OUT_FLOATS; k++) {
                    br[k] = tr[k];
                }
            }

            /* Average over the threads that really ran, which may be fewer than
               omp_get_max_threads(). */
            printf("%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG\nFreq: Min: %6.4fG, Max: %6.4fG\n",
                   i, rmin, rsum/nthreads, rmax, rsum, fqmin, fqmax);
        }

        printf("Outputs:\n");
        for (i = 0; i < OUT_FLOATS; i += 4) {
            printf("br: %d: %f %f %f %f\n", i, br[i+0], br[i+1], br[i+2], br[i+3]);
        }

        return 0;
    }