
PAPI

Introduction

Performance Application Programming Interface (PAPI) is a portable interface for accessing hardware performance counters (such as instruction counts and cache misses) found in most modern architectures. With the new component framework, PAPI is not limited to CPU counters; it also offers components for CUDA, network, InfiniBand, etc.

PAPI provides two levels of interface: a simpler high level interface and a more detailed low level interface.

PAPI can be used with parallel as well as serial programs.

Usage

To use PAPI, load module papi:

    $ module load papi

This will load the default version. Execute module avail papi for a list of installed versions.

Utilities

The bin directory of PAPI (which is automatically added to $PATH when the module is loaded) contains various utilities.

Papi_avail

Prints which preset events are available on the current CPU. The third column indicates whether the preset event is available on the current CPU.

    $ papi_avail
    Available events and hardware information.
    --------------------------------------------------------------------------------
    PAPI Version : 5.3.2.0
    Vendor string and code : GenuineIntel (1)
    Model string and code : Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (45)
    CPU Revision : 7.000000
    CPUID Info : Family: 6 Model: 45 Stepping: 7
    CPU Max Megahertz : 2601
    CPU Min Megahertz : 1200
    Hdw Threads per core : 1
    Cores per Socket : 8
    Sockets : 2
    NUMA Nodes : 2
    CPUs per Node : 8
    Total CPUs : 16
    Running in a VM : no
    Number Hardware Counters : 11
    Max Multiplex Counters : 32
    --------------------------------------------------------------------------------
    Name         Code        Avail  Deriv  Description (Note)
    PAPI_L1_DCM  0x80000000  Yes    No     Level 1 data cache misses
    PAPI_L1_ICM  0x80000001  Yes    No     Level 1 instruction cache misses
    PAPI_L2_DCM  0x80000002  Yes    Yes    Level 2 data cache misses
    PAPI_L2_ICM  0x80000003  Yes    No     Level 2 instruction cache misses
    PAPI_L3_DCM  0x80000004  No     No     Level 3 data cache misses
    PAPI_L3_ICM  0x80000005  No     No     Level 3 instruction cache misses
    PAPI_L1_TCM  0x80000006  Yes    Yes    Level 1 cache misses
    PAPI_L2_TCM  0x80000007  Yes    No     Level 2 cache misses
    PAPI_L3_TCM  0x80000008  Yes    No     Level 3 cache misses
    ....

Papi_native_avail

Prints which native events are available on the current CPU.

Papi_cost

Measures the cost (in cycles) of basic PAPI operations.

Papi_mem_info

Prints information about the memory architecture of the current CPU.

PAPI API

PAPI provides two kinds of events:

  • Preset events are a set of predefined common CPU events, standardized across platforms.
  • Native events are all events supported by the current hardware. This is a larger set than the preset events. For components other than the CPU, usually only native events are available. A short sketch after this list shows how both kinds of event names can be used.
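
The following sketch is only an illustration (the native event name used here is an example and may not exist on every CPU; use papi_native_avail to list valid names). It translates a preset and a native event name to PAPI event codes and checks whether they are available:

    #include <stdio.h>
    #include "papi.h"

    int main(void) {
        int code;

        /* PAPI must be initialized before event names can be resolved */
        if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
            return 1;

        /* A preset event: same name on every supported platform */
        if (PAPI_event_name_to_code("PAPI_TOT_INS", &code) == PAPI_OK &&
            PAPI_query_event(code) == PAPI_OK)
            printf("Preset PAPI_TOT_INS is available (code 0x%x)\n", code);

        /* A native event: the name depends on the hardware/component.
           "PERF_COUNT_HW_CPU_CYCLES" is only an illustrative example. */
        if (PAPI_event_name_to_code("PERF_COUNT_HW_CPU_CYCLES", &code) == PAPI_OK &&
            PAPI_query_event(code) == PAPI_OK)
            printf("Native event is available (code 0x%x)\n", code);

        PAPI_shutdown();
        return 0;
    }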

To use PAPI in your application, you need to include the appropriate header file.

  • papi.h for C
  • f77papi.h for Fortran 77
  • f90papi.h for Fortran 90
  • fpapi.h for Fortran with preprocessor

The include path is automatically added to $INCLUDE by the papi module.

High Level API

Please refer to http://icl.cs.utk.edu/projects/papi/wiki/PAPIC:High_Level for a description of the High level API.
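
As a minimal sketch of the high level API (as shipped with the PAPI 5.x versions installed here), the following example counts total cycles and instructions around a loop; the chosen preset events are examples only:

    #include <stdio.h>
    #include "papi.h"

    int main(void) {
        int events[2] = { PAPI_TOT_CYC, PAPI_TOT_INS };
        long long values[2];
        volatile double x = 0.0;
        int i;

        /* Start counting the two preset events */
        if (PAPI_start_counters(events, 2) != PAPI_OK)
            return 1;

        /* Code to be measured */
        for (i = 0; i < 1000000; i++)
            x += i * 0.5;

        /* Stop counting and read the counter values */
        if (PAPI_stop_counters(values, 2) != PAPI_OK)
            return 1;

        printf("Cycles: %lld, Instructions: %lld\n", values[0], values[1]);
        return 0;
    }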

Low Level API

Please refer to http://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Low_Level for a description of the Low level API.
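
A minimal sketch of the same measurement using the low level API and an event set (again, the preset events are examples only and error checking is abbreviated):

    #include <stdio.h>
    #include "papi.h"

    int main(void) {
        int EventSet = PAPI_NULL;
        long long values[2];
        volatile double x = 0.0;
        int i;

        /* Initialize the library and create an event set */
        if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
            return 1;
        if (PAPI_create_eventset(&EventSet) != PAPI_OK)
            return 1;

        /* Add two preset events to the event set */
        PAPI_add_event(EventSet, PAPI_L1_DCM);
        PAPI_add_event(EventSet, PAPI_TOT_INS);

        /* Start counting, run the measured code, stop and read the counters */
        PAPI_start(EventSet);
        for (i = 0; i < 1000000; i++)
            x += i * 0.5;
        PAPI_stop(EventSet, values);

        printf("L1 data cache misses: %lld, Instructions: %lld\n",
               values[0], values[1]);

        PAPI_cleanup_eventset(EventSet);
        PAPI_destroy_eventset(&EventSet);
        PAPI_shutdown();
        return 0;
    }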

Timers

PAPI provides the most accurate timers the platform can support. See http://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Timers
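
For instance, the real-time timers can be used as follows (a small sketch):

    #include <stdio.h>
    #include "papi.h"

    int main(void) {
        long long start_usec, end_usec, start_cyc, end_cyc;
        volatile double x = 0.0;
        int i;

        if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
            return 1;

        /* Wall-clock time in microseconds and in real cycles */
        start_usec = PAPI_get_real_usec();
        start_cyc  = PAPI_get_real_cyc();

        for (i = 0; i < 1000000; i++)
            x += i * 0.5;

        end_usec = PAPI_get_real_usec();
        end_cyc  = PAPI_get_real_cyc();

        printf("Elapsed: %lld us, %lld cycles\n",
               end_usec - start_usec, end_cyc - start_cyc);
        return 0;
    }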

System Information

PAPI can be used to query some system information, such as the CPU name and frequency in MHz. See http://icl.cs.utk.edu/projects/papi/wiki/PAPIC:System_Information
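
A small sketch using PAPI_get_hardware_info() (the structure fields used here are those found in the PAPI 5.x headers):

    #include <stdio.h>
    #include "papi.h"

    int main(void) {
        const PAPI_hw_info_t *hw;

        if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
            return 1;

        /* Query the static hardware information detected by PAPI */
        hw = PAPI_get_hardware_info();
        if (hw == NULL)
            return 1;

        printf("CPU: %s %s\n", hw->vendor_string, hw->model_string);
        printf("Total CPUs: %d\n", hw->totalcpus);
        return 0;
    }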

Example

The following example prints the MFLOPS rate of a naive matrix-matrix multiplication:

    #include <stdlib.h>
    #include <stdio.h>
    #include "papi.h"
    #define SIZE 1000

    int main(int argc, char **argv) {
        float matrixa[SIZE][SIZE], matrixb[SIZE][SIZE], mresult[SIZE][SIZE];
        float real_time, proc_time, mflops;
        long long flpins;
        int retval;
        int i, j, k;

        /* Initialize the Matrix arrays */
        for (i = 0; i < SIZE*SIZE; i++) {
            mresult[0][i] = 0.0;
            matrixa[0][i] = matrixb[0][i] = rand()*(float)1.1;
        }

        /* Setup PAPI library and begin collecting data from the counters */
        if ((retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
            printf("Error!");

        /* A naive Matrix-Matrix multiplication */
        for (i = 0; i < SIZE; i++)
            for (j = 0; j < SIZE; j++)
                for (k = 0; k < SIZE; k++)
                    mresult[i][j] = mresult[i][j] + matrixa[i][k]*matrixb[k][j];

        /* Collect the data into the variables passed in */
        if ((retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
            printf("Error!");

        printf("Real_time:\t%f\nProc_time:\t%f\nTotal flpins:\t%lld\nMFLOPS:\t\t%f\n",
               real_time, proc_time, flpins, mflops);
        PAPI_shutdown();
        return 0;
    }

Now compile and run the example:

    $ gcc matrix.c -o matrix -lpapi
    $ ./matrix
    Real_time: 8.852785
    Proc_time: 8.850000
    Total flpins: 6012390908
    MFLOPS: 679.366211

Let's try with optimizations enabled:

    $ gcc -O3 matrix.c -o matrix -lpapi
    $ ./matrix
    Real_time: 0.000020
    Proc_time: 0.000000
    Total flpins: 6
    MFLOPS: inf

Now we see a seemingly strange result: the multiplication took virtually no time and only 6 floating point instructions were counted. This is because the compiler optimizations have completely removed the multiplication loop, as the result is not actually used anywhere in the program. We can fix this by adding some "dummy" code at the end of the matrix-matrix multiplication routine:

    for (i = 0; i < SIZE; i++)
        for (j = 0; j < SIZE; j++)
            if (mresult[i][j] == -1.0) printf("x");

Now the compiler won't remove the multiplication loop. (However, it is still not smart enough to see that the result will never be negative.) Run the code again:

    $ gcc -O3 matrix.c -o matrix -lpapi
    $ ./matrix
    Real_time: 8.795956
    Proc_time: 8.790000
    Total flpins: 18700983160
    MFLOPS: 2127.529297

Intel Xeon Phi

!!! note
    PAPI currently supports only a subset of counters on the Intel Xeon Phi processor compared to Intel Xeon; for example, the floating point operations counter is missing.

To use PAPI in Intel Xeon Phi native applications, you need to load the module with the "-mic" suffix, for example "papi/5.3.2-mic":

    $ module load papi/5.3.2-mic

Then, compile your application in the following way:

    $ module load intel
    $ icc -mmic -Wl,-rpath,/apps/intel/composer_xe_2013.5.192/compiler/lib/mic matrix-mic.c -o matrix-mic -lpapi -lpfm

To execute the application on MIC, you need to manually set LD_LIBRARY_PATH:

    $ qsub -q qmic -A NONE-0-0 -I
    $ ssh mic0
    $ export LD_LIBRARY_PATH=/apps/tools/papi/5.4.0-mic/lib/
    $ ./matrix-mic

Alternatively, you can link PAPI statically (with the -static flag), in which case LD_LIBRARY_PATH does not need to be set.
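
For example, the earlier compile line could become the following (a sketch only, assuming static libpapi and libpfm archives are provided with the MIC build of PAPI):

    $ icc -mmic -static matrix-mic.c -o matrix-mic -lpapi -lpfm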

You can also execute the PAPI tools on MIC:

    $ /apps/tools/papi/5.4.0-mic/bin/papi_native_avail

To use PAPI in offload mode, you need to provide both host and MIC versions of PAPI:

    $ module load papi/5.4.0
    $ icc matrix-offload.c -o matrix-offload -offload-option,mic,compiler,"-L$PAPI_HOME-mic/lib -lpapi" -lpapi

References

  1. Main project page: http://icl.cs.utk.edu/papi/
  2. Wiki: http://icl.cs.utk.edu/projects/papi/wiki/Main_Page
  3. API documentation: http://icl.cs.utk.edu/papi/docs/