Skip to content
Snippets Groups Projects
lorenz-blas.c 4.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • //by Branislav Jansik, @IT4Innovations, 2021
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    
    #include <omp.h>
    
    //blas declarations
    double dnrm2_(int *n, double *x, int *ix);
    void dscal_(int *n, double *alpha, double *x, int *ix);
    void daxpy_(int *n, double *alpha, double *x, int *ix, double *y, int *iy);
    
    void dger_(int *m, int *n, double *alpha, double *x, int *ix, double *y, int *iy, double *a, int *lda);
    
    void dgemv_(char *, int *m, int *n, double *alpha, double *a, int *lda, \
    
    	    double *x, int *ix, double *beta, double *y, int *iy);
    
    void dgemm_(char *, char *, int *m, int *n, int *k, double *alpha, double *a, int *lda, \
    
                double *b, int *ldb, double *beta, double *c, int *ldc); 
    
    //pcg random number generator
    typedef struct { uint64_t state;  uint64_t inc; } pcg32_random_t;
    uint32_t pcg32_random_r(pcg32_random_t* rng)
    {
        uint64_t oldstate = rng->state;
        // Advance internal state
        rng->state = oldstate * 6364136223846793005ULL + (rng->inc|1);
        // Calculate output function (XSH RR), uses old state for max ILP
        uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
        uint32_t rot = oldstate >> 59u;
        return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
    }
    
    int main(int argc, char *argv[]) {
    
    //declarations
    
    double *x, *y, *z, *dx, *dy, *dz, *e, *xe;
    
    double p[4] __attribute__((aligned(64))) = {10.0,28.0,8.0/3.0,1e-4};
    double pm[4] __attribute__((aligned(64))) = {-10.0,-28.0,-8.0/3.0,-1e-4};
    double t0, t, f;
    
    
    int ONEI=1, ZEROI=0;
    double ONED=1.0, ONEDm=-1.0, ZEROD=0.0;
    
    
    register int i, j;
    int NMAT=500, NITER=10, NMAT2;
    pcg32_random_t rng;
    
    //read inputs
      if (argc==1) printf("Usage: %s [NMAT] [NITER]\n",argv[0]);
    
      if (argc>1) NMAT    = atoi(argv[1]);
      NMAT2   = NMAT*NMAT;
    
      if (argc>2) NITER = atoi(argv[2]);
    
    //print inputs
    
      printf("Running %d iterations\nMatrix size: %d\nMemory: %.1f MiB\n", NITER, NMAT, (sizeof(double)*(double)(6*NMAT2+2*NMAT)/1024/1024));
    
    
    //allocate memory
    posix_memalign((void *) &x,  (size_t)64, (size_t) sizeof(double)*NMAT2 );
    posix_memalign((void *) &y,  (size_t)64, (size_t) sizeof(double)*NMAT2 );
    posix_memalign((void *) &z,  (size_t)64, (size_t) sizeof(double)*NMAT2 );
    posix_memalign((void *) &dx, (size_t)64, (size_t) sizeof(double)*NMAT2 );
    posix_memalign((void *) &dy, (size_t)64, (size_t) sizeof(double)*NMAT2 );
    posix_memalign((void *) &dz, (size_t)64, (size_t) sizeof(double)*NMAT2 );
    
    posix_memalign((void *) &e,  (size_t)64, (size_t) sizeof(double)*NMAT  );
    posix_memalign((void *) &xe, (size_t)64, (size_t) sizeof(double)*NMAT  );
    
    
    //initialize matrices
    
    
    #pragma omp parallel for default(shared) private(i,rng) schedule(static,8)
    
    for (i=0; i<NMAT2; i++) {
      x[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
      y[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
      z[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
    }
    
    
    #pragma omp parallel for default(shared) private(i) schedule(static,8)
    
    for (i=0; i<NMAT; i++) e[i]=1.0;
    
    
    //run lorenz iterations
    
    t0 = omp_get_wtime();
    
    for (i=0; i<NITER; i++) {
    
      //dx[j] = p[0]*(y[j]-x[j]); 
      dscal_(&NMAT2,&ZEROD,dx,&ONEI);
      daxpy_(&NMAT2,&pm[0],x,&ONEI,dx,&ONEI);
      daxpy_(&NMAT2,&p[0],y,&ONEI,dx,&ONEI);
      
      //dy[j] = x[j]*(p[1]-z[j]) - y[j];
    
      dgemm_(N,N,&NMAT,&NMAT,&NMAT,&ONEDm,x,&NMAT,z,&NMAT,&ZEROD,dy,&NMAT);
      dgemv_(N,&NMAT,&NMAT,&p[1],x,&NMAT,e,&ONEI,&ZEROD,xe,&ONEI);
    
      dger_(&NMAT,&NMAT,&ONED,xe,&ONEI,e,&ONEI,dy,&NMAT);
    
      daxpy_(&NMAT2,&ONEDm,y,&ONEI,dy,&ONEI);
    
      //dz[j] = x[j]*y[j]-p[2]*z[j];
    
      dgemm_(N,N,&NMAT,&NMAT,&NMAT,&ONED,x,&NMAT,y,&NMAT,&ZEROD,dz,&NMAT);
    
      daxpy_(&NMAT2,&pm[2],z,&ONEI,dz,&ONEI);
    
    
      //adaptive integration step size
      f = p[3]*2.0*NMAT/fmax(fmax( \
      dnrm2_(&NMAT2,dx,&ONEI) ,    \
      dnrm2_(&NMAT2,dy,&ONEI)),    \
      dnrm2_(&NMAT2,dz,&ONEI));
    
    
      //x[j] += p[3]*dx[j];
    
      daxpy_(&NMAT2,&f,dx,&ONEI,x,&ONEI);
    
      //y[j] += p[3]*dy[j];
    
      daxpy_(&NMAT2,&f,dy,&ONEI,y,&ONEI);
    
      //z[j] += p[3]*dz[j];
    
      daxpy_(&NMAT2,&f,dz,&ONEI,z,&ONEI);
    
    }
    t = omp_get_wtime();
    
    //print outputs
    
    printf("\nPerformance: %.1f GFlop/s\n\n",NITER*((double)2*2*NMAT2*NMAT + (double)12*2*NMAT2 + (double)NMAT2)/1e9/(t-t0));
    
    printf("Norm x: %f\n",dnrm2_(&NMAT2,x,&ONEI)/NMAT);
    printf("Norm y: %f\n",dnrm2_(&NMAT2,y,&ONEI)/NMAT);
    printf("Norm z: %f\n",dnrm2_(&NMAT2,z,&ONEI)/NMAT);
    
    //release memory
    free(x);
    free(y);
    free(z);
    free(dx);
    free(dy);
    free(dz);
    
    free(e);
    free(xe);