Newer
Older
//by Branislav Jansik, @IT4Innovations, 2021
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <omp.h>
//blas declarations
double dnrm2_(int *n, double *x, int *ix);
void dscal_(int *n, double *alpha, double *x, int *ix);
void daxpy_(int *n, double *alpha, double *x, int *ix, double *y, int *iy);
void dger_(int *m, int *n, double *alpha, double *x, int *ix, double *y, int *iy, double *a, int *lda);
void dgemv_(char *, int *m, int *n, double *alpha, double *a, int *lda, \
double *x, int *ix, double *beta, double *y, int *iy);
void dgemm_(char *, char *, int *m, int *n, int *k, double *alpha, double *a, int *lda, \
double *b, int *ldb, double *beta, double *c, int *ldc);
//pcg random number generator
typedef struct { uint64_t state; uint64_t inc; } pcg32_random_t;
uint32_t pcg32_random_r(pcg32_random_t* rng)
{
uint64_t oldstate = rng->state;
// Advance internal state
rng->state = oldstate * 6364136223846793005ULL + (rng->inc|1);
// Calculate output function (XSH RR), uses old state for max ILP
uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
uint32_t rot = oldstate >> 59u;
return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}
int main(int argc, char *argv[]) {
//declarations
double *x, *y, *z, *dx, *dy, *dz, *e, *xe;
double p[4] __attribute__((aligned(64))) = {10.0,28.0,8.0/3.0,1e-4};
double pm[4] __attribute__((aligned(64))) = {-10.0,-28.0,-8.0/3.0,-1e-4};
double t0, t, f;
int ONEI=1, ZEROI=0;
double ONED=1.0, ONEDm=-1.0, ZEROD=0.0;
register int i, j;
int NMAT=500, NITER=10, NMAT2;
pcg32_random_t rng;
//read inputs
if (argc==1) printf("Usage: %s [NMAT] [NITER]\n",argv[0]);
if (argc>1) NMAT = atoi(argv[1]);
NMAT2 = NMAT*NMAT;
if (argc>2) NITER = atoi(argv[2]);
//print inputs
printf("Running %d iterations\nMatrix size: %d\nMemory: %.1f MiB\n", NITER, NMAT, (sizeof(double)*(double)(6*NMAT2+2*NMAT)/1024/1024));
//allocate memory
posix_memalign((void *) &x, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &y, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &z, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &dx, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &dy, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &dz, (size_t)64, (size_t) sizeof(double)*NMAT2 );
posix_memalign((void *) &e, (size_t)64, (size_t) sizeof(double)*NMAT );
posix_memalign((void *) &xe, (size_t)64, (size_t) sizeof(double)*NMAT );
#pragma omp parallel for default(shared) private(i,rng) schedule(static,8)
for (i=0; i<NMAT2; i++) {
x[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
y[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
z[i]=(double)(pcg32_random_r(&rng)%100000)/100000;
}
#pragma omp parallel for default(shared) private(i) schedule(static,8)
for (i=0; i<NITER; i++) {
//dx[j] = p[0]*(y[j]-x[j]);
dscal_(&NMAT2,&ZEROD,dx,&ONEI);
daxpy_(&NMAT2,&pm[0],x,&ONEI,dx,&ONEI);
daxpy_(&NMAT2,&p[0],y,&ONEI,dx,&ONEI);
//dy[j] = x[j]*(p[1]-z[j]) - y[j];
dgemm_(N,N,&NMAT,&NMAT,&NMAT,&ONEDm,x,&NMAT,z,&NMAT,&ZEROD,dy,&NMAT);
dgemv_(N,&NMAT,&NMAT,&p[1],x,&NMAT,e,&ONEI,&ZEROD,xe,&ONEI);
dger_(&NMAT,&NMAT,&ONED,xe,&ONEI,e,&ONEI,dy,&NMAT);
daxpy_(&NMAT2,&ONEDm,y,&ONEI,dy,&ONEI);
//dz[j] = x[j]*y[j]-p[2]*z[j];
dgemm_(N,N,&NMAT,&NMAT,&NMAT,&ONED,x,&NMAT,y,&NMAT,&ZEROD,dz,&NMAT);
daxpy_(&NMAT2,&pm[2],z,&ONEI,dz,&ONEI);
//adaptive integration step size
f = p[3]*2.0*NMAT/fmax(fmax( \
dnrm2_(&NMAT2,dx,&ONEI) , \
dnrm2_(&NMAT2,dy,&ONEI)), \
dnrm2_(&NMAT2,dz,&ONEI));
daxpy_(&NMAT2,&f,dx,&ONEI,x,&ONEI);
daxpy_(&NMAT2,&f,dy,&ONEI,y,&ONEI);
daxpy_(&NMAT2,&f,dz,&ONEI,z,&ONEI);
}
t = omp_get_wtime();
//print outputs
printf("\nPerformance: %.1f GFlop/s\n\n",NITER*((double)2*2*NMAT2*NMAT + (double)12*2*NMAT2 + (double)NMAT2)/1e9/(t-t0));
printf("Norm x: %f\n",dnrm2_(&NMAT2,x,&ONEI)/NMAT);
printf("Norm y: %f\n",dnrm2_(&NMAT2,y,&ONEI)/NMAT);
printf("Norm z: %f\n",dnrm2_(&NMAT2,z,&ONEI)/NMAT);
//release memory
free(x);
free(y);
free(z);
free(dx);
free(dy);
free(dz);