Commit 98aa96fe authored by nikl

Initial commit
CXX=mpic++
CXX_FLAGS=-O3 -mavx2 -g -std=c++11 -fopenmp
INC_FOLDER=-I/projects/p_readex/it4i/mericGCC/include -I/sw/global/compilers/intel/2017/compilers_and_libraries_2017/linux/mkl/include/
LIB_FOLDER=-L/projects/p_readex/it4i/mericGCC/lib/ -L/sw/global/compilers/intel/2017/compilers_and_libraries_2017/linux/mkl/lib/intel64
LIBS=-lmericmpi -lhdeem -lrt -lpapi -lcpufreq -lx86_adapt -lgomp -lmpi -lmkl_intel_lp64 -lmkl_core -lmkl_gnu_thread -lfreeipmi
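# Targets: default builds with the plain scorep wrapper (compiler
# instrumentation); scorep/scorepm add Score-P user instrumentation
# (automatic vs. manual regions); meric builds against the MERIC API
# without Score-P; dyn builds the Score-P binary used by readex-dyn-detect.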
all: default
default:
scorep $(CXX) $(CXX_FLAGS) $(INC_FOLDER) src/main.cpp src/**/*.cpp -o blasbench -Wl,-rpath=/projects/p_readex/it4i/mericGCC/lib/ $(LIB_FOLDER) $(LIBS)
scorep:
scorep --user --online-access --thread=none $(CXX) $(CXX_FLAGS) $(INC_FOLDER) -DUSE_SCOREP src/main.cpp src/**/*.cpp -o blasbench -Wl,-rpath=/projects/p_readex/it4i/mericGCC/lib/ $(LIB_FOLDER) $(LIBS)
scorepm:
scorep --user --online-access --thread=none $(CXX) $(CXX_FLAGS) $(INC_FOLDER) -DUSE_SCOREP_MANUAL src/main.cpp src/**/*.cpp -o blasbench -Wl,-rpath=/projects/p_readex/it4i/mericGCC/lib/ $(LIB_FOLDER) $(LIBS)
meric:
$(CXX) $(CXX_FLAGS) $(INC_FOLDER) -DUSE_MERIC src/main.cpp src/**/*.cpp -o blasbench -Wl,-rpath=/projects/p_readex/it4i/mericGCC/lib/ $(LIB_FOLDER) $(LIBS)
dyn:
scorep --user --online-access --thread=none $(CXX) $(CXX_FLAGS) $(INC_FOLDER) src/main.cpp src/**/*.cpp -o blasbench -Wl,-rpath=/projects/p_readex/it4i/mericGCC/lib/ $(LIB_FOLDER) $(LIBS)
clean:
rm -f blasbench *~ scorep.filt *.xml ptf_output.txt *.psc
rm -rf scorep-*
#!/bin/sh
#module use /projects/p_readex/modules
#module load ptf/ci_readex_bullxmpi1.2.8.4_gcc5.3.0_slurm_starter_with_scorep
#: <<'ENERGY_OFF_COMMENT'
#module load scorep-hdeem/sync-2016-07-14-hdeem2.2.2-xmpi-gcc5.3
module load scorep-hdeem/sync-2017-01-31-git-hdeem2.2.20ms-xmpi-gcc5.3
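# Configure the Score-P HDEEM synchronous energy plugin: in-band readout,
# warnings only, 1000 ms statistics timeout; restrict MPI event recording
# to the environment group.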
export SCOREP_METRIC_PLUGINS=hdeem_sync_plugin
export SCOREP_METRIC_HDEEM_SYNC_PLUGIN_CONNECTION="INBAND"
export SCOREP_METRIC_HDEEM_SYNC_PLUGIN_VERBOSE="WARN"
export SCOREP_METRIC_HDEEM_SYNC_PLUGIN_STATS_TIMEOUT_MS=1000
export SCOREP_MPI_ENABLE_GROUPS=ENV
#module load pcp/ci_pcp_bullxmpi1.2.8.4_gcc5.3.0
export SCOREP_SUBSTRATE_PLUGINS='rrl'
#export SCOREP_TUNING_PLUGINS=cpu_freq_plugin
export SCOREP_RRL_PLUGINS=cpu_freq_plugin,uncore_freq_plugin,OpenMPTP
#ENERGY_OFF_COMMENT
export SCOREP_FILTERING_FILE=scorep.filt
: <<'AC_COMMENT'
#module load scorep_plugins/control_plugins
export SCOREP_SUBSTRATE_PLUGINS='rrl'
export SCOREP_TUNING_PLUGINS=OpenMPTP,cpu_freq_plugin
AC_COMMENT
# run the application.. update this for different applications
psc_frontend --apprun="./blasbench --dgemv 100,8192 --dgemm 20,2048 --tan 15000" --mpinumprocs=2 --ompnumthreads=12 --phase=Main --tune=readex_tuning --config-file=readex_config.xml --info=2 --selective-info=AutotuneAll,AutotunePlugins
echo "running psc_frontend done"
#!/bin/sh
#module use /projects/p_readex/modules
#module load readex/ci_readex_bullxmpi1.2.8.4_gcc5.3.0
#module load readex/ci_readex_intelmpi5.1.3.181_intel2016.2.181
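# readex-dyn-detect expects cube_tuple profiles; the PAPI counters
# (PAPI_TOT_INS, PAPI_L3_TCM) feed its compute-intensity analysis.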
export SCOREP_PROFILING_FORMAT=cube_tuple
export SCOREP_METRIC_PAPI=PAPI_TOT_INS,PAPI_L3_TCM
export SCOREP_FILTERING_FILE=scorep.filt
#export LD_LIBRARY_PATH=/sw/global/compilers/intel/2016/compilers_and_libraries_2016/linux/lib/intel64
echo "running blasbench for readex-dyn-detect"
# run the application.. update this line for different applications
srun -n 2 --cpu_bind=socket ./blasbench --mpi_ata 100,256 --dgemv 50000,1000 --dgemm 200,1000 --tan 15000
#mpirun -np $7 ./miniMD_intel64 -i $6 1>/dev/null
echo "running blasbench done"
echo "running readex-dyn-detect"
echo "phase region = Main"
readex-dyn-detect -t 1 -p Main -c 1 -v 1 -w 1 scorep-*/profile.cubex
#readex-dyn-detect -t $1 -p $2 -c $3 -v $4 -w $5 scorep-*/profile.cubex
#readex-dyn-detect -t $1 -p $2 scorep-*/profile.cubex
echo "running readex-dyn-detect done"
#!/bin/sh
#source ./source.readex
export SCOREP_FILTERING_FILE=scorep.filt
#export LD_LIBRARY_PATH=/sw/global/compilers/intel/2016/compilers_and_libraries_2016/linux/lib/intel64
#rm -rf scorep-*
rm -f old_scorep.filt
echo "" > scorep.filt
result=1
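# Repeat: run the instrumented binary, regenerate the filter, and stop once
# do_scorep_autofilter_single.sh finds no new regions to filter (exit 0).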
while [ "$result" != 0 ]; do
echo "result = "$result
# run the application.. update this for different applications
#mpirun -np $3 ./miniMD_openmpi -i $2 1>/dev/null
#srun -n 2 ./blasbench --matrixFile bmwcra-1-mtx --mpi_ata 100,256 --dgemv 100,1000 --dgemm 100,1000 --tan 100 --io --io_threads 12 --sparse0 100 --sparse1 100 --sparse2 100 --sparse3 100 --mpi_ring 100,256 --runtime 0.1
srun -n 2 --cpu_bind=socket ./blasbench --mpi_ata 100,256 --dgemv 50000,1000 --dgemm 200,1000 --tan 15000
#srun -n 2 ./blasbench --dgemv 10,1000
echo "blasbench done."
sh -l do_scorep_autofilter_single.sh 1 #$1
result=$?
echo "scorep_autofilter_singe done ($result)."
done
echo "end."
cp scorep.filt ../
#!/bin/sh
cp scorep.filt old_scorep.filt
scorep-autofilter -t $1 -f scorep scorep-*/profile.cubex 1>/dev/null
echo "#############"
cat scorep.filt
echo "#############"
rm -rf scorep-*
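# comm expects sorted input, so sort both filter files before testing
# whether the new and the old filter share any lines.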
sort scorep.filt > sorted_scorep.filt
sort old_scorep.filt > sorted_old_scorep.filt
commString=$(comm -1 -2 sorted_scorep.filt sorted_old_scorep.filt)
if [ "$commString" == "" ]; then
cp scorep.filt old_scorep.filt
exit 1
else
filtFileBeginString="SCOREP_REGION_NAMES_BEGIN"
filtFileEndString="SCOREP_REGION_NAMES_END"
filtFileExcludeString="EXCLUDE"
sort scorep.filt > sorted_scorep.filt
sort old_scorep.filt > sorted_old_scorep.filt
commString=$(comm -2 -3 sorted_scorep.filt sorted_old_scorep.filt)
if [ "$commString" != "" ]; then
echo $filtFileBeginString > new_scorep.filt
echo $filtFileExcludeString >> new_scorep.filt
comm -2 -3 sorted_scorep.filt sorted_old_scorep.filt > comm1.txt
exec < "old_scorep.filt"
while read -r line; do
if [ "$line" != "$filtFileEndString" ] && [ "$line" != "$filtFileBeginString" ] && [ "$line" != "$filtFileExcludeString" ]; then
echo "$line" >> new_scorep.filt
fi
done
cat comm1.txt >> new_scorep.filt
echo $filtFileEndString >> new_scorep.filt
mv new_scorep.filt scorep.filt
rm -f sorted_scorep.filt sorted_old_scorep.filt comm1.txt
echo "++++++++++++"
echo $commString
echo "++++++++++++"
exit 1
else
mv old_scorep.filt scorep.filt
rm -f sorted_scorep.filt sorted_old_scorep.filt comm1.txt
echo "++++++++++++"
echo $commString
echo "++++++++++++"
exit 0
fi
fi
Test run on 1 node:
export MERIC_NUM_THREADS=12
srun -n 2 ./blasbench --matrixFile bmwcra-1-mtx --mpi_ata 100,256 --dgemv 100,1000 --dgemm 100,1000 --tan 100 --io --io_threads 12 --sparse0 100 --sparse1 100 --sparse2 100 --sparse3 100 --mpi_ring 100,256 --runtime 0.1
Download the matrix from https://www.cise.ufl.edu/research/sparse/matrices/GHS_psdef/bmwcra_1.html in Matrix Market (.mtx) format.
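The page above is the matrix's HTML description; the raw archive URL is not part of this repository. A plausible fetch, assuming the collection's usual Matrix Market archive layout (URL unverified):

wget https://www.cise.ufl.edu/research/sparse/MM/GHS_psdef/bmwcra_1.tar.gz
tar xzf bmwcra_1.tar.gz    # extracts bmwcra_1/bmwcra_1.mtx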
arguments:
--matrixFile [matrix] path + name of the input sparse matrix
--io Parallel OMP I/O, requires --matrixFile
--io_threads [nThreads] Number of I/O threads
--sparse0 [reps] SpMV multiplication in CSR format, requires --matrixFile and --io
--sparse1 [reps] SpMV multiplication in IJV format, requires --matrixFile and --io
--sparse2 [reps] SpMM multiplication, requires --matrixFile and --io
--sparse3 [reps] SpMM addition, requires --matrixFile and --io
--dgemm [reps],[size] C = A*B, matrices' size is size*size*sizeof(double)
--dgemv [reps],[size] matrix-vector product M*Vi=Vo, matrix size size*size*sizeof(double), vector size*sizeof(double)
--mpi_ata [reps],[size] MPI_Alltoall, a single message size is size*size*sizeof(MPI_DOUBLE)
--mpi_ring [reps],[size] MPI_Isend+MPI_Irecv in a ring, all sending and receiving at the same time
a single message size is size*size*MPIranks
--runtime [multiplier] Increases or decreases the overall runtime (e.g. the reps values), 0.0 < multiplier < inf
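For illustration, the options compose on one command line; a minimal sketch (the sizes below are arbitrary, not tuned values): --dgemm 20,2048 runs 20 multiplications of 2048x2048 double matrices (2048*2048*8 B = 32 MiB per matrix), and --runtime 0.5 should roughly halve each kernel's repetition count:

srun -n 2 --cpu_bind=socket ./blasbench --dgemm 20,2048 --dgemv 100,8192 --mpi_ring 100,256 --runtime 0.5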
#!/bin/bash
#SBATCH -A p_readex
#SBATCH --nodes=2
#SBATCH --ntasks-per-socket=1
#SBATCH --exclusive
#SBATCH -p haswell
#SBATCH --mem 62000
#SBATCH -t 3-00:10:00
# #SBATCH --mail-type=ALL
# #SBATCH --mail-user=ondrej.vysocky@vsb.cz
# sbatch run.sh
# squeue -u $USER
###############################################################################
# LOAD MODULES
source source.readex
#export GOMP_CPU_AFFINITY=0-23
#export OMP_PROC_BIND=close
#export SLURM_CPU_BIND=sockets
#export GOMP_CPU_AFFINITY=0-23
# COMPILE
#make
./do_psc_frontend.sh
module purge
module use /projects/p_readex/modules
module load readex/ci_readex_bullxmpi1.2.8.4_gcc5.3.0
module load scorep-hdeem/sync-2017-01-31-git-hdeem2.2.20ms-xmpi-gcc5.3
export LD_LIBRARY_PATH+=:/sw/global/compilers/intel/2017/mkl/lib/intel64:/projects/p_readex/it4i/mericGCC/lib:/projects/p_readex/it4i/mericGCC/hdeem:/home/nikl/fftw-3.3.6-pl1/build/lib:/usr/local/lib
export LIBRARY_PATH+=:/sw/global/compilers/intel/2017/mkl/lib/intel64:/sw/taurus/libraries/papi/5.4.3/lib:/projects/p_readex/it4i/mericGCC/lib:/projects/p_readex/it4i/mericGCC/hdeem:/home/nikl/fftw-3.3.6-pl1/build/lib:/usr/local/lib
#export MERIC_FREQUENCY=25
#export MERIC_UNCORE_FREQUENCY=30
#export MERIC_NUM_THREADS=12
#export SCOREP_PROFILING_FORMAT=cube_tuple
#export SCOREP_METRIC_PAPI=PAPI_TOT_INS,PAPI_L3_TCM
#export SCOREP_FILTERING_FILE=scorep.filt
#include "DenseSolverMKL.h"
using namespace espreso;
DenseSolverMKL::DenseSolverMKL(){
// keep_factors=true;
// initialized = false;
USE_FLOAT = false;
import_with_copy = false;
mtype = espreso::SparseMatrix::MatrixType::REAL_UNSYMMETRIC;
m_dense_values_size = 0;
m_dense_values_fl_size = 0;
/* Number of solver threads, read from the PAR_NUM_THREADS environment variable */
int num_procs;
char * var = getenv("PAR_NUM_THREADS");
if(var != NULL)
sscanf( var, "%d", &num_procs );
else {
printf("Set environment PAR_NUM_THREADS to 1");
exit(1);
}
iparm[2] = num_procs;
m_nRhs = 1;
// m_factorized = 0;
}
DenseSolverMKL::~DenseSolverMKL() {
this->Clear();
}
void DenseSolverMKL::Clear() {
if (import_with_copy) {
if(m_dense_values_size > 0) delete [] m_dense_values;
if(m_dense_values_fl_size > 0) delete [] m_dense_values_fl;
}
m_dense_values_size = 0;
m_dense_values_fl_size = 0;
m_dense_values = NULL;
m_dense_values_fl = NULL;
}
void DenseSolverMKL::ImportMatrix(espreso::SparseMatrix & A) {
USE_FLOAT = false;
m_rows = A.rows;
m_cols = A.cols;
m_nnz = A.nnz;
m_lda = A.rows;
m_dense_values_size = A.dense_values.size();
m_dense_values_fl_size = 0;
m_dense_values = new double[m_dense_values_size];
copy(A.dense_values.begin(), A.dense_values.end(), m_dense_values);
import_with_copy = true;
mtype = A.mtype;
}
void DenseSolverMKL::ImportMatrix_fl(espreso::SparseMatrix & A) {
USE_FLOAT = true;
m_rows = A.rows;
m_cols = A.cols;
m_nnz = A.nnz;
m_lda = A.rows;
m_dense_values_size = 0;
m_dense_values_fl_size = A.dense_values.size();
m_dense_values_fl = new float[m_dense_values_fl_size];
for (eslocal i = 0; i < m_dense_values_fl_size; i++)
m_dense_values_fl[i] = (float) A.dense_values[i];
import_with_copy = true;
mtype = A.mtype;
}
void DenseSolverMKL::ImportMatrix_wo_Copy(espreso::SparseMatrix & A) {
USE_FLOAT = false;
m_rows = A.rows;
m_cols = A.cols;
m_nnz = A.nnz;
m_lda = A.rows;
m_dense_values_size = A.dense_values.size();
m_dense_values_fl_size = 0;
m_dense_values = &A.dense_values[0];
import_with_copy = false;
mtype = A.mtype;
}
void DenseSolverMKL::SetThreaded() {
/* Number of solver threads; currently fixed to 1 */
int num_procs = 1; // Esutils::getEnv<int>("SOLVER_NUM_THREADS");
iparm[2] = num_procs;
}
int DenseSolverMKL::Factorization(const std::string &str) {
eslocal info;
char U = 'U';
m_ipiv.resize(m_cols);
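// Routine selection: both symmetric types use the packed-storage
// Bunch-Kaufman factorization (ssptrf/dsptrf); the unsymmetric case uses
// LU with partial pivoting on full storage (sgetrf/dgetrf).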
switch (mtype) {
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_POSITIVE_DEFINITE:
if (USE_FLOAT) {
ssptrf( &U, &m_cols, &m_dense_values_fl[0], &m_ipiv[0] , &info );
} else {
dsptrf( &U, &m_cols, &m_dense_values[0], &m_ipiv[0] , &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_INDEFINITE:
if (USE_FLOAT) {
ssptrf( &U, &m_cols, &m_dense_values_fl[0], &m_ipiv[0] , &info );
} else {
dsptrf( &U, &m_cols, &m_dense_values[0], &m_ipiv[0] , &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_UNSYMMETRIC:
if (USE_FLOAT) {
//sgetrf( m, n, a, lda, ipiv, info )
sgetrf( &m_cols, &m_cols, &m_dense_values_fl[0], &m_cols, &m_ipiv[0] , &info );
} else {
dgetrf( &m_cols, &m_cols, &m_dense_values[0], &m_cols, &m_ipiv[0] , &info );
}
break;
}
return info;
}
void DenseSolverMKL::Solve( std::vector <double> & rhs_sol) {
char U = 'U';
eslocal info = 0;
m_nRhs = 1;
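// USE_FLOAT paths down-cast the double RHS into tmp_sol_fl, solve in
// single precision, and cast the result back to double.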
switch (mtype) {
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_POSITIVE_DEFINITE:
if (USE_FLOAT) {
tmp_sol_fl.resize(rhs_sol.size());
for (size_t i = 0; i < rhs_sol.size(); i++)
tmp_sol_fl[i] = (float)rhs_sol[i];
ssptrs( &U, &m_rows, &m_nRhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
for (size_t i = 0; i < rhs_sol.size(); i++)
rhs_sol[i] = (double)tmp_sol_fl[i];
} else {
dsptrs( &U, &m_rows, &m_nRhs, &m_dense_values[0], &m_ipiv[0], &rhs_sol[0], &m_rows, &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_INDEFINITE:
if (USE_FLOAT) {
tmp_sol_fl.resize(rhs_sol.size());
for (size_t i = 0; i < rhs_sol.size(); i++)
tmp_sol_fl[i] = (float)rhs_sol[i];
ssptrs( &U, &m_rows, &m_nRhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
for (size_t i = 0; i < rhs_sol.size(); i++)
rhs_sol[i] = (double)tmp_sol_fl[i];
} else {
dsptrs( &U, &m_rows, &m_nRhs, &m_dense_values[0], &m_ipiv[0], &rhs_sol[0], &m_rows, &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_UNSYMMETRIC:
if (USE_FLOAT) {
tmp_sol_fl.resize(rhs_sol.size());
for (size_t i = 0; i < rhs_sol.size(); i++)
tmp_sol_fl[i] = (float)rhs_sol[i];
ssptrs( &U, &m_rows, &m_nRhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
//call sgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
char trans = 'N';
sgetrs( &trans, &m_rows, &m_nRhs, &m_dense_values_fl[0], &m_rows, &m_ipiv[0], &tmp_sol_fl[0], &m_nRhs, &info);
for (size_t i = 0; i < rhs_sol.size(); i++)
rhs_sol[i] = (double)tmp_sol_fl[i];
} else {
//dsptrs( &U, &m_rows, &m_nRhs, &m_dense_values[0], &m_ipiv[0], &rhs_sol[0], &m_rows, &info );
char trans = 'N';
dgetrs( &trans, &m_rows, &m_nRhs, &m_dense_values[0], &m_rows, &m_ipiv[0], &rhs_sol[0], &m_nRhs, &info);
}
break;
}
}
void DenseSolverMKL::Solve( std::vector <double> & rhs, std::vector <double> & sol, MKL_INT n_rhs) {
char U = 'U';
eslocal info = 0;
eslocal m_nRhs = n_rhs; // local copy shadows the member; the getrs calls below use n_rhs
//
// if (USE_FLOAT) {
// sol.resize(rhs.size());
// tmp_sol_fl.resize(rhs.size());
//
// for (eslocal i = 0; i < rhs.size(); i++)
// tmp_sol_fl[i] = (float)rhs[i];
//
// ssptrs( &U, &m_rows, &n_rhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
//
// for (size_t i = 0; i < rhs.size(); i++)
// sol[i] = (double)tmp_sol_fl[i];
// } else {
// sol = rhs;
// dsptrs( &U, &m_rows, &n_rhs, &m_dense_values[0], &m_ipiv[0], &sol[0], &m_rows, &info );
// }
switch (mtype) {
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_POSITIVE_DEFINITE:
if (USE_FLOAT) {
sol.resize(rhs.size());
tmp_sol_fl.resize(rhs.size());
for (size_t i = 0; i < rhs.size(); i++)
tmp_sol_fl[i] = (float)rhs[i];
ssptrs( &U, &m_rows, &n_rhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
for (size_t i = 0; i < rhs.size(); i++)
sol[i] = (double)tmp_sol_fl[i];
} else {
sol = rhs;
dsptrs( &U, &m_rows, &n_rhs, &m_dense_values[0], &m_ipiv[0], &sol[0], &m_rows, &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_SYMMETRIC_INDEFINITE:
if (USE_FLOAT) {
sol.resize(rhs.size());
tmp_sol_fl.resize(rhs.size());
for (size_t i = 0; i < rhs.size(); i++)
tmp_sol_fl[i] = (float)rhs[i];
ssptrs( &U, &m_rows, &n_rhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
for (size_t i = 0; i < rhs.size(); i++)
sol[i] = (double)tmp_sol_fl[i];
} else {
sol = rhs;
dsptrs( &U, &m_rows, &n_rhs, &m_dense_values[0], &m_ipiv[0], &sol[0], &m_rows, &info );
}
break;
case espreso::SparseMatrix::MatrixType::REAL_UNSYMMETRIC:
char trans = 'N';
if (USE_FLOAT) {
sol.resize(rhs.size());
tmp_sol_fl.resize(rhs.size());
for (size_t i = 0; i < rhs.size(); i++)
tmp_sol_fl[i] = (float)rhs[i];
//ssptrs( &U, &m_rows, &n_rhs, &m_dense_values_fl[0], &m_ipiv[0], &tmp_sol_fl[0], &m_rows, &info );
sgetrs( &trans, &m_rows, &m_nRhs, &m_dense_values_fl[0], &m_rows, &m_ipiv[0], &tmp_sol_fl[0], &m_nRhs, &info);
for (size_t i = 0; i < rhs.size(); i++)
sol[i] = (double)tmp_sol_fl[i];
} else {
sol = rhs;
//dsptrs( &U, &m_rows, &n_rhs, &m_dense_values[0], &m_ipiv[0], &sol[0], &m_rows, &info );
//dgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
dgetrs( &trans, &m_rows, &m_nRhs, &m_dense_values[0], &m_rows, &m_ipiv[0], &sol[0], &m_rows, &info);
}
break;
}
}
void DenseSolverMKL::Solve( std::vector <double> & rhs, std::vector <double> & sol, MKL_INT rhs_start_index, MKL_INT sol_start_index) {
printf("DenseSolverMKL::Solve( std::vector <double> & rhs, std::vector <double> & sol, MKL_INT rhs_start_index, MKL_INT sol_start_index) not implemented yet.\n");
exit(1);
}
void DenseSolverMKL::SolveMat_Sparse( espreso::SparseMatrix & A) {
SolveMat_Sparse(A, A);
};
void DenseSolverMKL::SolveMat_Sparse( espreso::SparseMatrix & A_in, espreso::SparseMatrix & B_out) {
SolveMat_Sparse(A_in, B_out, 'T');
};
void DenseSolverMKL::SolveMat_Sparse( espreso::SparseMatrix & A_in, espreso::SparseMatrix & B_out, char T_for_input_matrix_is_transposed_N_input_matrix_is_NOT_transposed ) {
char trans = T_for_input_matrix_is_transposed_N_input_matrix_is_NOT_transposed;
espreso::SparseMatrix tmpM;
if (trans == 'T')
A_in.MatTranspose(tmpM);
else
tmpM = A_in;
std::vector<double> rhs;
std::vector<double> sol;
rhs.reserve(tmpM.cols+8);
sol.reserve(tmpM.cols+8);
rhs.resize(tmpM.cols);
sol.resize(tmpM.cols);
// main loop over rows
MKL_INT col = 0;