diff --git a/README.md b/README.md
index 542346b15ea055169199f945f3aa11ff31574e1c..0fe8b7888894fd98a61585472e3b04ec16e47c18 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,45 @@
-# matmult
+# Matrix Multiplication
 
+A simple code for test several matrix multiplication approaches. The simplest one is implemented by algorithm using 3 for loops. More optimal *blocked* variant is based on the idea scratched [here](http://www.netlib.org/utk/papers/autoblock/node2.html).
+
+It is easy use OpenMP to make both algorithm run in parallel. Distributed memory parallel version can be implemented using systolic arrays described [here](https://www.mcs.anl.gov/~itf/dbpp/text/node45.html#figlasystolic).
+
+The following executable are provided (in directory `src/app/`):
+ - `$ generate rows cols A.bin` - generates a random matrix with provided number of rows and cols to binary file A.bin
+ - `$ print A.bin` - prints matrix A.bin to standard output
+ - `$ compare A.bin B.bin` - compares matrices A.bin and B.bin. Matrices are considered as equal if norm(A-B) < 1e-6
+ - `$ multiply A.bin B.bin C.bin` - multiplies matrices A.bin and B.bin. Output is stored to file C.bin
+
+All functionality is implemented in the `Matrix` class in the `src/` directory. Directory `tests/` contains tests cases for this project.
+
+## STEP 0: Create git repository (10%)
+
+Your code should be forked from this repository and hosted on code.it4i.cz as a private project with access for all teachers.
+
+## STEP 1:  Building the library (10%)
+
+Provide compilation script for your application (the script should run independently on a current path). Script should load all necessary modules and call `cmake`.
+
+## STEP 2:  Analysis of the application (10%)
+
+Use `Arm map` (module Forge) to analyze a sequential run of your application with given use case (`tests/large`). Identify the most time consuming regions that can be parallelized by OpenMP.
+
+## STEP 3:  Use OpenMP to run the application in parallel (10%)
+
+Put OpenMP pragmas to a correct positions with appropriate variables visibility in order to utilize more threads effectively.
+
+## STEP 4:  Test the correctness of the code (10%)
+
+Create script that automatically check correctness of your application for at least 3 different test cases. Comparison can be implemented as comparison of outputs of sequential and parallel runs.
+
+## STEP 5:  Test the behavior of the code on the Karolina cluster (40%)
+
+1. Implement time measurement for all parallel regions using omp_get_wtime(). 
+2. Create script for run strong scalability measurement (PBS script).
+3. Evaluate strong scalability of measured regions up to 128 threads and different thread affinity (compact, scatter, balanced, none).
+4. Evaluate behaviour for domain specific parameters of your applications (project dependent, maybe none). (Blocking implementation for this project)
+5. Prepare charts for all measurements.
+
+## STEP 6:  Presentation of your project (10%)
+
+Prepare presentation in form of slides (pptx, pdf). The slides should address all topics requested above.
diff --git a/src/app/compare.cpp b/src/app/compare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9183c3756d3edb70d2452ae48a319ab53225c713
--- /dev/null
+++ b/src/app/compare.cpp
@@ -0,0 +1,23 @@
+
+#include "matrix.h"
+
+#include <iostream>
+#include <chrono>
+
+int main(int argc, char **argv)
+{
+	if (argc != 3) {
+		std::cerr << "use: compare 'A' 'B'\n";
+		exit(1);
+	}
+
+	Matrix A, B;
+	load(A, argv[1]);
+	load(B, argv[2]);
+
+	if (!compare(A, B)) {
+		std::cerr << "Matrices are different\n";
+	} else {
+		std::cout << "Matrices are identical\n";
+	}
+}
diff --git a/src/app/generate.cpp b/src/app/generate.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c788cb477eda12eaa579ebe61f7c22d41594cf27
--- /dev/null
+++ b/src/app/generate.cpp
@@ -0,0 +1,25 @@
+
+#include "matrix.h"
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+int main(int argc, char **argv)
+{
+	if (argc != 4) {
+		std::cerr << "use: generator 'matrix_rows' 'matrix_columns' 'output_file'\n";
+		exit(1);
+	}
+
+	size_t rows, cols;
+	std::istringstream ( argv[1] ) >> rows;
+	std::istringstream ( argv[2] ) >> cols;
+
+//	std::cout << "Generate matrix with size=" << rows << "x" << cols << " to file='" << argv[3] << "'\n";
+
+	Matrix A(rows, cols);
+	random(A);
+	store(A, argv[3]);
+}
+
diff --git a/src/app/multiply.cpp b/src/app/multiply.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3e59aca49ccba2d7eeaf182529f18f712dd4b9a
--- /dev/null
+++ b/src/app/multiply.cpp
@@ -0,0 +1,25 @@
+
+#include "matrix.h"
+
+#include <iostream>
+#include <chrono>
+
+int main(int argc, char **argv)
+{
+	if (argc != 4) {
+		std::cerr << "use: sequential 'A' 'B' 'output_file'\n";
+		exit(1);
+	}
+
+	Matrix C, A, B;
+	load(A, argv[1]);
+	load(B, argv[2]);
+	printf("Matrices loaded: A[%d, %d], B[%d, %d]\n", A.rows, A.cols, B.rows, B.cols);
+
+	initMult(C, A, B);
+//	mult(C, A, B);
+	multBlocked(C, A, B, 8);
+
+	printf("Matrices multiplied: C[%d, %d]\n", C.rows, C.cols);
+	store(C, argv[3]);
+}
diff --git a/src/app/print.cpp b/src/app/print.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..651d4820932b0808c858914c35bcb7ceebf66220
--- /dev/null
+++ b/src/app/print.cpp
@@ -0,0 +1,17 @@
+
+#include "matrix.h"
+
+#include <iostream>
+
+int main(int argc, char **argv)
+{
+	if (argc != 2) {
+		std::cerr << "use: print 'input_file'\n";
+		exit(1);
+	}
+	std::cout << "print matrix from file=" << argv[1] << "'\n";
+
+	Matrix A;
+	load(A, argv[1]);
+	print(A);
+}
diff --git a/src/matrix.cpp b/src/matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b62d9a0c9e595b415b2d8a6ed3e5238ce9e47a27
--- /dev/null
+++ b/src/matrix.cpp
@@ -0,0 +1,110 @@
+
+#include "matrix.h"
+
+#include <random>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+bool compare(const Matrix &A, const Matrix &B)
+{
+	if (A.rows != B.rows || A.cols != B.cols) {
+		return false;
+	}
+
+	double norm = 0;
+	for (int r = 0; r < A.rows; ++r) {
+		for (int c = 0; c < A.cols; ++c) {
+			norm += std::fabs(A(r, c) - B(r, c));
+		}
+	}
+	return norm < 1e-6;
+}
+
+void initMult(Matrix &C, const Matrix &A, const Matrix &B)
+{
+	if (A.cols != B.rows) {
+		std::cerr << "Matrices have invalid dimensions\n";
+	}
+	C.resize(A.rows, B.cols);
+	std::fill(C.vals, C.vals + C.rows * C.cols, 0);
+}
+
+void mult(Matrix &C, const Matrix &A, const Matrix &B)
+{
+	for (int i = 0; i < A.rows; ++i) {
+		for (int j = 0; j < B.cols; ++j) {
+			for (int k = 0; k < A.cols; ++k) {
+				C(i, j) += A(i, k) * B(k, j);
+			}
+		}
+	}
+}
+
+void multBlocked(Matrix &C, const Matrix &A, const Matrix &B, int blockSize)
+{
+	int CRBlocks = C.rows / blockSize + C.rows % blockSize;
+	int CCBlocks = C.cols / blockSize + C.cols % blockSize;
+	int CKBlocks = A.cols / blockSize + A.cols % blockSize;
+
+	for (int ci = 0; ci < CRBlocks; ++ci) {
+		int rbegin = ci * blockSize, rend = std::min((ci + 1) * blockSize, C.rows);
+		for (int cj = 0; cj < CCBlocks; ++cj) {
+			int cbegin = cj * blockSize, cend = std::min((cj + 1) * blockSize, C.cols);
+			for (int ck = 0; ck < CKBlocks; ++ck) {
+				int kbegin = ck * blockSize, kend = std::min((ck + 1) * blockSize, A.cols);
+
+				for (int i = rbegin; i < rend; ++i) {
+					for (int j = cbegin; j < cend; ++j) {
+						for (int k = kbegin; k < kend; ++k) {
+							C(i, j) += A(i, k) * B(k, j);
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+void random(Matrix &m) {
+	std::random_device rd;
+	std::mt19937 generator(rd());
+	std::uniform_real_distribution<double> distribution(1.0, 2.0);
+
+	for (int r = 0; r < m.rows; ++r) {
+		for (int c = 0; c < m.cols; ++c) {
+			m.vals[r * m.cols + c] = distribution(generator);
+		}
+	}
+}
+
+void print(Matrix &m)
+{
+	for (int r = 0; r < m.rows; ++r) {
+		for (int c = 0; c < m.cols; ++c) {
+			printf(" %+f", m.vals[r * m.cols + c]);
+		}
+		printf("\n");
+	}
+}
+
+void load(Matrix &m, const char *file)
+{
+	std::ifstream os(file, std::ifstream::binary);
+	int rows, cols;
+	os.read(reinterpret_cast<char*>(&rows), sizeof(int));
+	os.read(reinterpret_cast<char*>(&cols), sizeof(int));
+	m.resize(rows, cols);
+	os.read(reinterpret_cast<char*>(m.vals), sizeof(double) * m.rows * m.cols);
+	os.close();
+}
+
+void store(const Matrix &m, const char *file)
+{
+	std::ofstream os(file, std::ofstream::binary);
+	os.write(reinterpret_cast<const char*>(&m.rows), sizeof(int));
+	os.write(reinterpret_cast<const char*>(&m.cols), sizeof(int));
+	os.write(reinterpret_cast<const char*>(m.vals), sizeof(double) * m.rows * m.cols);
+	os.close();
+}
+
diff --git a/src/matrix.h b/src/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee2f26fed2dc3456479e591646396ecc6557469f
--- /dev/null
+++ b/src/matrix.h
@@ -0,0 +1,41 @@
+
+#ifndef SRC_MATRIX_H_
+#define SRC_MATRIX_H_
+
+struct Matrix {
+	Matrix(): rows{}, cols{}, vals{} {}
+	Matrix(int rows, int cols): rows(rows), cols(cols), vals(new double[rows * cols]) {}
+	~Matrix() { delete[] vals; }
+
+	Matrix(const Matrix &m) = delete;
+	Matrix(const Matrix &&m) = delete;
+	Matrix& operator=(const Matrix &m) = delete;
+	Matrix& operator=(const Matrix &&m) = delete;
+
+	const double& operator()(int r, int c) const { return vals[r * cols + c]; }
+	double& operator()(int r, int c) { return vals[r * cols + c]; }
+
+	void resize(int rows, int cols)
+	{
+		if (vals) delete vals;
+		this->rows = rows;
+		this->cols = cols;
+		this->vals = new double[rows * cols];
+	}
+
+	int rows, cols;
+	double *vals;
+};
+
+void random(Matrix &m);
+void print(Matrix &m);
+void load(Matrix &m, const char *file);
+void store(const Matrix &m, const char *file);
+bool compare(const Matrix &A, const Matrix &B);
+void initMult(Matrix &C, const Matrix &A, const Matrix &B);
+
+void mult(Matrix &C, const Matrix &A, const Matrix &B);
+void multBlocked(Matrix &C, const Matrix &A, const Matrix &B, int blockSize);
+void multBlockedUnrolled(Matrix &C, const Matrix &A, const Matrix &B);
+
+#endif /* SRC_MATRIX_H_ */
diff --git a/tests/large/A.bin b/tests/large/A.bin
new file mode 100644
index 0000000000000000000000000000000000000000..81b8b496a2e337da95e257f088913e9efc3e1481
Binary files /dev/null and b/tests/large/A.bin differ
diff --git a/tests/large/B.bin b/tests/large/B.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f1659f4063a00e441788e56def3d7bde88a2e9dc
Binary files /dev/null and b/tests/large/B.bin differ
diff --git a/tests/large/C.bin b/tests/large/C.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7ef624fc36924d7c9c5259dcbb4fb49958e8e38e
Binary files /dev/null and b/tests/large/C.bin differ
diff --git a/tests/primes/A.bin b/tests/primes/A.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3a927f0ef7b8e90e8ec66838cded170ad6719531
Binary files /dev/null and b/tests/primes/A.bin differ
diff --git a/tests/primes/B.bin b/tests/primes/B.bin
new file mode 100644
index 0000000000000000000000000000000000000000..16f1cdb9bda1fda08317fb39546bad62f7010f61
Binary files /dev/null and b/tests/primes/B.bin differ
diff --git a/tests/primes/C.bin b/tests/primes/C.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4828ce863b6beccb002b2cd26d8e546928ca67ae
Binary files /dev/null and b/tests/primes/C.bin differ
diff --git a/tests/small/A.bin b/tests/small/A.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e0acb950a26f4eadade3aa3b06e260c20971b56b
Binary files /dev/null and b/tests/small/A.bin differ
diff --git a/tests/small/B.bin b/tests/small/B.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2da753f15bda1b3b029db9af2792379e393d933f
Binary files /dev/null and b/tests/small/B.bin differ
diff --git a/tests/small/C.bin b/tests/small/C.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c9e6bcfb5acef318000412066325dfeebab850ca
Binary files /dev/null and b/tests/small/C.bin differ
diff --git a/tests/tiny/A.bin b/tests/tiny/A.bin
new file mode 100644
index 0000000000000000000000000000000000000000..339f540711dca06a0e2761fad176ea471b184f20
Binary files /dev/null and b/tests/tiny/A.bin differ
diff --git a/tests/tiny/B.bin b/tests/tiny/B.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e9d28b01f9ab592ee2ae25cef960d216fb9cff0b
Binary files /dev/null and b/tests/tiny/B.bin differ
diff --git a/tests/tiny/C.bin b/tests/tiny/C.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5eee4b5bc98d6c27123be027892d29389fb9bb18
Binary files /dev/null and b/tests/tiny/C.bin differ