Commit 780b811c authored by Jakub Homola

mat mult

parent c317ce9a
@@ -53,4 +53,6 @@ Documentation links:
1. [reduction](tasks/reduction)
1. [convolution_1d](tasks/convolution_1d)
1. [histogram](tasks/histogram)
1. Other
1. [matrix_transpose](tasks/matrix_transpose)
1. [matrix_multiplication](tasks/matrix_multiplication)
#include <cstdio>
#define TILE_SIZE 32
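// TILE_SIZE is the edge length of the square shared-memory tiles used by the tiled kernel,
// which is launched with TILE_SIZE x TILE_SIZE thread blocks (one thread per tile element).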
__global__ void matrix_multiplication_naive(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col < size_n && row < size_m)
{
float val = 0;
for(int k = 0; k < size_k; k++)
{
// TODO: multiply the values from matrices M and N and add them to the local P value
val += M[row * size_k + k] * N[k * size_n + col];
}
// TODO: write the local P value to the matrix P in global memory
P[row * size_n + col] = val;
}
}
__global__ void matrix_multiplication_tiled(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
__shared__ float tile_M[TILE_SIZE][TILE_SIZE];
__shared__ float tile_N[TILE_SIZE][TILE_SIZE];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float val = 0;
int tile_count_k = (size_k - 1) / TILE_SIZE + 1;
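// ceiling division: number of TILE_SIZE-wide tiles needed to cover the k dimension,
// including a possibly partial last tile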
for(int tile_k = 0; tile_k < tile_count_k; tile_k++)
{
// TODO: think about why both the IF and ELSE are needed here
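// (The IF prevents out-of-bounds loads from global memory near the matrix edges; the ELSE
// zero-fills those tile slots so the fixed-length loop over TILE_SIZE below still produces
// the correct sum and never reads uninitialized shared memory.)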
if(row < size_m && tile_k * TILE_SIZE + threadIdx.x < size_k)
{
// TODO: read the value from the matrix M and store it in the shared memory
tile_M[threadIdx.y][threadIdx.x] = M[row * size_k + tile_k * TILE_SIZE + threadIdx.x];
}
else
{
tile_M[threadIdx.y][threadIdx.x] = 0;
}
// TODO: think about why both the IF and ELSE are needed here
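// (Same reasoning for the N tile: guard the global load and zero-fill the out-of-range slots.)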
if(col < size_n && tile_k * TILE_SIZE + threadIdx.y < size_k)
{
// TODO: read the value from the matrix N and store it in the shared memory
tile_N[threadIdx.y][threadIdx.x] = N[(tile_k * TILE_SIZE + threadIdx.y) * size_n + col];
}
else
{
tile_N[threadIdx.y][threadIdx.x] = 0;
}
// TODO: synchronize all threads in this threadblock to make sure all threads populated the shared-memory tiles
__syncthreads();
if(row < size_m && col < size_n)
{
for(int k = 0; k < TILE_SIZE; k++)
{
// TODO: read the matrix elements from the shared memory, multiply them together and add the result to the local P value
val += tile_M[threadIdx.y][k] * tile_N[k][threadIdx.x];
}
}
// TODO: synchronize all threads in this threadblock to make sure all threads have finished working with the shared memory and it can be overwritten in the next iteration
__syncthreads();
}
if(row < size_m && col < size_n)
{
P[row * size_n + col] = val;
}
}
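// matrix_init fills a height x width row-major matrix with a_r * (row + 1) + a_c * (col + 1).
// M is initialized with a_r = 1, a_c = 0 (every element in row r equals r + 1) and N with
// a_r = 0, a_c = 1 (every element in column c equals c + 1), so every element of P = M * N
// should equal (row + 1) * (col + 1) * size_k, which is what matrix_check_result verifies.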
__global__ void matrix_init(float * matrix, int width, int height, float a_r, float a_c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < height && col < width)
matrix[row * width + col] = a_r * (row + 1) + a_c * (col + 1);
}
__global__ void matrix_check_result(float * matrix, int size_m, int size_n, int size_k, int * error_count)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < size_m && col < size_n)
{
float correct = (row + 1.0f) * (col + 1.0f) * size_k;
float observed = matrix[row * size_n + col];
if(fabsf(correct - observed) / correct > 1e-3)
atomicAdd(error_count, 1);
}
}
int main()
{
int size_m = 15394;
int size_n = 12287;
int size_k = 14480;
float * d_M;
float * d_N;
float * d_P;
int * d_error_count;
int error_count;
cudaMalloc(&d_M, size_m * size_k * sizeof(float));
cudaMalloc(&d_N, size_k * size_n * sizeof(float));
cudaMalloc(&d_P, size_m * size_n * sizeof(float));
cudaMalloc(&d_error_count, sizeof(int));
cudaEvent_t event_start, event_end;
cudaEventCreate(&event_start);
cudaEventCreate(&event_end);
dim3 tpb, bpg;
tpb = dim3(32, 32);
bpg = dim3((size_k - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_M, size_k, size_m, 1, 0);
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_k - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_N, size_n, size_k, 0, 1);
cudaDeviceSynchronize();
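// launch configuration for the multiplication kernels: one thread per element of P,
// with x spanning the size_n columns and y spanning the size_m rows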
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_naive<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_naive;
cudaEventElapsedTime(&time_naive, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication naive error count: %d\n", error_count);
else printf("Matrix multiplication naive seems OK\n");
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_tiled<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_tiled;
cudaEventElapsedTime(&time_tiled, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication tiled error count: %d\n", error_count);
else printf("Matrix multiplication tiled seems OK\n");
cudaEventDestroy(event_start);
cudaEventDestroy(event_end);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
cudaFree(d_error_count);
printf("\n");
printf("Time multiplication naive: %10.3f ms\n", time_naive);
printf("Time multiplication tiled: %10.3f ms\n", time_tiled);
printf("Speedup is %.2f\n", time_naive / time_tiled);
return 0;
}
Matrix multiplication
=====================
Finish the implementation of the multiplication of two non-square matrices by completing the TODO tasks in the `matrix_multiplication.cu` file. First complete the naive matrix multiplication kernel, then move on to the tiled matrix multiplication algorithm. The TODO tasks are only in the two kernels; you don't need to worry about the rest of the code (standard memory management, initialization, and correctness checking).
You are multiplying two matrices $M$ and $N$ to obtain the result matrix $P = MN$. The matrix $M$ has $size\_m$ rows and $size\_k$ columns. For the product to be defined, the matrix $N$ must have $size\_k$ rows; we give it $size\_n$ columns. The result matrix $P$ therefore has $size\_m$ rows and $size\_n$ columns. All matrices are stored in row-major order.
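For reference, the element of $P$ at row $r$ and column $c$ is $P_{r,c} = \sum_{k=0}^{size\_k - 1} M_{r,k} N_{k,c}$. With row-major storage, the element at row $r$ and column $c$ of a matrix with $w$ columns lives at flat index $r \cdot w + c$, so for example $M_{r,k}$ is accessed as `M[r * size_k + k]`.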
Compile and run, and observe the timing differences.
![matrix multiplication illustration](matmult.png)
tasks/matrix_multiplication/matmult.png (19 KiB)
#include <cstdio>
#define TILE_SIZE 32
__global__ void matrix_multiplication_naive(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col < size_n && row < size_m)
{
float val = 0;
for(int k = 0; k < size_k; k++)
{
// TODO: multiply the values from matrices M and N and add them to the local P value
val += ;
}
// TODO: write the local P value to the matrix P in global memory
P[ ] = val;
}
}
__global__ void matrix_multiplication_tiled(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
__shared__ float tile_M[TILE_SIZE][TILE_SIZE];
__shared__ float tile_N[TILE_SIZE][TILE_SIZE];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float val = 0;
int tile_count_k = (size_k - 1) / TILE_SIZE + 1;
for(int tile_k = 0; tile_k < tile_count_k; tile_k++)
{
// TODO: think about why both the IF and ELSE are needed here
if(row < size_m && tile_k * TILE_SIZE + threadIdx.x < size_k)
{
// TODO: read the value from the matrix M and store it in the shared memory
tile_M[threadIdx.y][threadIdx.x] = M[ ];
}
else
{
tile_M[threadIdx.y][threadIdx.x] = 0;
}
// TODO: think about why both the IF and ELSE are needed here
if(col < size_n && tile_k * TILE_SIZE + threadIdx.y < size_k)
{
// TODO: read the value from the matrix N and store it in the shared memory
tile_N[threadIdx.y][threadIdx.x] = N[ ];
}
else
{
tile_N[threadIdx.y][threadIdx.x] = 0;
}
// TODO: synchronize all threads in this threadblock to make sure all threads populated the shared-memory tiles
if(row < size_m && col < size_n)
{
for(int k = 0; k < TILE_SIZE; k++)
{
// TODO: read the matrix elements from the shared memory, multiply them together and add the result to the local P value
val += ;
}
}
// TODO: synchronize all threads in this threadblock to make sure all threads have finished working with the shared memory and it can be overwritten in the next iteration
}
if(row < size_m && col < size_n)
{
P[row * size_n + col] = val;
}
}
__global__ void matrix_init(float * matrix, int width, int height, float a_r, float a_c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < height && col < width)
matrix[row * width + col] = a_r * (row + 1) + a_c * (col + 1);
}
__global__ void matrix_check_result(float * matrix, int size_m, int size_n, int size_k, int * error_count)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < size_m && col < size_n)
{
float correct = (row + 1.0f) * (col + 1.0f) * size_k;
float observed = matrix[row * size_n + col];
if(fabsf(correct - observed) / correct > 1e-3)
atomicAdd(error_count, 1);
}
}
int main()
{
int size_m = 15394;
int size_n = 12287;
int size_k = 14480;
float * d_M;
float * d_N;
float * d_P;
int * d_error_count;
int error_count;
cudaMalloc(&d_M, size_m * size_k * sizeof(float));
cudaMalloc(&d_N, size_k * size_n * sizeof(float));
cudaMalloc(&d_P, size_m * size_n * sizeof(float));
cudaMalloc(&d_error_count, sizeof(int));
cudaEvent_t event_start, event_end;
cudaEventCreate(&event_start);
cudaEventCreate(&event_end);
dim3 tpb, bpg;
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_k - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_M, size_k, size_m, 1, 0);
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_k - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_N, size_n, size_k, 0, 1);
cudaDeviceSynchronize();
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_naive<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_naive;
cudaEventElapsedTime(&time_naive, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication naive error count: %d\n", error_count);
else printf("Matrix multiplication naive seems OK\n");
cudaEventRecord(event_start);
matrix_multiplication_tiled<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_tiled;
cudaEventElapsedTime(&time_tiled, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication tiled error count: %d\n", error_count);
else printf("Matrix multiplication tiled seems OK\n");
cudaEventDestroy(event_start);
cudaEventDestroy(event_end);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
cudaFree(d_error_count);
printf("\n");
printf("Time multiplication naive: %10.3f ms\n", time_naive);
printf("Time multiplication tiled: %10.3f ms\n", time_tiled);
return 0;
}