Commit 780b811c authored by Jakub Homola

mat mult

parent c317ce9a
@@ -53,4 +53,6 @@ Documentation links:
1. [reduction](tasks/reduction)
1. [convolution_1d](tasks/convolution_1d)
1. [histogram](tasks/histogram)
1. Other
1. [matrix_transpose](tasks/matrix_transpose)
1. [matrix_multiplication](tasks/matrix_multiplication)
#include <cstdio>
#define TILE_SIZE 32
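// TILE_SIZE is the edge length of the square shared-memory tiles used by the tiled kernel,
// which is launched with TILE_SIZE x TILE_SIZE thread blocks (one thread per tile element).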
__global__ void matrix_multiplication_naive(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col < size_n && row < size_m)
{
float val = 0;
for(int k = 0; k < size_k; k++)
{
// TODO: multiply the values from matrices M and N and add them to the local P value
val += M[row * size_k + k] * N[k * size_n + col];
}
// TODO: write the local P value to the matrix P in global memory
P[row * size_n + col] = val;
}
}
__global__ void matrix_multiplication_tiled(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
__shared__ float tile_M[TILE_SIZE][TILE_SIZE];
__shared__ float tile_N[TILE_SIZE][TILE_SIZE];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float val = 0;
int tile_count_k = (size_k - 1) / TILE_SIZE + 1;
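// ceiling division: number of TILE_SIZE-wide tiles needed to cover the k dimension,
// including a possibly partial last tile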
for(int tile_k = 0; tile_k < tile_count_k; tile_k++)
{
// TODO: think about why both the IF and ELSE are needed here
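// (The IF prevents out-of-bounds loads from global memory near the matrix edges; the ELSE
// zero-fills those tile slots so the fixed-length loop over TILE_SIZE below still produces
// the correct sum and never reads uninitialized shared memory.)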
if(row < size_m && tile_k * TILE_SIZE + threadIdx.x < size_k)
{
// TODO: read the value from the matrix M and store it in the shared memory
tile_M[threadIdx.y][threadIdx.x] = M[row * size_k + tile_k * TILE_SIZE + threadIdx.x];
}
else
{
tile_M[threadIdx.y][threadIdx.x] = 0;
}
// TODO: think about why both the IF and ELSE are needed here
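// (Same reasoning for the N tile: guard the global load and zero-fill the out-of-range slots.)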
if(col < size_n && tile_k * TILE_SIZE + threadIdx.y < size_k)
{
// TODO: read the value from the matrix N and store it in the shared memory
tile_N[threadIdx.y][threadIdx.x] = N[(tile_k * TILE_SIZE + threadIdx.y) * size_n + col];
}
else
{
tile_N[threadIdx.y][threadIdx.x] = 0;
}
// TODO: synchronize all threads in this threadblock to make sure all threads populated the shared-memory tiles
__syncthreads();
if(row < size_m && col < size_n)
{
for(int k = 0; k < TILE_SIZE; k++)
{
// TODO: read the matrix elements from the shared memory, multiply them together and add the result to the local P value
val += tile_M[threadIdx.y][k] * tile_N[k][threadIdx.x];
}
}
// TODO: synchronize all threads in this threadblock to make sure all threads have finished working with the shared memory and it can be overwritten in the next iteration
__syncthreads();
}
if(row < size_m && col < size_n)
{
P[row * size_n + col] = val;
}
}
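// matrix_init fills a height x width row-major matrix with a_r * (row + 1) + a_c * (col + 1).
// M is initialized with a_r = 1, a_c = 0 (every element in row r equals r + 1) and N with
// a_r = 0, a_c = 1 (every element in column c equals c + 1), so every element of P = M * N
// should equal (row + 1) * (col + 1) * size_k, which is what matrix_check_result verifies.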
__global__ void matrix_init(float * matrix, int width, int height, float a_r, float a_c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < height && col < width)
matrix[row * width + col] = a_r * (row + 1) + a_c * (col + 1);
}
__global__ void matrix_check_result(float * matrix, int size_m, int size_n, int size_k, int * error_count)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < size_m && col < size_n)
{
float correct = (row + 1.0f) * (col + 1.0f) * size_k;
float observed = matrix[row * size_n + col];
if(fabsf(correct - observed) / correct > 1e-3)
atomicAdd(error_count, 1);
}
}
int main()
{
int size_m = 15394;
int size_n = 12287;
int size_k = 14480;
float * d_M;
float * d_N;
float * d_P;
int * d_error_count;
int error_count;
cudaMalloc(&d_M, size_m * size_k * sizeof(float));
cudaMalloc(&d_N, size_k * size_n * sizeof(float));
cudaMalloc(&d_P, size_m * size_n * sizeof(float));
cudaMalloc(&d_error_count, sizeof(int));
cudaEvent_t event_start, event_end;
cudaEventCreate(&event_start);
cudaEventCreate(&event_end);
dim3 tpb, bpg;
tpb = dim3(32, 32);
bpg = dim3((size_k - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_M, size_k, size_m, 1, 0);
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_k - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_N, size_n, size_k, 0, 1);
cudaDeviceSynchronize();
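// launch configuration for the multiplication kernels: one thread per element of P,
// with x spanning the size_n columns and y spanning the size_m rows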
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_naive<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_naive;
cudaEventElapsedTime(&time_naive, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication naive error count: %d\n", error_count);
else printf("Matrix multiplication naive seems OK\n");
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_tiled<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_tiled;
cudaEventElapsedTime(&time_tiled, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication tiled error count: %d\n", error_count);
else printf("Matrix multiplication tiled seems OK\n");
cudaEventDestroy(event_start);
cudaEventDestroy(event_end);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
cudaFree(d_error_count);
printf("\n");
printf("Time multiplication naive: %10.3f ms\n", time_naive);
printf("Time multiplication tiled: %10.3f ms\n", time_tiled);
printf("Speedup is %.2f\n", time_naive / time_tiled);
return 0;
}
Matrix multiplication
=====================
Finish the implementation of the multiplication of two non-square matrices by completing the TODO tasks in the `matrix_multiplication.cu` file. First complete the naive matrix multiplication kernel, then move on to the tiled matrix multiplication algorithm. The TODO tasks are only in the two kernels; you don't need to worry about the rest of the code (standard memory management, initialization, and correctness checking).
You are multiplying two matrices $M$ and $N$ to obtain the result matrix $P = MN$. The matrix $M$ has $size\_m$ rows and $size\_k$ columns. For the product to be defined, the matrix $N$ must have $size\_k$ rows; we give it $size\_n$ columns. The result matrix $P$ therefore has $size\_m$ rows and $size\_n$ columns. All matrices are stored in row-major order.
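For reference, the element of $P$ at row $r$ and column $c$ is $P_{r,c} = \sum_{k=0}^{size\_k - 1} M_{r,k} N_{k,c}$. With row-major storage, the element at row $r$ and column $c$ of a matrix with $w$ columns lives at flat index $r \cdot w + c$, so for example $M_{r,k}$ is accessed as `M[r * size_k + k]`.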
Compile and run, and observe the timing differences.
![matrix multiplication illustration](matmult.png)
tasks/matrix_multiplication/matmult.png (19 KiB)
#include <cstdio>
#define TILE_SIZE 32
__global__ void matrix_multiplication_naive(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col < size_n && row < size_m)
{
float val = 0;
for(int k = 0; k < size_k; k++)
{
// TODO: multiply the values from matrices M and N and add them to the local P value
val += ;
}
// TODO: write the local P value to the matrix P in global memory
P[ ] = val;
}
}
__global__ void matrix_multiplication_tiled(float * M, float * N, float * P, int size_m, int size_n, int size_k)
{
__shared__ float tile_M[TILE_SIZE][TILE_SIZE];
__shared__ float tile_N[TILE_SIZE][TILE_SIZE];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float val = 0;
int tile_count_k = (size_k - 1) / TILE_SIZE + 1;
for(int tile_k = 0; tile_k < tile_count_k; tile_k++)
{
// TODO: think about why both the IF and ELSE are needed here
if(row < size_m && tile_k * TILE_SIZE + threadIdx.x < size_k)
{
// TODO: read the value from the matrix M and store it in the shared memory
tile_M[threadIdx.y][threadIdx.x] = M[ ];
}
else
{
tile_M[threadIdx.y][threadIdx.x] = 0;
}
// TODO: think about why both the IF and ELSE are needed here
if(col < size_n && tile_k * TILE_SIZE + threadIdx.y < size_k)
{
// TODO: read the value from the matrix N and store it in the shared memory
tile_N[threadIdx.y][threadIdx.x] = N[ ];
}
else
{
tile_N[threadIdx.y][threadIdx.x] = 0;
}
// TODO: synchronize all threads in this threadblock to make sure all threads populated the shared-memory tiles
if(row < size_m && col < size_n)
{
for(int k = 0; k < TILE_SIZE; k++)
{
// TODO: read the matrix elements from the shared memory, multiply them together and add the result to the local P value
val += ;
}
}
// TODO: synchronize all threads in this threadblock to make sure all threads have finished working with the shared memory and it can be overwritten in the next iteration
}
if(row < size_m && col < size_n)
{
P[row * size_n + col] = val;
}
}
__global__ void matrix_init(float * matrix, int width, int height, float a_r, float a_c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < height && col < width)
matrix[row * width + col] = a_r * (row + 1) + a_c * (col + 1);
}
__global__ void matrix_check_result(float * matrix, int size_m, int size_n, int size_k, int * error_count)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(row < size_m && col < size_n)
{
float correct = (row + 1.0f) * (col + 1.0f) * size_k;
float observed = matrix[row * size_n + col];
if(fabsf(correct - observed) / correct > 1e-3)
atomicAdd(error_count, 1);
}
}
int main()
{
int size_m = 15394;
int size_n = 12287;
int size_k = 14480;
float * d_M;
float * d_N;
float * d_P;
int * d_error_count;
int error_count;
cudaMalloc(&d_M, size_m * size_k * sizeof(float));
cudaMalloc(&d_N, size_k * size_n * sizeof(float));
cudaMalloc(&d_P, size_m * size_n * sizeof(float));
cudaMalloc(&d_error_count, sizeof(int));
cudaEvent_t event_start, event_end;
cudaEventCreate(&event_start);
cudaEventCreate(&event_end);
dim3 tpb, bpg;
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_k - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_M, size_k, size_m, 1, 0);
tpb = dim3(TILE_SIZE, TILE_SIZE);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_k - 1) / tpb.y + 1);
matrix_init<<< bpg, tpb >>>(d_N, size_n, size_k, 0, 1);
cudaDeviceSynchronize();
tpb = dim3(32, 32);
bpg = dim3((size_n - 1) / tpb.x + 1, (size_m - 1) / tpb.y + 1);
cudaEventRecord(event_start);
matrix_multiplication_naive<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_naive;
cudaEventElapsedTime(&time_naive, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication naive error count: %d\n", error_count);
else printf("Matrix multiplication naive seems OK\n");
cudaEventRecord(event_start);
matrix_multiplication_tiled<<< bpg, tpb >>>(d_M, d_N, d_P, size_m, size_n, size_k);
cudaEventRecord(event_end);
cudaDeviceSynchronize();
float time_tiled;
cudaEventElapsedTime(&time_tiled, event_start, event_end);
error_count = 0;
cudaMemcpy(d_error_count, &error_count, sizeof(int), cudaMemcpyHostToDevice);
matrix_check_result<<< bpg, tpb >>>(d_P, size_m, size_n, size_k, d_error_count);
cudaMemcpy(&error_count, d_error_count, sizeof(int), cudaMemcpyDeviceToHost);
if(error_count > 0) printf("Matrix multiplication tiled error count: %d\n", error_count);
else printf("Matrix multiplication tiled seems OK\n");
cudaEventDestroy(event_start);
cudaEventDestroy(event_end);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
cudaFree(d_error_count);
printf("\n");
printf("Time multiplication naive: %10.3f ms\n", time_naive);
printf("Time multiplication tiled: %10.3f ms\n", time_tiled);
return 0;
}