Commit baa01d41 authored by Lubomir Riha's avatar Lubomir Riha
Browse files

Thomas 3D - all kernel works good - there is a barrier in the tiled kernel...

Thomas 3D - all kernel works good - there is a barrier in the tiled kernel between forward and backward substitution
parent d184036e
==65297== NVPROF is profiling process 65297, command: ./test
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
21 Cheb only - iterations
==65297== Profiling application: ./test
==65297== Profiling result:
==2008== NVPROF is profiling process 2008, command: ./test
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
0 CG 3D only - iterations
==2008== Profiling application: ./test
==2008== Profiling result:
Time(%) Time Calls Avg Min Max Name
59.10% 4.56921s 21 217.58ms 216.35ms 218.94ms void thomas_kernel_trans<double>(int, int, int, double, double, double const *, double const *, double*)
16.21% 1.25326s 42 29.840ms 23.174ms 37.027ms void thomas_kernel<double>(int, int, int, double, double, double const *, double const *, double*)
8.67% 670.18ms 63 10.638ms 10.620ms 10.660ms void axpy_kernel_val<double, int=0>(cublasAxpyParamsVal<double>)
4.31% 333.45ms 42 7.9393ms 7.9248ms 7.9554ms void transpose_readWrite_alignment_kernel<double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *)
3.41% 263.68ms 22 11.985ms 11.982ms 11.990ms void multiply_by_A<double>(int, double, double const *, double*)
2.19% 169.62ms 44 3.8551ms 29.152us 8.8965ms void nrm2_kernel<double, double, int=0, int=0, int=128, int=0>(cublasNrm2Params<double, double>)
2.14% 165.37ms 23 7.1900ms 7.0845ms 7.2005ms void copy_kernel<double, int=0>(cublasCopyParams<double>)
1.97% 152.29ms 3 50.762ms 1.2800us 152.27ms [CUDA memcpy HtoD]
1.82% 141.00ms 20 7.0501ms 7.0441ms 7.0563ms void scal_kernel_val<double, double, int=0>(cublasScalParamsVal<double, double>)
0.08% 6.5645ms 1 6.5645ms 6.5645ms 6.5645ms void device_memset_kernel<double>(double*, double, unsigned long, unsigned long)
0.08% 6.4024ms 1 6.4024ms 6.4024ms 6.4024ms void device_memset_kernel_inc<double>(double*, double, unsigned long, unsigned long)
0.00% 104.45us 22 4.7470us 4.6720us 5.2160us [CUDA memset]
0.00% 56.802us 22 2.5810us 2.1440us 10.144us [CUDA memcpy DtoH]
86.49% 1.10959s 60 18.493ms 9.0560ms 34.658ms [CUDA memcpy DtoH]
4.63% 59.449ms 60 990.82us 941.46us 1.6431ms void transpose_readWrite_alignment_kernel<float, int=1, bool=0, int=6, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)
2.63% 33.777ms 30 1.1259ms 1.1113ms 1.2678ms void device_memset_kernel<float>(float*, float, unsigned long, unsigned long)
1.83% 23.475ms 10 2.3475ms 2.1451ms 3.3271ms void thomas_kernel3D_X2<float>(int, float, float, float const *, float const *, float*)
1.79% 22.982ms 10 2.2982ms 2.1727ms 2.8805ms void thomas_kernel3D_X1<float>(int, float, float, float const *, float const *, float*)
1.72% 22.008ms 10 2.2008ms 2.1322ms 2.6128ms void thomas_kernel3D_XT<float>(int, float, float, float const *, float const *, float*)
0.90% 11.558ms 3 3.8526ms 1.0880us 11.555ms [CUDA memcpy HtoD]
==65297== API calls:
==2008== API calls:
Time(%) Time Calls Avg Min Max Name
91.56% 7.72952s 25 309.18ms 23.697us 360.08ms cudaMemcpy
8.34% 704.04ms 4 176.01ms 71.078us 352.97ms cudaFree
0.05% 4.4002ms 11 400.02us 10.274us 1.6367ms cudaMalloc
0.03% 2.1981ms 279 7.8780us 6.4650us 44.498us cudaLaunch
0.01% 642.11us 249 2.5780us 169ns 95.283us cuDeviceGetAttribute
0.00% 229.58us 918 250ns 165ns 7.7740us cudaSetupArgument
0.00% 211.69us 22 9.6220us 8.8190us 17.379us cudaMemsetAsync
0.00% 95.442us 279 342ns 196ns 8.0960us cudaConfigureCall
0.00% 94.196us 384 245ns 146ns 1.3530us cudaGetLastError
0.00% 88.827us 3 29.609us 29.032us 30.188us cuDeviceTotalMem
0.00% 82.462us 23 3.5850us 2.7790us 14.771us cudaFuncGetAttributes
0.00% 74.404us 3 24.801us 22.361us 27.518us cuDeviceGetName
0.00% 68.968us 24 2.8730us 332ns 50.750us cudaDeviceGetAttribute
0.00% 62.369us 22 2.8340us 1.3570us 9.0910us cudaEventRecord
0.00% 57.774us 1 57.774us 57.774us 57.774us cudaGetDeviceCount
0.00% 55.013us 22 2.5000us 2.2850us 4.5390us cudaEventQuery
0.00% 16.134us 1 16.134us 16.134us 16.134us cudaOccupancyMaxActiveBlocksPerMultiprocessor
0.00% 10.411us 8 1.3010us 654ns 4.1590us cudaEventCreateWithFlags
0.00% 9.0740us 3 3.0240us 2.1970us 3.5180us cudaGetDevice
0.00% 2.3890us 4 597ns 253ns 1.4910us cuDeviceGetCount
0.00% 2.0660us 2 1.0330us 874ns 1.1920us cuInit
0.00% 1.5060us 4 376ns 288ns 582ns cuDeviceGet
0.00% 1.0740us 2 537ns 440ns 634ns cuDriverGetVersion
60.21% 1.31093s 64 20.483ms 16.686us 35.443ms cudaMemcpy
39.22% 853.92ms 34 25.115ms 41.126us 468.70ms cudaFree
0.29% 6.2742ms 37 169.57us 9.8680us 1.2145ms cudaMalloc
0.14% 3.0592ms 498 6.1430us 115ns 314.36us cuDeviceGetAttribute
0.11% 2.2868ms 120 19.056us 5.4900us 66.439us cudaLaunch
0.01% 306.86us 6 51.143us 41.052us 57.264us cuDeviceTotalMem
0.01% 244.44us 6 40.740us 30.077us 44.083us cuDeviceGetName
0.01% 128.60us 540 238ns 116ns 5.8340us cudaSetupArgument
0.01% 123.53us 120 1.0290us 113ns 20.042us cudaGetLastError
0.00% 94.913us 120 790ns 151ns 6.9880us cudaConfigureCall
0.00% 11.433us 24 476ns 275ns 1.6420us cudaDeviceGetAttribute
0.00% 10.720us 16 670ns 486ns 2.1280us cudaEventCreateWithFlags
0.00% 5.6170us 1 5.6170us 5.6170us 5.6170us cudaFuncGetAttributes
0.00% 5.5810us 3 1.8600us 1.5550us 2.1820us cudaGetDevice
0.00% 3.0990us 8 387ns 226ns 716ns cuDeviceGet
0.00% 3.0920us 4 773ns 275ns 1.9770us cuDeviceGetCount
0.00% 2.1240us 1 2.1240us 2.1240us 2.1240us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.6690us 2 834ns 629ns 1.0400us cuInit
0.00% 1.3900us 1 1.3900us 1.3900us 1.3900us cudaGetDeviceCount
0.00% 740ns 2 370ns 247ns 493ns cuDriverGetVersion
......@@ -790,7 +790,7 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
//#define TILE_SIZE 2
//TODO: Should be #define
int TILE_SIZE = blockDim.x;
//int TILE_SIZE = blockDim.x;
__shared__ FT sh_b[TILE_SIZE][TILE_SIZE+1];
__shared__ FT sh_x[TILE_SIZE][TILE_SIZE+1];
......@@ -799,11 +799,16 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
int tid_l = threadIdx.x;
int bid = blockIdx.x;
//int tid = blockIdx.x * + threadIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Basis of an address used to read data from global memory to tiles in shared memory
// - this is reused multiple times
int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m;
//int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m;
int base_addr = tid_l +
(tid / m) * m +
(m*m) * TILE_SIZE * ( bid % (m / TILE_SIZE));
// + tile * TILE_SIZE
// + i*m*m
// **************************************************************************************************************
// *** Forward substitution ************************************************************************************
......@@ -824,9 +829,10 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
// Calculate the rest of the forward substitution for the first tile
#pragma unroll
for (int i = 1; i < TILE_SIZE; ++i) {
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
sh_x[i][tid_l] = work_buffer_reg;
//printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, i-1);
int ca = i - 1;
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]);
sh_x[i][tid_l] = work_buffer_reg;
//printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
}
// Save results of for the first tile to the global memory
......@@ -870,6 +876,7 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
// *** END - Forward substitution ************************************************************************************
// **************************************************************************************************************
__syncthreads();
// **************************************************************************************************************
// *** Backward substitution ************************************************************************************
......@@ -877,7 +884,8 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
// Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory
#pragma unroll
for (int i = TILE_SIZE-2; i >= 0; --i) {
work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[m - TILE_SIZE + i] * work_buffer_reg;
int ca = (TILES-1) * TILE_SIZE + i;
work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[ ca ] * work_buffer_reg;
sh_x[i][tid_l] = work_buffer_reg;
//printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i);
......@@ -886,7 +894,7 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
// Backward substitution - last TILE - store results from tile in shared memory to global memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
int a = base_addr + m*m*i + m - TILE_SIZE;
int a = base_addr + m*m*i + (TILES-1) * TILE_SIZE; //m - TILE_SIZE;
x[ a ] = sh_x[tid_l][i];
//printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
......@@ -1126,15 +1134,18 @@ public:
// delete --------------
FT *h_x;
h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
FT *h_x2;
h_x2 = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0)
std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
if (m <= 8) std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
if (m <= 8) std::cout << std::endl;
FT *xx = x;
......@@ -1154,12 +1165,16 @@ public:
FT sum = 0.0;
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0)
std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
//std::cout << h_x[i] << "\t";
if (m <= 8) printf("%4.1f\t",h_x[i]);
sum+=h_x[i];
}
std::cout << std::endl << std::endl << sum << std::endl;
if (m <= 8) std::cout << std::endl;
std::cout << sum << std::endl;
// Ker 2 *****************
......@@ -1168,10 +1183,12 @@ public:
cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
if (m <= 8) std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
if (m <= 8) std::cout << std::endl;
thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb);
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
......@@ -1180,11 +1197,16 @@ public:
cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
//std::cout << h_x[i] << "\t";
if (m <= 8) printf("%4.1f\t",h_x[i]);
sum+=h_x[i];
h_x2[i] = h_x[i];
}
std::cout << std::endl << std::endl << sum <<std::endl;
if (m <= 8) std::cout << std::endl;
std::cout << sum <<std::endl;
// Ker 3 *****************
......@@ -1196,12 +1218,20 @@ public:
cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
if (m <= 8) std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
if (m <= 8) std::cout << std::endl;
thomas_kernel3D_XT<FT><<<4, 4>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb);
int blocks = m*m / TILE_SIZE;
int threads = TILE_SIZE;
//blocks = 1;
//threads = 2;
std::cout << "\nThomas 3D Tiled kernel - Blocks: " << blocks << " Threads = " << threads << "\n";
thomas_kernel3D_XT<FT><<<blocks, threads>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb);
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
cublas_transpose2(cublas_handle, m*m, m, bbb, xx);
......@@ -1209,17 +1239,36 @@ public:
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
//std::cout << h_x[i] << "\t";
if (m <= 8)
printf("%4.1f\t",h_x[i]);
sum+=h_x[i];
}
std::cout << std::endl << std::endl << sum <<std::endl;
if (m <= 8) std::cout << std::endl;
std::cout << sum <<std::endl;
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (m <= 8)
if (i % (m*m) == 0)
std::cout << std::endl;
//std::cout << h_x[i] << "\t";
if (m <= 8)
printf("%4.1f\t",h_x2[i] - h_x[i]);
sum+=h_x[i];
}
if (m <= 8) std::cout << std::endl;
std::cout << sum <<std::endl;
cudaFree(bb);
cudaFree(bbb);
cudaFree(bbbb);
//device_memset<FT>(x, FT(0), m*m);
......
......@@ -2,18 +2,27 @@
#echo "M BANDS THREADS W" | tr " " "\t"
for M in 4 # 512 1024 2048 4096 8192
for M in 32 64 128 256 # 512 1024 2048 4096 8192
do
#echo " --- "
for BANDS in 1 # 4 8 16 32
do
for THREADS in 4 #8 16 32 64 # 128 256 512 1024 # 256 512 1024 #256 512 1024
for THREADS in 1 #8 16 32 64 # 128 256 512 1024 # 256 512 1024 #256 512 1024
do
for W in 1 # 4 8 16 32
for W in 8 16 32 # 4 8 16 32
do
# echo "M BANDS THREADS W" | tr " " "\t"
# echo $M $BANDS $THREADS $W | tr " " "\t" # | tr "\n" "\t"
nvcc -O3 -DBANDS=$BANDS -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu && ./test 2>&1
# echo "M TILE_SIZE" | tr " " "\t"
echo $M $W | tr " " "\t" | tr "\n" "\t"
# nvcc -O3 -DBANDS=$BANDS -DTILE_SIZE=$W -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu && ./test 2>&1
nvcc -O3 -DBANDS=$BANDS -DTILE_SIZE=$W -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu 2>&1 | grep "rror"
nvprof ./test 2>&1 | tee out.txt > t.t
cat out.txt | grep -e "thomas_kernel3D_X1<" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
cat out.txt | grep -e "thomas_kernel3D_X2<" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
cat out.txt | grep -e "thomas_kernel3D_XT<" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
cat out.txt | grep -e "transpose" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
echo " "
#nvprof ./test 2>&1 | grep -e "invM_kernel" -e "transpose" | cut -d "u" -f 1 | cut -d " " -f 15 | tr "\n" "\t"
......
......@@ -387,7 +387,9 @@ int solve_with_conjugate_gradient3D(cublasHandle_t const cublas_handle,
}
//delete
preconditioner->run(b, x);
for (int r = 0; r < 10; r++) {
preconditioner->run(b, x);
}
return 0;
// delete - end
......
==65297== NVPROF is profiling process 65297, command: ./test
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
21 Cheb only - iterations
==65297== Profiling application: ./test
==65297== Profiling result:
==2008== NVPROF is profiling process 2008, command: ./test
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
0 CG 3D only - iterations
==2008== Profiling application: ./test
==2008== Profiling result:
Time(%) Time Calls Avg Min Max Name
59.10% 4.56921s 21 217.58ms 216.35ms 218.94ms void thomas_kernel_trans<double>(int, int, int, double, double, double const *, double const *, double*)
16.21% 1.25326s 42 29.840ms 23.174ms 37.027ms void thomas_kernel<double>(int, int, int, double, double, double const *, double const *, double*)
8.67% 670.18ms 63 10.638ms 10.620ms 10.660ms void axpy_kernel_val<double, int=0>(cublasAxpyParamsVal<double>)
4.31% 333.45ms 42 7.9393ms 7.9248ms 7.9554ms void transpose_readWrite_alignment_kernel<double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *)
3.41% 263.68ms 22 11.985ms 11.982ms 11.990ms void multiply_by_A<double>(int, double, double const *, double*)
2.19% 169.62ms 44 3.8551ms 29.152us 8.8965ms void nrm2_kernel<double, double, int=0, int=0, int=128, int=0>(cublasNrm2Params<double, double>)
2.14% 165.37ms 23 7.1900ms 7.0845ms 7.2005ms void copy_kernel<double, int=0>(cublasCopyParams<double>)
1.97% 152.29ms 3 50.762ms 1.2800us 152.27ms [CUDA memcpy HtoD]
1.82% 141.00ms 20 7.0501ms 7.0441ms 7.0563ms void scal_kernel_val<double, double, int=0>(cublasScalParamsVal<double, double>)
0.08% 6.5645ms 1 6.5645ms 6.5645ms 6.5645ms void device_memset_kernel<double>(double*, double, unsigned long, unsigned long)
0.08% 6.4024ms 1 6.4024ms 6.4024ms 6.4024ms void device_memset_kernel_inc<double>(double*, double, unsigned long, unsigned long)
0.00% 104.45us 22 4.7470us 4.6720us 5.2160us [CUDA memset]
0.00% 56.802us 22 2.5810us 2.1440us 10.144us [CUDA memcpy DtoH]
86.49% 1.10959s 60 18.493ms 9.0560ms 34.658ms [CUDA memcpy DtoH]
4.63% 59.449ms 60 990.82us 941.46us 1.6431ms void transpose_readWrite_alignment_kernel<float, int=1, bool=0, int=6, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)
2.63% 33.777ms 30 1.1259ms 1.1113ms 1.2678ms void device_memset_kernel<float>(float*, float, unsigned long, unsigned long)
1.83% 23.475ms 10 2.3475ms 2.1451ms 3.3271ms void thomas_kernel3D_X2<float>(int, float, float, float const *, float const *, float*)
1.79% 22.982ms 10 2.2982ms 2.1727ms 2.8805ms void thomas_kernel3D_X1<float>(int, float, float, float const *, float const *, float*)
1.72% 22.008ms 10 2.2008ms 2.1322ms 2.6128ms void thomas_kernel3D_XT<float>(int, float, float, float const *, float const *, float*)
0.90% 11.558ms 3 3.8526ms 1.0880us 11.555ms [CUDA memcpy HtoD]
==65297== API calls:
==2008== API calls:
Time(%) Time Calls Avg Min Max Name
91.56% 7.72952s 25 309.18ms 23.697us 360.08ms cudaMemcpy
8.34% 704.04ms 4 176.01ms 71.078us 352.97ms cudaFree
0.05% 4.4002ms 11 400.02us 10.274us 1.6367ms cudaMalloc
0.03% 2.1981ms 279 7.8780us 6.4650us 44.498us cudaLaunch
0.01% 642.11us 249 2.5780us 169ns 95.283us cuDeviceGetAttribute
0.00% 229.58us 918 250ns 165ns 7.7740us cudaSetupArgument
0.00% 211.69us 22 9.6220us 8.8190us 17.379us cudaMemsetAsync
0.00% 95.442us 279 342ns 196ns 8.0960us cudaConfigureCall
0.00% 94.196us 384 245ns 146ns 1.3530us cudaGetLastError
0.00% 88.827us 3 29.609us 29.032us 30.188us cuDeviceTotalMem
0.00% 82.462us 23 3.5850us 2.7790us 14.771us cudaFuncGetAttributes
0.00% 74.404us 3 24.801us 22.361us 27.518us cuDeviceGetName
0.00% 68.968us 24 2.8730us 332ns 50.750us cudaDeviceGetAttribute
0.00% 62.369us 22 2.8340us 1.3570us 9.0910us cudaEventRecord
0.00% 57.774us 1 57.774us 57.774us 57.774us cudaGetDeviceCount
0.00% 55.013us 22 2.5000us 2.2850us 4.5390us cudaEventQuery
0.00% 16.134us 1 16.134us 16.134us 16.134us cudaOccupancyMaxActiveBlocksPerMultiprocessor
0.00% 10.411us 8 1.3010us 654ns 4.1590us cudaEventCreateWithFlags
0.00% 9.0740us 3 3.0240us 2.1970us 3.5180us cudaGetDevice
0.00% 2.3890us 4 597ns 253ns 1.4910us cuDeviceGetCount
0.00% 2.0660us 2 1.0330us 874ns 1.1920us cuInit
0.00% 1.5060us 4 376ns 288ns 582ns cuDeviceGet
0.00% 1.0740us 2 537ns 440ns 634ns cuDriverGetVersion
60.21% 1.31093s 64 20.483ms 16.686us 35.443ms cudaMemcpy
39.22% 853.92ms 34 25.115ms 41.126us 468.70ms cudaFree
0.29% 6.2742ms 37 169.57us 9.8680us 1.2145ms cudaMalloc
0.14% 3.0592ms 498 6.1430us 115ns 314.36us cuDeviceGetAttribute
0.11% 2.2868ms 120 19.056us 5.4900us 66.439us cudaLaunch
0.01% 306.86us 6 51.143us 41.052us 57.264us cuDeviceTotalMem
0.01% 244.44us 6 40.740us 30.077us 44.083us cuDeviceGetName
0.01% 128.60us 540 238ns 116ns 5.8340us cudaSetupArgument
0.01% 123.53us 120 1.0290us 113ns 20.042us cudaGetLastError
0.00% 94.913us 120 790ns 151ns 6.9880us cudaConfigureCall
0.00% 11.433us 24 476ns 275ns 1.6420us cudaDeviceGetAttribute
0.00% 10.720us 16 670ns 486ns 2.1280us cudaEventCreateWithFlags
0.00% 5.6170us 1 5.6170us 5.6170us 5.6170us cudaFuncGetAttributes
0.00% 5.5810us 3 1.8600us 1.5550us 2.1820us cudaGetDevice
0.00% 3.0990us 8 387ns 226ns 716ns cuDeviceGet
0.00% 3.0920us 4 773ns 275ns 1.9770us cuDeviceGetCount
0.00% 2.1240us 1 2.1240us 2.1240us 2.1240us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags