Commit 5ddfb7ad authored by Lubomir Riha
Browse files

ENH: Tests on Greina

parent baa01d41
==2008== NVPROF is profiling process 2008, command: ./test
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
==13471== NVPROF is profiling process 13471, command: ./test
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
0 CG 3D only - iterations
==2008== Profiling application: ./test
==2008== Profiling result:
==13471== Profiling application: ./test
==13471== Profiling result:
Time(%) Time Calls Avg Min Max Name
86.49% 1.10959s 60 18.493ms 9.0560ms 34.658ms [CUDA memcpy DtoH]
4.63% 59.449ms 60 990.82us 941.46us 1.6431ms void transpose_readWrite_alignment_kernel<float, int=1, bool=0, int=6, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)
2.63% 33.777ms 30 1.1259ms 1.1113ms 1.2678ms void device_memset_kernel<float>(float*, float, unsigned long, unsigned long)
1.83% 23.475ms 10 2.3475ms 2.1451ms 3.3271ms void thomas_kernel3D_X2<float>(int, float, float, float const *, float const *, float*)
1.79% 22.982ms 10 2.2982ms 2.1727ms 2.8805ms void thomas_kernel3D_X1<float>(int, float, float, float const *, float const *, float*)
1.72% 22.008ms 10 2.2008ms 2.1322ms 2.6128ms void thomas_kernel3D_XT<float>(int, float, float, float const *, float const *, float*)
0.90% 11.558ms 3 3.8526ms 1.0880us 11.555ms [CUDA memcpy HtoD]
71.46% 115.47ms 61 1.8929ms 1.6317ms 3.2461ms [CUDA memcpy DtoH]
11.63% 18.785ms 10 1.8785ms 1.8763ms 1.8808ms void thomas_kernel3D_XT<double>(int, double, double, double const *, double const *, double*)
7.24% 11.705ms 60 195.08us 191.17us 199.36us void transpose_readWrite_alignment_kernel<double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *)
3.67% 5.9379ms 30 197.93us 197.83us 198.08us void device_memset_kernel<double>(double*, double, unsigned long, unsigned long)
2.53% 4.0851ms 10 408.51us 406.60us 409.60us void thomas_kernel3D_X2<double>(int, double, double, double const *, double const *, double*)
2.45% 3.9618ms 10 396.18us 393.22us 399.56us void thomas_kernel3D_X1<double>(int, double, double, double const *, double const *, double*)
1.02% 1.6467ms 3 548.89us 1.3120us 1.6439ms [CUDA memcpy HtoD]
==2008== API calls:
==13471== API calls:
Time(%) Time Calls Avg Min Max Name
60.21% 1.31093s 64 20.483ms 16.686us 35.443ms cudaMemcpy
39.22% 853.92ms 34 25.115ms 41.126us 468.70ms cudaFree
0.29% 6.2742ms 37 169.57us 9.8680us 1.2145ms cudaMalloc
0.14% 3.0592ms 498 6.1430us 115ns 314.36us cuDeviceGetAttribute
0.11% 2.2868ms 120 19.056us 5.4900us 66.439us cudaLaunch
0.01% 306.86us 6 51.143us 41.052us 57.264us cuDeviceTotalMem
0.01% 244.44us 6 40.740us 30.077us 44.083us cuDeviceGetName
0.01% 128.60us 540 238ns 116ns 5.8340us cudaSetupArgument
0.01% 123.53us 120 1.0290us 113ns 20.042us cudaGetLastError
0.00% 94.913us 120 790ns 151ns 6.9880us cudaConfigureCall
0.00% 11.433us 24 476ns 275ns 1.6420us cudaDeviceGetAttribute
0.00% 10.720us 16 670ns 486ns 2.1280us cudaEventCreateWithFlags
0.00% 5.6170us 1 5.6170us 5.6170us 5.6170us cudaFuncGetAttributes
0.00% 5.5810us 3 1.8600us 1.5550us 2.1820us cudaGetDevice
0.00% 3.0990us 8 387ns 226ns 716ns cuDeviceGet
0.00% 3.0920us 4 773ns 275ns 1.9770us cuDeviceGetCount
0.00% 2.1240us 1 2.1240us 2.1240us 2.1240us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.6690us 2 834ns 629ns 1.0400us cuInit
0.00% 1.3900us 1 1.3900us 1.3900us 1.3900us cudaGetDeviceCount
0.00% 740ns 2 370ns 247ns 493ns cuDriverGetVersion
79.96% 717.70ms 34 21.109ms 29.559us 448.66ms cudaFree
18.59% 166.90ms 64 2.6078ms 16.337us 3.9920ms cudaMemcpy
0.58% 5.2375ms 37 141.56us 7.4150us 610.90us cudaMalloc
0.39% 3.5014ms 6 583.57us 263.44us 759.30us cuDeviceTotalMem
0.26% 2.3559ms 530 4.4450us 110ns 168.80us cuDeviceGetAttribute
0.15% 1.3777ms 120 11.480us 3.8720us 34.521us cudaLaunch
0.02% 202.11us 6 33.684us 19.455us 41.192us cuDeviceGetName
0.01% 116.60us 540 215ns 123ns 3.0280us cudaSetupArgument
0.01% 62.819us 120 523ns 122ns 12.821us cudaGetLastError
0.01% 55.838us 120 465ns 145ns 2.4750us cudaConfigureCall
0.00% 10.752us 25 430ns 235ns 1.6210us cudaDeviceGetAttribute
0.00% 9.6380us 16 602ns 370ns 2.7540us cudaEventCreateWithFlags
0.00% 6.7920us 5 1.3580us 276ns 4.5080us cuDeviceGetCount
0.00% 6.7040us 1 6.7040us 6.7040us 6.7040us cudaFuncGetAttributes
0.00% 6.1460us 3 2.0480us 1.9950us 2.0820us cudaGetDevice
0.00% 5.8930us 10 589ns 235ns 1.1650us cuDeviceGet
0.00% 3.4380us 1 3.4380us 3.4380us 3.4380us cudaGetDeviceCount
0.00% 3.0020us 1 3.0020us 3.0020us 3.0020us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.7040us 2 852ns 570ns 1.1340us cuInit
0.00% 1.1100us 2 555ns 519ns 591ns cuDriverGetVersion
......@@ -2,19 +2,21 @@
#echo "M BANDS THREADS W" | tr " " "\t"
for M in 32 64 128 256 # 512 1024 2048 4096 8192
#nvcc -O3 -DBANDS=8 -DTILE_SIZE=8 -DW=8 -DTHREADS=1 -DM=8 -arch sm_35 -lcublas -lcusparse -o test test.cu && ./test 2>&1
for M in 128 # 32 64 128 256 512 # 512 1024 2048 4096 8192
do
#echo " --- "
for BANDS in 1 # 4 8 16 32
do
for THREADS in 1 #8 16 32 64 # 128 256 512 1024 # 256 512 1024 #256 512 1024
do
for W in 8 16 32 # 4 8 16 32
for W in 2 4 8 16 32 # 8 16 32 # 4 8 16 32
do
# echo "M TILE_SIZE" | tr " " "\t"
echo $M $W | tr " " "\t" | tr "\n" "\t"
# nvcc -O3 -DBANDS=$BANDS -DTILE_SIZE=$W -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu && ./test 2>&1
#nvcc -O3 -DBANDS=$BANDS -DTILE_SIZE=$W -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu && ./test 2>&1
nvcc -O3 -DBANDS=$BANDS -DTILE_SIZE=$W -DW=$W -DTHREADS=$THREADS -DM=$M -arch sm_35 -lcublas -lcusparse -o test test.cu 2>&1 | grep "rror"
nvprof ./test 2>&1 | tee out.txt > t.t
......
==2008== NVPROF is profiling process 2008, command: ./test
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
==13471== NVPROF is profiling process 13471, command: ./test
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Blocks = 256
threads_per_block = 256
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
Blocks = 128
threads_per_block = 128
2.17779e+12
2.17779e+12
Thomas 3D Tiled kernel - Blocks: 2048 Threads = 32
1.46611e+14
1.46611e+14
Thomas 3D Tiled kernel - Blocks: 512 Threads = 32
2.17779e+12
2.17779e+12
0 CG 3D only - iterations
==2008== Profiling application: ./test
==2008== Profiling result:
==13471== Profiling application: ./test
==13471== Profiling result:
Time(%) Time Calls Avg Min Max Name
86.49% 1.10959s 60 18.493ms 9.0560ms 34.658ms [CUDA memcpy DtoH]
4.63% 59.449ms 60 990.82us 941.46us 1.6431ms void transpose_readWrite_alignment_kernel<float, int=1, bool=0, int=6, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)
2.63% 33.777ms 30 1.1259ms 1.1113ms 1.2678ms void device_memset_kernel<float>(float*, float, unsigned long, unsigned long)
1.83% 23.475ms 10 2.3475ms 2.1451ms 3.3271ms void thomas_kernel3D_X2<float>(int, float, float, float const *, float const *, float*)
1.79% 22.982ms 10 2.2982ms 2.1727ms 2.8805ms void thomas_kernel3D_X1<float>(int, float, float, float const *, float const *, float*)
1.72% 22.008ms 10 2.2008ms 2.1322ms 2.6128ms void thomas_kernel3D_XT<float>(int, float, float, float const *, float const *, float*)
0.90% 11.558ms 3 3.8526ms 1.0880us 11.555ms [CUDA memcpy HtoD]
71.46% 115.47ms 61 1.8929ms 1.6317ms 3.2461ms [CUDA memcpy DtoH]
11.63% 18.785ms 10 1.8785ms 1.8763ms 1.8808ms void thomas_kernel3D_XT<double>(int, double, double, double const *, double const *, double*)
7.24% 11.705ms 60 195.08us 191.17us 199.36us void transpose_readWrite_alignment_kernel<double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *)
3.67% 5.9379ms 30 197.93us 197.83us 198.08us void device_memset_kernel<double>(double*, double, unsigned long, unsigned long)
2.53% 4.0851ms 10 408.51us 406.60us 409.60us void thomas_kernel3D_X2<double>(int, double, double, double const *, double const *, double*)
2.45% 3.9618ms 10 396.18us 393.22us 399.56us void thomas_kernel3D_X1<double>(int, double, double, double const *, double const *, double*)
1.02% 1.6467ms 3 548.89us 1.3120us 1.6439ms [CUDA memcpy HtoD]
==2008== API calls:
==13471== API calls:
Time(%) Time Calls Avg Min Max Name
60.21% 1.31093s 64 20.483ms 16.686us 35.443ms cudaMemcpy
39.22% 853.92ms 34 25.115ms 41.126us 468.70ms cudaFree
0.29% 6.2742ms 37 169.57us 9.8680us 1.2145ms cudaMalloc
0.14% 3.0592ms 498 6.1430us 115ns 314.36us cuDeviceGetAttribute
0.11% 2.2868ms 120 19.056us 5.4900us 66.439us cudaLaunch
0.01% 306.86us 6 51.143us 41.052us 57.264us cuDeviceTotalMem
0.01% 244.44us 6 40.740us 30.077us 44.083us cuDeviceGetName
0.01% 128.60us 540 238ns 116ns 5.8340us cudaSetupArgument
0.01% 123.53us 120 1.0290us 113ns 20.042us cudaGetLastError
0.00% 94.913us 120 790ns 151ns 6.9880us cudaConfigureCall
0.00% 11.433us 24 476ns 275ns 1.6420us cudaDeviceGetAttribute
0.00% 10.720us 16 670ns 486ns 2.1280us cudaEventCreateWithFlags
0.00% 5.6170us 1 5.6170us 5.6170us 5.6170us cudaFuncGetAttributes
0.00% 5.5810us 3 1.8600us 1.5550us 2.1820us cudaGetDevice
0.00% 3.0990us 8 387ns 226ns 716ns cuDeviceGet
0.00% 3.0920us 4 773ns 275ns 1.9770us cuDeviceGetCount
0.00% 2.1240us 1 2.1240us 2.1240us 2.1240us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.6690us 2 834ns 629ns 1.0400us cuInit
0.00% 1.3900us 1 1.3900us 1.3900us 1.3900us cudaGetDeviceCount
0.00% 740ns 2 370ns 247ns 493ns cuDriverGetVersion
79.96% 717.70ms 34 21.109ms 29.559us 448.66ms cudaFree
18.59% 166.90ms 64 2.6078ms 16.337us 3.9920ms cudaMemcpy
0.58% 5.2375ms 37 141.56us 7.4150us 610.90us cudaMalloc
0.39% 3.5014ms 6 583.57us 263.44us 759.30us cuDeviceTotalMem
0.26% 2.3559ms 530 4.4450us 110ns 168.80us cuDeviceGetAttribute
0.15% 1.3777ms 120 11.480us 3.8720us 34.521us cudaLaunch
0.02% 202.11us 6 33.684us 19.455us 41.192us cuDeviceGetName
0.01% 116.60us 540 215ns 123ns 3.0280us cudaSetupArgument
0.01% 62.819us 120 523ns 122ns 12.821us cudaGetLastError
0.01% 55.838us 120 465ns 145ns 2.4750us cudaConfigureCall
0.00% 10.752us 25 430ns 235ns 1.6210us cudaDeviceGetAttribute
0.00% 9.6380us 16 602ns 370ns 2.7540us cudaEventCreateWithFlags
0.00% 6.7920us 5 1.3580us 276ns 4.5080us cuDeviceGetCount
0.00% 6.7040us 1 6.7040us 6.7040us 6.7040us cudaFuncGetAttributes
0.00% 6.1460us 3 2.0480us 1.9950us 2.0820us cudaGetDevice
0.00% 5.8930us 10 589ns 235ns 1.1650us cuDeviceGet
0.00% 3.4380us 1 3.4380us 3.4380us 3.4380us cudaGetDeviceCount
0.00% 3.0020us 1 3.0020us 3.0020us 3.0020us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.7040us 2 852ns 570ns 1.1340us cuInit
0.00% 1.1100us 2 555ns 519ns 591ns cuDriverGetVersion
No preview for this file type
......@@ -184,7 +184,7 @@ int main (int argc, char *argv[])
int num_iter;
#define PREC float
#define PREC double
PREC *b_3d, *x_3d;
int m_3d = M;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment