Commit ac0c5b0f authored by Lubomir Riha's avatar Lubomir Riha
Browse files

init commit 2

parents
File added
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.toolchain.gnu.base.1189300763">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.base.1189300763" moduleId="org.eclipse.cdt.core.settings" name="Default">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.base.1189300763" name="Default" parent="org.eclipse.cdt.build.core.emptycfg">
<folderInfo id="cdt.managedbuild.toolchain.gnu.base.1189300763.1949234292" name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.base.2083866336" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.base">
<!-- Tool chain "Linux GCC": target platform, builder, and per-tool settings for this configuration. -->
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.base.54675822" name="Debug Platform" osList="linux,hpux,aix,qnx" superClass="cdt.managedbuild.target.gnu.platform.base"/>
<!-- The builder carries managedBuildOn="false"; presumably the project is built with its own makefile (confirm). -->
<builder id="cdt.managedbuild.target.gnu.builder.base.697103941" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.base"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1141652391" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.base.1162197004" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.base">
<!-- C++ include paths: absolute, machine-specific directories under /opt/intel/composer_xe_2015.2.164 (ipp, mkl, tbb) and /usr/local/cuda-7.5. -->
<!-- NOTE(review): /usr/local/cuda-7.5/bin is a "bin" directory listed as an include path; confirm it is intentional. -->
<option id="gnu.cpp.compiler.option.include.paths.439749642" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/ipp/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/mkl/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/tbb/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/bin"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1738923537" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.base.443427589" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.base">
<!-- C include paths: the same five directories as the C++ compiler option above. -->
<option id="gnu.c.compiler.option.include.paths.1922115419" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/ipp/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/mkl/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/tbb/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/bin"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.713760702" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.base.1158639792" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.base.11344238" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.base">
<!-- The C++ linker input takes $(USER_OBJS) as an additional dependency and $(LIBS) as an additional input. -->
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1109276174" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.base.367700526" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.base">
<!-- Assembler include paths: the same five directories as the compiler options above. -->
<option id="gnu.both.asm.option.include.paths.1522581626" superClass="gnu.both.asm.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/ipp/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/mkl/include"/>
<listOptionValue builtIn="false" value="/opt/intel/composer_xe_2015.2.164/tbb/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/include"/>
<listOptionValue builtIn="false" value="/usr/local/cuda-7.5/bin"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.841723696" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="eth.null.257619840" name="eth"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Default">
<resource resourceType="PROJECT" workspacePath="/eth"/>
</configuration>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.1189300763;cdt.managedbuild.toolchain.gnu.base.1189300763.1949234292;cdt.managedbuild.tool.gnu.cpp.compiler.base.1162197004;cdt.managedbuild.tool.gnu.cpp.compiler.input.1738923537">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.1189300763;cdt.managedbuild.toolchain.gnu.base.1189300763.1949234292;cdt.managedbuild.tool.gnu.c.compiler.base.443427589;cdt.managedbuild.tool.gnu.c.compiler.input.713760702">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<!-- Eclipse project description for the project named "eth". -->
<name>eth</name>
<comment></comment>
<!-- No referenced projects. -->
<projects>
</projects>
<!-- Builders attached to the project: the CDT managed-build makefile generator and the scanner-configuration builder. -->
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<!-- Project natures: C (cnature), C++ (ccnature), managed build, and scanner configuration. -->
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>
content-types/enabled=true
content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu
eclipse.preferences.version=1
File added
This diff is collapsed.
This diff is collapsed.
M BANDS THREADS W
---
M BANDS THREADS W
1024 1 256 16
2.26e+03 0.118913 (1024 1 1) (256 1 1) 16 2.080000 2.064000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [478]
2.26e+03 0.160866 (64 1 1) (256 1 1) 18 2.432000 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [488]
M BANDS THREADS W
1024 1 512 16
2.14e+03 0.120897 (1024 1 1) (512 1 1) 12 4.128000 4.112000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [478]
2.14e+03 0.156770 (64 1 1) (512 1 1) 14 4.480000 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [488]
M BANDS THREADS W
1024 1 1024 16
2.14e+03 0.129218 (1024 1 1) (1024 1 1) 12 8.224000 8.208000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [478]
2.14e+03 0.159458 (64 1 1) (1024 1 1) 14 8.576000 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [488]
M BANDS THREADS W
1024 4 256 16
2.13e+03 0.132769 (1024 1 1) (256 1 1) 16 2.128000 2.112000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [478]
2.13e+03 0.170915 (64 1 1) (256 1 1) 24 3.200000 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [488]
M BANDS THREADS W
1024 4 512 16
#ifndef CUBLAS_WRAPPER_H
#define CUBLAS_WRAPPER_H
#include "cublas_v2.h"
#include "cusparse_v2.h"
/**
* cublas_wrapper.h
*
* This file contains wrappers for various cublas and cusparse functions. They are all overloaded for
* single and double precision types, so that we don't have to write the code twice if it should be
* able to work with both kinds of floating point precision.
*
* @author Simon Schoelly
*/
/**
 * Double-precision overload: writes the transpose of the m x m matrix x into x_trans by
 * calling cublasDgeam as x_trans = 1 * x^T + 0 * B. All leading dimensions are m.
 *
 * beta is passed as a pointer to an explicit zero: the geam documentation only allows B to be
 * left unset when *beta == 0, it does not allow a null beta pointer.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_transpose(cublasHandle_t const cublas_handle, int const m, double const * const x, double * const x_trans) {
    double const D_ONE(1);
    double const D_ZERO(0);
    cublasStatus_t const cublas_status = cublasDgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, m, &D_ONE, x, m, &D_ZERO, NULL, m, x_trans, m);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDgeam: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: writes the transpose of the m x m matrix x into x_trans by
 * calling cublasSgeam as x_trans = 1 * x^T + 0 * B. All leading dimensions are m.
 *
 * beta is passed as a pointer to an explicit zero: the geam documentation only allows B to be
 * left unset when *beta == 0, it does not allow a null beta pointer.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_transpose(cublasHandle_t const cublas_handle, int const m, float const * const x, float * const x_trans) {
    float const F_ONE(1);
    float const F_ZERO(0);
    cublasStatus_t const cublas_status = cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, m, &F_ONE, x, m, &F_ZERO, NULL, m, x_trans, m);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSgeam: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload for rectangular matrices: calls cublasDgeam with op(A) = transpose,
 * output size m x n, x passed with leading dimension n and x_trans with leading dimension m,
 * i.e. x_trans = 1 * x^T + 0 * B.
 *
 * beta is passed as a pointer to an explicit zero: the geam documentation only allows B to be
 * left unset when *beta == 0, it does not allow a null beta pointer.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_transpose2(cublasHandle_t const cublas_handle, int const n, int const m, double const * const x, double * const x_trans) {
    double const D_ONE(1);
    double const D_ZERO(0);
    cublasStatus_t const cublas_status = cublasDgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &D_ONE, x, n, &D_ZERO, NULL, m, x_trans, m);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDgeam: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload for rectangular matrices: calls cublasSgeam with op(A) = transpose,
 * output size m x n, x passed with leading dimension n and x_trans with leading dimension m,
 * i.e. x_trans = 1 * x^T + 0 * B.
 *
 * beta is passed as a pointer to an explicit zero: the geam documentation only allows B to be
 * left unset when *beta == 0, it does not allow a null beta pointer.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_transpose2(cublasHandle_t const cublas_handle, int const n, int const m, float const * const x, float * const x_trans) {
    float const F_ONE(1);
    float const F_ZERO(0);
    cublasStatus_t const cublas_status = cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &F_ONE, x, n, &F_ZERO, NULL, m, x_trans, m);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSgeam: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: copies the n-element vector x into y by calling cublasDcopy with
 * a stride of 1 for both vectors.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_copy(cublasHandle_t const cublas_handle, int const n, double const * const x, double * const y) {
    cublasStatus_t const cublas_status = cublasDcopy(cublas_handle, n, x, 1, y, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDcopy: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: copies the n-element vector x into y by calling cublasScopy with
 * a stride of 1 for both vectors.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_copy(cublasHandle_t const cublas_handle, int const n, float const * const x, float * const y) {
    cublasStatus_t const cublas_status = cublasScopy(cublas_handle, n, x, 1, y, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasScopy: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: calls cublasDaxpy (y = alpha * x + y) on n elements with a stride
 * of 1 for both vectors. alpha is forwarded as a pointer, exactly as received.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_axpy(cublasHandle_t const cublas_handle, int const n, double const * const alpha, double const * const x, double * const y) {
    cublasStatus_t const cublas_status = cublasDaxpy(cublas_handle, n, alpha, x, 1, y, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDaxpy: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: calls cublasSaxpy (y = alpha * x + y) on n elements with a stride
 * of 1 for both vectors. alpha is forwarded as a pointer, exactly as received.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_axpy(cublasHandle_t const cublas_handle, int const n, float const * const alpha, float const * const x, float * const y) {
    cublasStatus_t const cublas_status = cublasSaxpy(cublas_handle, n, alpha, x, 1, y, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSaxpy: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: calls cublasDdot on the n-element vectors x and y (stride 1 each)
 * and lets cuBLAS store the dot product through `result`.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_dot(cublasHandle_t const cublas_handle, int const n, double const * const x, double const * const y, double * const result) {
    cublasStatus_t const cublas_status = cublasDdot(cublas_handle, n, x, 1, y, 1, result);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDdot: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: calls cublasSdot on the n-element vectors x and y (stride 1 each)
 * and lets cuBLAS store the dot product through `result`.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_dot(cublasHandle_t const cublas_handle, int const n, float const * const x, float const * const y, float * const result) {
    cublasStatus_t const cublas_status = cublasSdot(cublas_handle, n, x, 1, y, 1, result);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSdot: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: calls cublasDnrm2 on the n-element vector x (stride 1) and lets
 * cuBLAS store the Euclidean norm through `result`.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_nrm2(cublasHandle_t const cublas_handle, int const n, double const * const x, double * result) {
    cublasStatus_t const cublas_status = cublasDnrm2(cublas_handle, n, x, 1, result);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDnrm2: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: calls cublasSnrm2 on the n-element vector x (stride 1) and lets
 * cuBLAS store the Euclidean norm through `result`.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_nrm2(cublasHandle_t const cublas_handle, int const n, float const * const x, float * result) {
    cublasStatus_t const cublas_status = cublasSnrm2(cublas_handle, n, x, 1, result);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSnrm2: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: calls cublasDscal (x = alpha * x) on n elements with stride 1.
 * alpha is forwarded as a pointer, exactly as received.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_scal(cublasHandle_t const cublas_handle, int const n, double const * const alpha, double * const x) {
    cublasStatus_t const cublas_status = cublasDscal(cublas_handle, n, alpha, x, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasDscal: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: calls cublasSscal (x = alpha * x) on n elements with stride 1.
 * alpha is forwarded as a pointer, exactly as received.
 *
 * If cuBLAS returns anything other than CUBLAS_STATUS_SUCCESS, the status code is printed to
 * std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cublas_scal(cublasHandle_t const cublas_handle, int const n, float const * const alpha, float * const x) {
    cublasStatus_t const cublas_status = cublasSscal(cublas_handle, n, alpha, x, 1);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Cublas error in function cublasSscal: " << cublas_status << std::endl;
        std::abort();
    }
}
/**
 * Double-precision overload: forwards to cusparseDgtsv_nopivot(handle, m, n, dl, d, du, x, m).
 * dl, d and du are passed in the positions of the lower, main and upper diagonals; x is passed
 * as the right-hand-side array with leading dimension m.
 *
 * If cuSPARSE returns anything other than CUSPARSE_STATUS_SUCCESS, the status code is printed
 * to std::cout and the process is terminated with std::abort().
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cusparse_gtsv(cusparseHandle_t const cusparse_handle, int const m, int const n, double const * const dl,
                          double const * const d, double const * const du, double * const x) {
    cusparseStatus_t const cusparse_status = cusparseDgtsv_nopivot(cusparse_handle, m, n, dl, d, du, x, m);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS) {
        std::cout << "Cusparse error in function cusparseDgtsv_nopivot: " << cusparse_status << std::endl;
        std::abort();
    }
}
/**
 * Single-precision overload: forwards to cusparseSgtsv_nopivot(handle, m, n, dl, d, du, x, m).
 * dl, d and du are passed in the positions of the lower, main and upper diagonals; x is passed
 * as the right-hand-side array with leading dimension m.
 *
 * If cuSPARSE returns anything other than CUSPARSE_STATUS_SUCCESS, the status code is printed
 * to std::cout and the process is terminated with std::abort(). The message names the
 * single-precision routine that is actually called.
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline void cusparse_gtsv(cusparseHandle_t const cusparse_handle, int const m, int const n, float const * const dl,
                          float const * const d, float const * const du, float * const x) {
    cusparseStatus_t const cusparse_status = cusparseSgtsv_nopivot(cusparse_handle, m, n, dl, d, du, x, m);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS) {
        std::cout << "Cusparse error in function cusparseSgtsv_nopivot: " << cusparse_status << std::endl;
        std::abort();
    }
}
#endif
nvcc error : 'gcc' died due to signal 2
#!/bin/bash
# Benchmark sweep: for every (M, BANDS, THREADS, W) combination below, rebuild test.cu with the
# values passed as -D macros, then print selected kernel lines from nvprof GPU traces of
# ./test run in modes 1, 2, 10, 11 and 12.

# Print the tab-separated column header.
print_header() {
    echo "M BANDS THREADS W" | tr " " "\t"
}

# gpu_trace MODE: run "./test MODE" under nvprof and emit the GPU trace (stderr merged in).
gpu_trace() {
    nvprof -u ms --print-gpu-trace ./test "$1" 2>&1
}

print_header
for M in 256 512 1024 2048 4096 8192
do
    echo " --- "
    for BANDS in 1 # 4 8 16 32
    do
        for THREADS in 4 8 16 32 64 # 128 256 512 1024 # 256 512 1024 #256 512 1024
        do
            for W in 1 # 4 8 16 32
            do
                print_header
                echo "$M" "$BANDS" "$THREADS" "$W" | tr " " "\t" # | tr "\n" "\t"

                # Skip the measurements when the build fails; otherwise the ./test binary left
                # over from the previous configuration would be profiled under this header.
                if ! nvcc -O3 -DBANDS="$BANDS" -DW="$W" -DTHREADS="$THREADS" -DM="$M" -arch sm_35 -lcublas -lcusparse -o test test.cu
                then
                    echo "nvcc failed for M=$M BANDS=$BANDS THREADS=$THREADS W=$W; skipping" >&2
                    continue
                fi

                #nvprof ./test 2>&1 | grep -e "invM_kernel" -e "transpose" | cut -d "u" -f 1 | cut -d " " -f 15 | tr "\n" "\t"
                #nvprof ./test 2>&1 | tee out.txt > t.t
                #cat out.txt | grep -e "thomas_kernel_trans<" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
                #cat out.txt | grep -e "thomas_kernel<" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"
                #cat out.txt | grep -e "transpose" | cut -d "s" -f 3 | tr "\n" "\t" #| cut -d " " -f 11 | tr "\n" "\t"

                gpu_trace 1 | grep -e "thomas" -e "transpose" -e "Chebyschev" | tail -n 4
                echo
                gpu_trace 2 | grep -e "thomas" -e "transpose" -e "Chebyschev" | tail -n 2
                echo

                # Modes 10-12 share one filter: keep each transpose kernel line plus the five
                # lines after it, then show the last twelve lines.
                for MODE in 10 11 12
                do
                    gpu_trace "$MODE" | grep -e "thomas" -e "transpose" -e "ST_" -e "Chebyschev" | grep "transpose_readWrite_alignment_kernel" -A5 | tail -n 12
                    echo
                done
            done
        done
    done
done
#!/bin/bash
# Benchmark sweep for the invM kernels: for every (M, BANDS, THREADS, W) combination below,
# rebuild test.cu with the values passed as -D macros, run "./test 20" under nvprof, and record
# the last two invM_kernel trace lines. Everything printed is also written to result_inv.log
# (truncated at start, appended to afterwards).

LOG=result_inv.log

echo "M BANDS THREADS W" | tr " " "\t" | tee "$LOG"
for M in 1024 2048 4096 8192
do
    echo " --- " | tee -a "$LOG"
    for BANDS in 1 4 8 16 24 32
    do
        for THREADS in 256 512 1024
        do
            for W in 16 # 4 8 16 32
            do
                echo "M BANDS THREADS W" | tr " " "\t" | tee -a "$LOG"
                echo "$M" "$BANDS" "$THREADS" "$W" | tr " " "\t" | tee -a "$LOG" # | tr "\n" "\t" | tee -a result_inv.log

                # Skip the measurement when the build fails; otherwise the ./test binary left
                # over from the previous configuration would be logged under this header.
                if ! nvcc -O3 -DBANDS="$BANDS" -DW="$W" -DTHREADS="$THREADS" -DM="$M" -arch sm_35 -lcublas -lcusparse -o test test.cu
                then
                    echo "nvcc failed for M=$M BANDS=$BANDS THREADS=$THREADS W=$W; skipping" >&2
                    continue
                fi

                nvprof -u ms --print-gpu-trace ./test 20 2>&1 | grep invM_kernel | tail -n 2 | tee -a "$LOG"
                echo | tee -a "$LOG"
            done
        done
    done
done
./eval_inv_view.sh
#!/bin/bash
# Summarise result_inv.log: for every M and BANDS value, print two rows, each prefixed with
# BANDS and a tab. The first row is taken from the lines two below each matching
# "M BANDS THREADS W" configuration line, the second from the lines directly below it.
# Of the candidate lines, the one with the smallest numeric second field is shown
# (presumably the nvprof duration column; confirm against the log format).

# best_line M BANDS OFFSET
#   Select the lines OFFSET below every configuration line whose first two fields equal M and
#   BANDS, keep those mentioning invM_kernel, and print the one with the smallest field 2.
best_line() {
    # The values are passed with -v instead of being spliced into the program text; the
    # ("" concatenation) keeps the comparison a string comparison, as with a string literal.
    # grep is called without "-B 0": with a context option GNU grep inserts "--" separator
    # lines between non-adjacent matches, and those sort last and would be picked by tail.
    awk -v m="$1" -v bands="$2" -v offset="$3" \
        '$1 == (m "") && $2 == (bands "") { wanted[NR + offset] }; NR in wanted' result_inv.log |
        grep "invM_kernel" |
        sort -n -k 2 -r |
        tail -n 1
}

for M in 1024 2048 4096 8192
do
    echo
    echo "$M"
    for BANDS in 1 4 8 16 24 32
    do
        printf '%s\t' "$BANDS"
        best_line "$M" "$BANDS" 2
        printf '%s\t' "$BANDS"
        best_line "$M" "$BANDS" 1
    done
done
echo
This source diff could not be displayed because it is too large. You can view the blob instead.
M BANDS THREADS W
---
M BANDS THREADS W
1024 1 256 16
2.17e+03 0.677417 (1024 1 1) (64 1 1) 18 1.040000 1.024000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [504]
2.17e+03 5.025697 (32 1 1) (64 1 1) 21 17.15200 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [521]
M BANDS THREADS W
1024 1 512 16
2.19e+03 0.680393 (1024 1 1) (64 1 1) 18 1.040000 1.024000 - - Tesla K20m (0) 1 7 void invM_kernel_n<double>(int, int, int, double, double, double const *, double const *, double*) [504]
2.20e+03 5.024897 (32 1 1) (64 1 1) 21 17.15200 0.000000 - - Tesla K20m (0) 1 7 void invM_kernel2_n<double>(int, int, int, double, double, double const *, double const *, double*) [521]
M BANDS THREADS W
1024 1 1024 16
# Build and profiling shortcuts for the test.cu benchmark.
#
# None of the targets below is the name of a file the recipe creates (the binary is always
# "test"), so they are declared phony; otherwise a stray file called e.g. "run" or "all" would
# make the corresponding recipe silently do nothing.
.PHONY: all debug run run_g car

# Options shared by every nvcc invocation: target architecture, libraries, output and source.
NVCC_COMMON = -arch sm_35 -lcublas -lcusparse -o test test.cu

# Optimised build with the default parameter set.
all :
	nvcc -O3 -DBANDS=10 -DW=4 -DTHREADS=512 -DM=1024 $(NVCC_COMMON)

# Build with host (-g) and device (-G) debug information, then run the binary.
debug:
	nvcc -O2 -DBANDS=10 -DW=16 -DTHREADS=1024 -DM=1024 -g -G $(NVCC_COMMON)
	./test

# Run the previously built binary.
run:
	./test

# Run the binary and show only the lines mentioning invM_kernel.
run_g:
	./test 2>&1 | grep invM_kernel

# Rebuild and profile for W = 1, 2, 4, 8, 16, 32, showing the invM_kernel and "Tk 2" lines.
car:
	nvcc -O3 -DBANDS=10 -DW=1 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
	nvcc -O3 -DBANDS=10 -DW=2 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
	nvcc -O3 -DBANDS=10 -DW=4 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
	nvcc -O3 -DBANDS=10 -DW=8 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
	nvcc -O3 -DBANDS=10 -DW=16 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
	nvcc -O3 -DBANDS=10 -DW=32 -DTHREADS=1024 -DM=1024 $(NVCC_COMMON)
	nvprof ./test 2>&1 | grep -e "invM_kernel" -e "Tk 2"
#include <iostream>
#include "misc.h"
// Integer division that rounds up: yields n/d when d divides n evenly, and n/d + 1 otherwise.
// NOTE(review): misc.h, which this file includes, appears to define the same function;
// confirm that this translation unit does not end up with two definitions.
int divide_and_round_up(int const n, int const d) {
    int const quotient = n / d;
    return (n % d == 0) ? quotient : quotient + 1;
}
#ifndef MISC_H
#define MISC_H
#include "cublas_v2.h"
#include "cusparse_v2.h"
/*
* misc.h
*
* This file contains miscellaneous helper functions.
*
* @author Simon Schoelly
*/
/*
 * Integer division that rounds up: returns n/d when d divides n evenly, and n/d + 1 otherwise.
 * d must not be zero.
 *
 * Declared inline because the definition lives in a header; without it, including the header
 * from two translation units yields duplicate-symbol link errors.
 */
inline int divide_and_round_up(int const n, int const d) {
    if (n % d == 0) {
        return n/d;
    }
    return (n/d + 1);
}
template<class FT>
void print_device_array(FT const * const array, size_t const num_elements, char const * const array_symbol) {
std::cout << array_symbol << " (" << num_elements << ") elements:";
FT *host_array = new FT[num_elements];
cudaMemcpy(host_array, array, num_elements * sizeof(FT), cudaMemcpyDeviceToHost);
for (int i = 0; i <num_elements; ++i) {
std::cout << " " << host_array[i];
}
std::cout << std::endl;
delete[] host_array;
}
/*
 * Kernel: every thread whose global index tid is below count writes `value` to
 * devPtr[tid + offset]; threads with tid >= count do nothing.
 *
 * blockDim.x is widened to size_t before the multiplication so the index is computed in
 * size_t and cannot wrap around at 32 bits.
 */
template<class T>
__global__ void device_memset_kernel(T * const devPtr, T const value, size_t const count, size_t offset) {
    size_t const tid = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
    if (tid >= count) {
        return;
    }
    devPtr[tid + offset] = value;
}
/*
 * Launches device_memset_kernel with 32 threads per block and enough blocks to cover `count`
 * elements, so that devPtr[offset] .. devPtr[offset + count - 1] are set to `value`.
 * Does nothing when count is zero. The launch is not followed by a synchronization or an
 * error check.
 *
 * The block count is computed in size_t; routing count through an int helper would truncate
 * it for counts above INT_MAX.
 */
template<class T>
void device_memset(T * const devPtr, T const value, size_t const count, size_t offset=0) {
    if (count == 0) {
        return;
    }
    size_t const threads_per_block = 32;
    size_t const num_blocks = count / threads_per_block + (count % threads_per_block != 0 ? 1 : 0);
    device_memset_kernel<T><<<static_cast<unsigned int>(num_blocks), static_cast<unsigned int>(threads_per_block)>>>(devPtr, value, count, offset);
}
/*
 * Kernel: every thread whose global index tid is below count writes value + (T)tid to
 * devPtr[tid + offset]; threads with tid >= count do nothing.
 *
 * blockDim.x is widened to size_t before the multiplication so the index is computed in
 * size_t and cannot wrap around at 32 bits.
 */
template<class T>
__global__ void device_memset_kernel_inc(T * const devPtr, T const value, size_t const count, size_t offset) {
    size_t const tid = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
    if (tid >= count) {
        return;
    }
    devPtr[tid + offset] = value + (T)tid;
}
/*
 * Launches device_memset_kernel_inc with 32 threads per block and enough blocks to cover
 * `count` elements, so that devPtr[offset + i] is set to value + i for i in [0, count).
 * Does nothing when count is zero. The launch is not followed by a synchronization or an
 * error check.
 *
 * The block count is computed in size_t; routing count through an int helper would truncate
 * it for counts above INT_MAX.
 */
template<class T>
void device_memset_inc(T * const devPtr, T const value, size_t const count, size_t offset=0) {
    if (count == 0) {
        return;
    }
    size_t const threads_per_block = 32;
    size_t const num_blocks = count / threads_per_block + (count % threads_per_block != 0 ? 1 : 0);
    device_memset_kernel_inc<T><<<static_cast<unsigned int>(num_blocks), static_cast<unsigned int>(threads_per_block)>>>(devPtr, value, count, offset);
}
#endif
==65297== NVPROF is profiling process 65297, command: ./test
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
Tk 1_t
21 Cheb only - iterations
==65297== Profiling application: ./test
==65297== Profiling result:
Time(%) Time Calls Avg Min Max Name
59.10% 4.56921s 21 217.58ms 216.35ms 218.94ms void thomas_kernel_trans<double>(int, int, int, double, double, double const *, double const *, double*)
16.21% 1.25326s 42 29.840ms 23.174ms 37.027ms void thomas_kernel<double>(int, int, int, double, double, double const *, double const *, double*)
8.67% 670.18ms 63 10.638ms 10.620ms 10.660ms void axpy_kernel_val<double, int=0>(cublasAxpyParamsVal<double>)
4.31% 333.45ms 42 7.9393ms 7.9248ms 7.9554ms void transpose_readWrite_alignment_kernel<double, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<double>, double const *, double*, double const *)
3.41% 263.68ms 22 11.985ms 11.982ms 11.990ms void multiply_by_A<double>(int, double, double const *, double*)
2.19% 169.62ms 44 3.8551ms 29.152us 8.8965ms void nrm2_kernel<double, double, int=0, int=0, int=128, int=0>(cublasNrm2Params<double, double>)
2.14% 165.37ms 23 7.1900ms 7.0845ms 7.2005ms void copy_kernel<double, int=0>(cublasCopyParams<double>)
1.97% 152.29ms 3 50.762ms 1.2800us 152.27ms [CUDA memcpy HtoD]
1.82% 141.00ms 20 7.0501ms 7.0441ms 7.0563ms void scal_kernel_val<double, double, int=0>(cublasScalParamsVal<double, double>)
0.08% 6.5645ms 1 6.5645ms 6.5645ms 6.5645ms void device_memset_kernel<double>(double*, double, unsigned long, unsigned long)
0.08% 6.4024ms 1 6.4024ms 6.4024ms 6.4024ms void device_memset_kernel_inc<double>(double*, double, unsigned long, unsigned long)
0.00% 104.45us 22 4.7470us 4.6720us 5.2160us [CUDA memset]
0.00% 56.802us 22 2.5810us 2.1440us 10.144us [CUDA memcpy DtoH]
==65297== API calls:
Time(%) Time Calls Avg Min Max Name
91.56% 7.72952s 25 309.18ms 23.697us 360.08ms cudaMemcpy
8.34% 704.04ms 4 176.01ms 71.078us 352.97ms cudaFree
0.05% 4.4002ms 11 400.02us 10.274us 1.6367ms cudaMalloc
0.03% 2.1981ms 279 7.8780us 6.4650us 44.498us cudaLaunch
0.01% 642.11us 249 2.5780us 169ns 95.283us cuDeviceGetAttribute
0.00% 229.58us 918 250ns 165ns 7.7740us cudaSetupArgument
0.00% 211.69us 22 9.6220us 8.8190us 17.379us cudaMemsetAsync
0.00% 95.442us 279 342ns 196ns 8.0960us cudaConfigureCall
0.00% 94.196us 384 245ns 146ns 1.3530us cudaGetLastError
0.00% 88.827us 3 29.609us 29.032us 30.188us cuDeviceTotalMem
0.00% 82.462us 23 3.5850us 2.7790us 14.771us cudaFuncGetAttributes
0.00% 74.404us 3 24.801us 22.361us 27.518us cuDeviceGetName
0.00% 68.968us 24 2.8730us 332ns 50.750us cudaDeviceGetAttribute
0.00% 62.369us 22 2.8340us 1.3570us 9.0910us cudaEventRecord
0.00% 57.774us 1 57.774us 57.774us 57.774us cudaGetDeviceCount
0.00% 55.013us 22 2.5000us 2.2850us 4.5390us cudaEventQuery
0.00% 16.134us 1 16.134us 16.134us 16.134us cudaOccupancyMaxActiveBlocksPerMultiprocessor
0.00% 10.411us 8 1.3010us 654ns 4.1590us cudaEventCreateWithFlags
0.00% 9.0740us 3 3.0240us 2.1970us 3.5180us cudaGetDevice
0.00% 2.3890us 4 597ns 253ns 1.4910us cuDeviceGetCount
0.00% 2.0660us 2 1.0330us 874ns 1.1920us cuInit
0.00% 1.5060us 4 376ns 288ns 582ns cuDeviceGet
0.00% 1.0740us 2 537ns 440ns 634ns cuDriverGetVersion
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment