45 #include "Kokkos_Core.hpp" 48 #include "Teuchos_CommandLineProcessor.hpp" 49 #include "Teuchos_StandardCatchMacros.hpp" 50 #ifdef KOKKOS_HAVE_CUDA 51 #include "cuda_runtime_api.h" 54 template <
typename Scalar,
typename Ordinal,
typename Device>
// Fragment of the test program's main body. From what is visible here it:
//   1. queries the hardware topology through Kokkos::hwloc,
//   2. registers command-line options on a Teuchos::CommandLineProcessor,
//   3. for each Kokkos device enabled at compile time (Threads, OpenMP, Cuda)
//      initializes the device, prints a banner, calls performance_test_driver,
//      and finalizes the device.
// NOTE(review): the leading integers on many lines appear to be line numbers
// carried over from a source listing, and the gaps between them suggest that
// lines are missing from this excerpt. The declarations of nGrid, nIter,
// order, dim_min, dim_max, threads, openmp, cuda, device_id, Scalar, Ordinal,
// verbose and success are not visible here, nor are any #endif lines or
// runtime guards around the device sections -- confirm against the full file.

// Hardware topology as reported by hwloc; used below as the defaults for the
// "numa" and "cores" options.
67 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
68 const size_t num_cores_per_socket =
69 Kokkos::hwloc::get_available_cores_per_numa();
// Command-line processor on which every option below is registered.
76 Teuchos::CommandLineProcessor
CLP;
// Program description string (the call that receives it is not visible in
// this excerpt).
// NOTE(review): "This test performance" looks like a typo for
// "This tests the performance" -- confirm before changing the runtime text.
78 "This test performance of mean-based UQ::PCE multiply routines.\n");
// "n": bound to nGrid.
// NOTE(review): "in the each direction" in the help text looks like a typo
// for "in each direction".
80 CLP.setOption(
"n", &nGrid,
"Number of mesh points in the each direction");
// "ni": bound to nIter.
82 CLP.setOption(
"ni", &nIter,
"Number of multiply iterations");
// "order": bound to order.
84 CLP.setOption(
"order", &order,
"Polynomial order");
// "dmin" / "dmax": bound to dim_min / dim_max.
86 CLP.setOption(
"dmin", &dim_min,
"Starting stochastic dimension");
88 CLP.setOption(
"dmax", &dim_max,
"Stopping stochastic dimension");
// "numa": defaults to the hwloc NUMA count queried above.
// NOTE(review): size_t is implicitly converted to int here.
89 int numa = num_sockets;
90 CLP.setOption(
"numa", &numa,
"Number of numa nodes");
// "cores": defaults to the hwloc cores-per-NUMA count queried above.
// NOTE(review): size_t is implicitly converted to int here.
91 int cores = num_cores_per_socket;
92 CLP.setOption(
"cores", &cores,
"Cores per numa node");
// "threads": registered only when KOKKOS_HAVE_PTHREAD is defined.
93 #ifdef KOKKOS_HAVE_PTHREAD 95 CLP.setOption(
"threads", &threads,
"Number of threads for Threads device");
// "openmp": registered only when KOKKOS_HAVE_OPENMP is defined.
97 #ifdef KOKKOS_HAVE_OPENMP 99 CLP.setOption(
"openmp", &openmp,
"Number of threads for OpenMP device");
// "cuda"/"no-cuda" (paired option names bound to `cuda`) and "device" (bound
// to device_id): registered only when KOKKOS_HAVE_CUDA is defined.
101 #ifdef KOKKOS_HAVE_CUDA 103 CLP.setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
105 CLP.setOption(
"device", &device_id,
"CUDA device ID");
// ---- Kokkos::Threads run (compiled only with KOKKOS_HAVE_PTHREAD) ----
112 #ifdef KOKKOS_HAVE_PTHREAD 114 typedef Kokkos::Threads Device;
// Initialize the Threads device with the requested thread/numa/core counts.
116 Kokkos::Threads::initialize(threads, numa, cores);
// Banner describing the configuration about to be run.
118 std::cout << std::endl
119 <<
"Threads performance with " << threads
120 <<
" threads, " << numa <<
" numas, " << cores
121 <<
" cores/numa:" << std::endl;
// Run the driver on the Threads device.
123 performance_test_driver<Scalar,Ordinal,Device>(
124 nGrid, nIter, order, dim_min, dim_max);
126 Kokkos::Threads::finalize();
// ---- Kokkos::OpenMP run (compiled only with KOKKOS_HAVE_OPENMP) ----
130 #ifdef KOKKOS_HAVE_OPENMP 132 typedef Kokkos::OpenMP Device;
// Initialize the OpenMP device with the requested thread/numa/core counts.
134 Kokkos::OpenMP::initialize(openmp, numa, cores);
// Banner describing the configuration about to be run.
136 std::cout << std::endl
137 <<
"OpenMP performance with " << openmp
138 <<
" threads, " << numa <<
" numas, " << cores
139 <<
" cores/numa:" << std::endl;
// Run the driver on the OpenMP device.
141 performance_test_driver<Scalar,Ordinal,Device>(
142 nGrid, nIter, order, dim_min, dim_max);
144 Kokkos::OpenMP::finalize();
// ---- Kokkos::Cuda run (compiled only with KOKKOS_HAVE_CUDA) ----
148 #ifdef KOKKOS_HAVE_CUDA 150 typedef Kokkos::Cuda Device;
// The host execution space is initialized first, then the Cuda device
// selected by device_id.
152 Kokkos::HostSpace::execution_space::initialize();
153 Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));
// Look up the device properties so the device name can be printed.
// NOTE(review): the status returned by cudaGetDeviceProperties is not
// checked, so deviceProp.name is printed even if the query failed.
155 cudaDeviceProp deviceProp;
156 cudaGetDeviceProperties(&deviceProp, device_id);
// Banner with the device id and name, followed by the driver call.
// NOTE(review): the end of the print statement appears to be missing from
// this excerpt (the listing jumps from 159 to 162).
157 std::cout << std::endl
158 <<
"CUDA performance for device " << device_id <<
" (" 159 << deviceProp.name <<
"):" 162 performance_test_driver<Scalar,Ordinal,Device>(
163 nGrid, nIter, order, dim_min, dim_max);
// NOTE(review): finalization runs in the same order as initialization (host
// space first, then Cuda) rather than the reverse -- confirm this is intended.
165 Kokkos::HostSpace::execution_space::finalize();
166 Kokkos::Cuda::finalize();
// Teuchos standard catch macro, passed verbose, std::cerr and success (the
// matching try block is not visible in this excerpt).
171 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
int main(int argc, char *argv[])
void performance_test_driver(const Ordinal nGrid, const Ordinal nIter, const Ordinal order, const Ordinal min_var, const Ordinal max_var)