// NOTE(review): this listing has the original source line numbers fused into
// the text and is missing several source lines. For mainHost the template
// header, some parameters and the closing brace are not visible, so the
// comments below describe only what the visible lines show.
//
// mainHost: selects compile-time entry_min / entry_max / entry_step values and
// forwards its arguments to performance_test_driver. It is invoked from the
// Serial, Threads and OpenMP sections of this file.
48 #include "Kokkos_Core.hpp" 51 #include "Teuchos_DefaultComm.hpp" 52 #include "Teuchos_CommandLineProcessor.hpp" 53 #include "Teuchos_StandardCatchMacros.hpp" 54 #ifdef KOKKOS_HAVE_CUDA 55 #include "cuda_runtime_api.h" 60 void mainHost(
// comm: Teuchos communicator, passed through unchanged to the driver.
const Teuchos::RCP<
const Teuchos::Comm<int> >& comm ,
// use_print, check and dev_config are used in the driver call below, but
// their declarations fall on lines missing from this listing.
62 const int use_trials ,
63 const int use_nodes[] ,
// First visible set of compile-time constants (8 / 48 / 8).
67 const int entry_min = 8;
68 const int entry_max = 48;
69 const int entry_step = 8;
// Second definition of the same three names (4 / 32 / 4). Presumably the two
// sets are selected by a preprocessor conditional on the missing lines --
// TODO confirm against the full source.
71 const int entry_min = 4;
72 const int entry_max = 32;
73 const int entry_step = 4;
// The three constants become template arguments of the driver; Storage and
// Method come from the (not visible) template header.
79 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
80 comm, use_print, use_trials, use_nodes,
check, dev_config);
// NOTE(review): fragment -- the template header, some parameters and the
// closing brace of mainCuda fall on lines missing from this listing.
//
// mainCuda: fixes the compile-time entry_min / entry_max / entry_step values
// at 16 / 64 / 16 and forwards its arguments to performance_test_driver. It is
// invoked from the CUDA section of this file.
85 void mainCuda(
// comm: Teuchos communicator, passed through unchanged to the driver.
const Teuchos::RCP<
const Teuchos::Comm<int> >& comm ,
// use_print, check and dev_config are used in the driver call below, but
// their declarations are not visible here.
87 const int use_trials ,
88 const int use_nodes[] ,
// Only one set of constants is visible for this function.
91 const int entry_min = 16;
92 const int entry_max = 64;
93 const int entry_step = 16;
// Storage and Method come from the (not visible) template header.
94 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
95 comm, use_print, use_trials, use_nodes,
check, dev_config);
// Fragment of the program entry point. NOTE(review): the function header and
// several declarations used below (nGrid, nIter, print, check, serial, threads,
// openmp, cuda, device_id, ngpus, use_nodes, local_rank, str, success, ...)
// fall on lines missing from this listing; the comments describe only what
// the visible lines show.
//
// Visible flow: start MPI, query the hardware topology, register command-line
// options, then for each compiled-in Kokkos backend print a banner on rank 0,
// run the benchmark (mainHost or mainCuda) and finalize the backend.
101 bool verbose =
false;
// MPI session; a Teuchos::oblackholestream is passed as its third argument.
104 Teuchos::oblackholestream blackHole;
105 Teuchos::GlobalMPISession mpiSession (&argc, &
argv, &blackHole);
// Default communicator, handed to every mainHost / mainCuda call below.
107 Teuchos::RCP<const Teuchos::Comm<int> > comm =
108 Teuchos::DefaultComm<int>::getComm();
// Topology reported by Kokkos' hwloc interface; num_sockets and
// num_cores_per_socket feed the default for num_cores further down.
110 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
111 const size_t num_cores_per_socket =
112 Kokkos::hwloc::get_available_cores_per_numa();
113 const size_t num_threads_per_core =
114 Kokkos::hwloc::get_available_threads_per_core();
// Command-line option registration starts here.
117 Teuchos::CommandLineProcessor
CLP;
// NOTE(review): the call that owns this description string is not visible.
// The text reads "This test performance of ..." -- probably meant
// "This test measures the performance of ...".
119 "This test performance of MP::Vector FEM assembly.\n");
// --n: value later copied into all three entries of use_nodes.
// NOTE(review): help text says "in the each direction" (stray "the").
121 CLP.setOption(
"n", &nGrid,
"Number of mesh points in the each direction");
// --ni: passed to mainHost / mainCuda as the use_trials argument.
123 CLP.setOption(
"ni", &nIter,
"Number of assembly iterations");
// --print / --no-print: boolean toggle forwarded as use_print.
125 CLP.setOption(
"print",
"no-print", &print,
"Print debugging output");
// Default core count is the product of the two hwloc values queried above.
// The setOption calls that own the next two help strings are on lines
// missing from this listing.
127 int num_cores = num_cores_per_socket * num_sockets;
129 "Number of CPU cores to use (defaults to all)");
132 "Number of hyper threads per core to use (defaults to all)");
// --threads_per_vector: defaults to 1.
133 int threads_per_vector = 1;
134 CLP.setOption(
"threads_per_vector", &threads_per_vector,
135 "Number of threads to use within each vector");
// --check / --no-check: boolean toggle forwarded to the benchmark.
136 CLP.setOption(
"check",
"no-check", &
check,
"Check correctness");
// One enable/disable toggle per backend, each guarded by the matching
// KOKKOS_HAVE_* macro (the closing #endif lines are not visible).
137 #ifdef KOKKOS_HAVE_SERIAL 139 CLP.setOption(
"serial",
"no-serial", &serial,
"Enable Serial device");
141 #ifdef KOKKOS_HAVE_PTHREAD 143 CLP.setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
145 #ifdef KOKKOS_HAVE_OPENMP 147 CLP.setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
149 #ifdef KOKKOS_HAVE_CUDA 151 CLP.setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
// CUDA tuning options: threads per vector defaults to 16.
152 int cuda_threads_per_vector = 16;
153 CLP.setOption(
"cuda_threads_per_vector", &cuda_threads_per_vector,
154 "Number of Cuda threads to use within each vector");
// Block size defaults to 256; the help string for this option is on a line
// missing from this listing.
155 int cuda_block_size = 256;
156 CLP.setOption(
"cuda_block_size", &cuda_block_size,
// Number of blocks defaults to 0, which the help text describes as
// "the default choice".
158 int num_cuda_blocks = 0;
159 CLP.setOption(
"num_cuda_blocks", &num_cuda_blocks,
160 "Number of Cuda blocks (0 implies the default choice)");
// --device: -1 triggers the automatic selection in the CUDA section below.
162 CLP.setOption(
"device", &device_id,
"CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
// --ngpus: used as the modulus when deriving device_id from the local rank.
164 CLP.setOption(
"ngpus", &ngpus,
"Number of GPUs per node for multi-GPU runs via MPI");
// The same grid size is used for all three entries of use_nodes.
169 use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
// ---- Serial backend: initialize, announce on rank 0, run, finalize. ----
178 #ifdef KOKKOS_HAVE_SERIAL 180 typedef Kokkos::Serial Device;
183 Kokkos::Serial::initialize();
// Only rank 0 prints the banner.
185 if (comm->getRank() == 0)
186 std::cout << std::endl
187 <<
"Serial performance with " << comm->getSize()
188 <<
" MPI ranks" << std::endl;
// The remaining argument(s) of this call are on a missing line.
192 mainHost<Storage,Method>(comm, print, nIter, use_nodes,
check,
195 Kokkos::Serial::finalize();
// ---- Threads backend. The initialize call is not visible in this listing. ----
199 #ifdef KOKKOS_HAVE_PTHREAD 201 typedef Kokkos::Threads Device;
// Only rank 0 prints the banner; the middle of the message is on a missing line.
206 if (comm->getRank() == 0)
207 std::cout << std::endl
208 <<
"Threads performance with " << comm->getSize()
210 <<
" threads per rank:" << std::endl;
216 mainHost<Storage,Method>(comm, print, nIter, use_nodes,
check,
219 Kokkos::Threads::finalize();
// ---- OpenMP backend. The initialize call is not visible in this listing. ----
223 #ifdef KOKKOS_HAVE_OPENMP 225 typedef Kokkos::OpenMP Device;
// Only rank 0 prints the banner; the middle of the message is on a missing line.
230 if (comm->getRank() == 0)
231 std::cout << std::endl
232 <<
"OpenMP performance with " << comm->getSize()
234 <<
" threads per rank:" << std::endl;
240 mainHost<Storage,Method>(comm, print, nIter, use_nodes,
check,
243 Kokkos::OpenMP::finalize();
// ---- CUDA backend. ----
247 #ifdef KOKKOS_HAVE_CUDA 249 typedef Kokkos::Cuda Device;
// Automatic device selection: when no device was requested (-1), take the
// node-local rank from the first of these environment variables that is
// set, then map it onto the available GPUs with a modulo.
252 if (device_id == -1) {
255 if ((str = std::getenv(
"SLURM_LOCALID")))
256 local_rank = std::atoi(str);
257 else if ((str = std::getenv(
"MV2_COMM_WORLD_LOCAL_RANK")))
258 local_rank = std::atoi(str);
// NOTE(review): this branch calls unqualified getenv while the two above
// use std::getenv.
259 else if ((str = getenv(
"OMPI_COMM_WORLD_LOCAL_RANK")))
260 local_rank = std::atoi(str);
// NOTE(review): if ngpus can be 0 here this is a modulo by zero; the default
// and any validation of ngpus are not visible in this listing -- confirm.
261 device_id = local_rank % ngpus;
// NOTE(review): the return value of cudaGetDeviceCount is ignored, so
// num_device is read uninitialized below if the call fails.
264 int num_device; cudaGetDeviceCount(&num_device);
// Raises std::logic_error via TEUCHOS_TEST_FOR_EXCEPTION when the chosen ID
// is past the last device. NOTE(review): only the upper bound is tested on
// the visible lines; a negative device_id other than -1 is not rejected here.
265 TEUCHOS_TEST_FOR_EXCEPTION(
266 device_id >= num_device, std::logic_error,
267 "Invalid device ID " << device_id <<
". You probably are trying" <<
268 " to run with too many GPUs per node");
// The host execution space is initialized before Cuda, which is bound to the
// selected device.
271 Kokkos::HostSpace::execution_space::initialize();
272 Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));
// Device properties are fetched so the banner can show the device name.
274 cudaDeviceProp deviceProp;
275 cudaGetDeviceProperties(&deviceProp, device_id);
// Only rank 0 prints the banner.
// NOTE(review): the banner text repeats the word "performance".
276 if (comm->getRank() == 0)
277 std::cout << std::endl
278 <<
"CUDA performance performance with " << comm->getSize()
279 <<
" MPI ranks and device " << device_id <<
" (" 280 << deviceProp.name <<
"):" 285 cuda_threads_per_vector,
// The ternary avoids dividing by zero when cuda_threads_per_vector is 0.
// What these two values are arguments to is on a missing line -- presumably
// the device configuration passed to mainCuda; TODO confirm.
286 cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
288 mainCuda<Storage,Method>(comm, print, nIter, use_nodes,
check,
// Finalize order as written: host execution space first, then Cuda.
291 Kokkos::HostSpace::execution_space::finalize();
292 Kokkos::Cuda::finalize();
// Teuchos catch macro, given verbose, std::cerr and success.
// NOTE(review): verbose is initialized to false at the top of this fragment;
// unless it is changed on a missing line, check whether that suppresses the
// printing of caught exception messages.
297 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
int main(int argc, char *argv[])