|
47 | 47 | #include "KokkosBlas1_scal.hpp"
|
48 | 48 | #include "KokkosSparse_getDiagCopy.hpp"
|
49 | 49 | #include "KokkosSparse_spmv.hpp"
|
| 50 | +#include "Kokkos_StdAlgorithms.hpp" |
50 | 51 |
|
51 | 52 | #include <memory>
|
52 | 53 | #include <sstream>
|
@@ -8301,59 +8302,43 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
|
8301 | 8302 | << std::endl;
|
8302 | 8303 | std::cerr << os.str ();
|
8303 | 8304 | }
|
8304 |
| - // Make sure that host has the latest version, since we're |
8305 |
| - // using the version on host. If host has the latest |
8306 |
| - // version, syncing to host does nothing. |
8307 |
| - destMat->numExportPacketsPerLID_.sync_host (); |
8308 |
| - Teuchos::ArrayView<const size_t> numExportPacketsPerLID = |
8309 |
| - getArrayViewFromDualView (destMat->numExportPacketsPerLID_); |
8310 |
| - destMat->numImportPacketsPerLID_.sync_host (); |
8311 |
| - Teuchos::ArrayView<size_t> numImportPacketsPerLID = |
8312 |
| - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); |
8313 |
| - |
| 8305 | + destMat->numExportPacketsPerLID_.sync_device(); |
| 8306 | + auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device(); |
| 8307 | + auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device(); |
8314 | 8308 | if (verbose) {
|
8315 | 8309 | std::ostringstream os;
|
8316 | 8310 | os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
|
8317 | 8311 | << std::endl;
|
8318 | 8312 | std::cerr << os.str ();
|
8319 | 8313 | }
|
8320 |
| - Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1, |
8321 |
| - destMat->numImportPacketsPerLID_.view_host()); |
| 8314 | + Distor.doReversePostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID); |
8322 | 8315 | if (verbose) {
|
8323 | 8316 | std::ostringstream os;
|
8324 | 8317 | os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
|
8325 | 8318 | << std::endl;
|
8326 | 8319 | std::cerr << os.str ();
|
8327 | 8320 | }
|
8328 | 8321 |
|
8329 |
| - size_t totalImportPackets = 0; |
8330 |
| - for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) { |
8331 |
| - totalImportPackets += numImportPacketsPerLID[i]; |
8332 |
| - } |
| 8322 | + size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID); |
8333 | 8323 |
|
8334 | 8324 | // Reallocation MUST go before setting the modified flag,
|
8335 | 8325 | // because it may clear out the flags.
|
8336 | 8326 | destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
|
8337 | 8327 | verbosePrefix.get ());
|
8338 | 8328 | destMat->imports_.modify_host ();
|
8339 |
| - auto hostImports = destMat->imports_.view_host(); |
8340 |
| - // This is a legacy host pack/unpack path, so use the host |
8341 |
| - // version of exports_. |
8342 |
| - destMat->exports_.sync_host (); |
8343 |
| - auto hostExports = destMat->exports_.view_host(); |
| 8329 | + auto deviceImports = destMat->imports_.view_device(); |
| 8330 | + auto deviceExports = destMat->exports_.view_device(); |
8344 | 8331 | if (verbose) {
|
8345 | 8332 | std::ostringstream os;
|
8346 |
| - os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits" |
| 8333 | + os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaitsKokkos" |
8347 | 8334 | << std::endl;
|
8348 | 8335 | std::cerr << os.str ();
|
8349 | 8336 | }
|
8350 |
| - Distor.doReversePostsAndWaits (hostExports, |
8351 |
| - numExportPacketsPerLID, |
8352 |
| - hostImports, |
8353 |
| - numImportPacketsPerLID); |
| 8337 | + destMat->imports_.sync_device(); |
| 8338 | + Distor.doReversePostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID); |
8354 | 8339 | if (verbose) {
|
8355 | 8340 | std::ostringstream os;
|
8356 |
| - os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits" |
| 8341 | + os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaitsKokkos" |
8357 | 8342 | << std::endl;
|
8358 | 8343 | std::cerr << os.str ();
|
8359 | 8344 | }
|
@@ -8396,58 +8381,43 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
|
8396 | 8381 | << std::endl;
|
8397 | 8382 | std::cerr << os.str ();
|
8398 | 8383 | }
|
8399 |
| - // Make sure that host has the latest version, since we're |
8400 |
| - // using the version on host. If host has the latest |
8401 |
| - // version, syncing to host does nothing. |
8402 |
| - destMat->numExportPacketsPerLID_.sync_host (); |
8403 |
| - Teuchos::ArrayView<const size_t> numExportPacketsPerLID = |
8404 |
| - getArrayViewFromDualView (destMat->numExportPacketsPerLID_); |
8405 |
| - destMat->numImportPacketsPerLID_.sync_host (); |
8406 |
| - Teuchos::ArrayView<size_t> numImportPacketsPerLID = |
8407 |
| - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); |
| 8384 | + destMat->numExportPacketsPerLID_.sync_device (); |
| 8385 | + auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device(); |
| 8386 | + auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device(); |
8408 | 8387 | if (verbose) {
|
8409 | 8388 | std::ostringstream os;
|
8410 | 8389 | os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
|
8411 | 8390 | << std::endl;
|
8412 | 8391 | std::cerr << os.str ();
|
8413 | 8392 | }
|
8414 |
| - Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1, |
8415 |
| - destMat->numImportPacketsPerLID_.view_host()); |
| 8393 | + Distor.doPostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID); |
8416 | 8394 | if (verbose) {
|
8417 | 8395 | std::ostringstream os;
|
8418 | 8396 | os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
|
8419 | 8397 | << std::endl;
|
8420 | 8398 | std::cerr << os.str ();
|
8421 | 8399 | }
|
8422 | 8400 |
|
8423 |
| - size_t totalImportPackets = 0; |
8424 |
| - for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) { |
8425 |
| - totalImportPackets += numImportPacketsPerLID[i]; |
8426 |
| - } |
| 8401 | + size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID); |
8427 | 8402 |
|
8428 | 8403 | // Reallocation MUST go before setting the modified flag,
|
8429 | 8404 | // because it may clear out the flags.
|
8430 | 8405 | destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
|
8431 | 8406 | verbosePrefix.get ());
|
8432 | 8407 | destMat->imports_.modify_host ();
|
8433 |
| - auto hostImports = destMat->imports_.view_host(); |
8434 |
| - // This is a legacy host pack/unpack path, so use the host |
8435 |
| - // version of exports_. |
8436 |
| - destMat->exports_.sync_host (); |
8437 |
| - auto hostExports = destMat->exports_.view_host(); |
| 8408 | + auto deviceImports = destMat->imports_.view_device(); |
| 8409 | + auto deviceExports = destMat->exports_.view_device(); |
8438 | 8410 | if (verbose) {
|
8439 | 8411 | std::ostringstream os;
|
8440 |
| - os << *verbosePrefix << "Calling 4-arg doPostsAndWaits" |
| 8412 | + os << *verbosePrefix << "Calling 4-arg doPostsAndWaitsKokkos" |
8441 | 8413 | << std::endl;
|
8442 | 8414 | std::cerr << os.str ();
|
8443 | 8415 | }
|
8444 |
| - Distor.doPostsAndWaits (hostExports, |
8445 |
| - numExportPacketsPerLID, |
8446 |
| - hostImports, |
8447 |
| - numImportPacketsPerLID); |
| 8416 | + destMat->imports_.sync_device (); |
| 8417 | + Distor.doPostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID); |
8448 | 8418 | if (verbose) {
|
8449 | 8419 | std::ostringstream os;
|
8450 |
| - os << *verbosePrefix << "Finished 4-arg doPostsAndWaits" |
| 8420 | + os << *verbosePrefix << "Finished 4-arg doPostsAndWaitsKokkos" |
8451 | 8421 | << std::endl;
|
8452 | 8422 | std::cerr << os.str ();
|
8453 | 8423 | }
|
@@ -8494,12 +8464,6 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
|
8494 | 8464 | Teuchos::Array<int> RemotePids;
|
8495 | 8465 | if (runOnHost) {
|
8496 | 8466 | Teuchos::Array<int> TargetPids;
|
8497 |
| - // Backwards compatibility measure. We'll use this again below. |
8498 |
| - |
8499 |
| - // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been) |
8500 |
| - // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). |
8501 |
| - // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. |
8502 |
| - destMat->numImportPacketsPerLID_.modify_host(); //FIXME |
8503 | 8467 |
|
8504 | 8468 | # ifdef HAVE_TPETRA_MMM_TIMINGS
|
8505 | 8469 | RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
|
@@ -8691,14 +8655,6 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
|
8691 | 8655 | } else {
|
8692 | 8656 | // run on device
|
8693 | 8657 |
|
8694 |
| - |
8695 |
| - // Backwards compatibility measure. We'll use this again below. |
8696 |
| - |
8697 |
| - // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been) |
8698 |
| - // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). |
8699 |
| - // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. |
8700 |
| - destMat->numImportPacketsPerLID_.modify_host(); //FIXME |
8701 |
| - |
8702 | 8658 | # ifdef HAVE_TPETRA_MMM_TIMINGS
|
8703 | 8659 | RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
|
8704 | 8660 | # endif
|
|
0 commit comments