// Copyright 2024, UChicago Argonne, LLC
// All Rights Reserved
// Software Name: NEML2 -- the New Engineering material Model Library, version 2
// By: Argonne National Laboratory
// OPEN SOURCE LICENSE (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "neml2/dispatchers/SimpleMPIScheduler.h"
#include "neml2/misc/assertions.h"

#include <string>
#include <functional>
#include <set>    // std::set, used in setup() to check device ID uniqueness
#include <limits> // std::numeric_limits, used in determine_my_device()

namespace neml2
{

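// Register this scheduler with the NEML2 object registry so it can be constructed from
// input options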
register_NEML2_object(SimpleMPIScheduler);

OptionSet
SimpleMPIScheduler::expected_options()
{
  OptionSet options = WorkScheduler::expected_options();
  options.doc() =
      "Dispatch work in fixed batch sizes to a single device selected based on processor ID.";

  options.set<std::vector<Device>>("devices");
  options.set("devices").doc() = "List of devices to dispatch work to";

  options.set<std::vector<std::size_t>>("batch_sizes");
  options.set("batch_sizes").doc() = "List of batch sizes for each device";

  options.set<std::vector<std::size_t>>("capacities") = {};
  options.set("capacities").doc() = "List of capacities for each device, defaulting to batch_sizes";

  return options;
}

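// Construct from parsed options; capacities fall back to the batch sizes when the user
// does not specify them explicitly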
SimpleMPIScheduler::SimpleMPIScheduler(const OptionSet & options)
  : WorkScheduler(options),
    _available_devices(options.get<std::vector<Device>>("devices")),
    _batch_sizes(options.get<std::vector<std::size_t>>("batch_sizes")),
    _capacities(options.get("capacities").user_specified()
                    ? options.get<std::vector<std::size_t>>("capacities")
                    : _batch_sizes),
    _comm(TIMPI::Communicator(MPI_COMM_WORLD))
{
  neml_assert(_available_devices.size() == _batch_sizes.size(),
              "Number of batch sizes must match the number of devices.");
  neml_assert(
      _available_devices.size() == _capacities.size(),
      "Number of capacities must match the number of devices and the number of batch sizes.");
}

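// Validate the device list, then select the device this MPI rank will dispatch to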
void
SimpleMPIScheduler::setup()
{
  WorkScheduler::setup();

  // First pass:
  // - Prohibit any CPU
  // - Check if any CUDA device is present
  bool cuda = false;
  for (const auto & device : _available_devices)
  {
    neml_assert(!device.is_cpu(), "CPU device is not allowed in SimpleMPIScheduler");
    if (device.is_cuda())
      cuda = true;
    else
      neml_assert(false, "Unsupported device type: ", device);
  }

  // Second pass:
  // - If multiple CUDA devices are present, make sure each CUDA device has a concrete
  //   (nonnegative), unique device ID
  bool has_multiple_cuda_devices = _available_devices.size() > 1 && cuda;
  if (has_multiple_cuda_devices)
  {
    std::set<DeviceIndex> cuda_device_ids;
    for (const auto & device : _available_devices)
    {
      auto device_id = device.index();
      neml_assert(device_id >= 0, "Device ID must be nonnegative");
      neml_assert(cuda_device_ids.find(device_id) == cuda_device_ids.end(),
                  "Device ID must be unique. Found duplicate: ",
                  device_id);
      cuda_device_ids.insert(device_id);
    }
  }

  determine_my_device();
}

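// Hash the hostname and split the world communicator by that hash, so ranks on the same
// node end up in the same sub-communicator; the rank within that sub-communicator then
// serves as the index into the device list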
void
SimpleMPIScheduler::determine_my_device()
{
  // NOLINTBEGIN(modernize-avoid-c-arrays)
  char c_str_hostname[MPI_MAX_PROCESSOR_NAME];
  int name_len = 0;
  timpi_call_mpi(MPI_Get_processor_name(c_str_hostname, &name_len));
  std::string hostname = std::string(c_str_hostname);

  std::hash<std::string> hasher;
  int id = static_cast<int>(hasher(hostname) % std::numeric_limits<int>::max());

  // Make a new communicator based on this hashed hostname
  TIMPI::Communicator new_comm;
  _comm.split(id, int(_comm.rank()), new_comm);
  // Assign our device index based on the new communicator
  _device_index = new_comm.rank();
  neml_assert(new_comm.size() <= _available_devices.size(),
              "MPI split by host would require too many devices");
  // NOLINTEND(modernize-avoid-c-arrays)
}

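// Offer work on the selected device, unless dispatching one more batch would exceed the
// device's capacity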
bool
SimpleMPIScheduler::schedule_work_impl(Device & device, std::size_t & batch_size) const
{
  if (_load + _batch_sizes[_device_index] > _capacities[_device_index])
    return false;

  device = _available_devices[_device_index];
  batch_size = _batch_sizes[_device_index];
  return true;
}

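// Record newly dispatched work against the current load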
void
SimpleMPIScheduler::dispatched_work_impl(Device, std::size_t n)
{
  _load += n;
}

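// Retire completed work, guarding against underflow of the load counter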
void
SimpleMPIScheduler::completed_work_impl(Device, std::size_t n)
{
  neml_assert(_load >= n, "Load underflow");
  _load -= n;
}

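// All work has completed once no dispatched batches remain outstanding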
bool
SimpleMPIScheduler::all_work_completed() const
{
  return _load == 0;
}

} // namespace neml2