NEML2 2.0.0
WorkDispatcher.h
// Copyright 2024, UChicago Argonne, LLC
// All Rights Reserved
// Software Name: NEML2 -- the New Engineering material Model Library, version 2
// By: Argonne National Laboratory
// OPEN SOURCE LICENSE (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#pragma once

#include <functional>
#include <future>
#include <thread>
#include <queue>
#include <mutex>
#include <condition_variable>

#include "neml2/dispatchers/WorkGenerator.h"
#include "neml2/dispatchers/WorkScheduler.h"
#include "neml2/misc/assertions.h"
#include "neml2/misc/types.h"

#include "neml2/tensors/R2.h"

// Pre-C++20 workaround for std::type_identity
// https://en.cppreference.com/w/cpp/types/type_identity
template <class T>
struct type_identity
{
  using type = T;
};
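
// For reference, `typename type_identity<T>::type` simply names T, and because the nested
// ::type is a non-deduced context it is the usual way to exclude a parameter from template
// argument deduction (the role std::type_identity plays in C++20). A minimal sanity check:
//
//   static_assert(std::is_same_v<typename type_identity<int>::type, int>);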

namespace neml2
{
template <typename I,
          typename O,
          typename Of = typename std::vector<O>,
          typename Ip = typename type_identity<I>::type,
          typename Op = typename type_identity<O>::type>
class WorkDispatcher
{
public:
  WorkDispatcher(WorkScheduler & scheduler, bool async, std::function<O(I &&, Device)> && do_work)
    : _scheduler(scheduler),
      _devices(scheduler.devices()),
      _async(async),
      _do_work(std::move(do_work))
  {
    init_thread_pool();
  }

  WorkDispatcher(WorkScheduler & scheduler,
                 bool async,
                 std::function<O(I &&, Device)> && do_work,
                 std::function<O(std::vector<O> &&)> && reduce)
    : _scheduler(scheduler),
      _devices(scheduler.devices()),
      _async(async),
      _do_work(std::move(do_work)),
      _reduce(std::move(reduce))
  {
    init_thread_pool();
  }

  WorkDispatcher(WorkScheduler & scheduler,
                 bool async,
                 std::function<O(I &&, Device)> && do_work,
                 std::function<Of(std::vector<Op> &&)> && reduce,
                 std::function<I(Ip &&, Device)> && preprocess,
                 std::function<Op(O &&)> && postprocess)
    : _scheduler(scheduler),
      _devices(scheduler.devices()),
      _async(async),
      _do_work(std::move(do_work)),
      _reduce(std::move(reduce)),
      _preprocess(std::move(preprocess)),
      _postprocess(std::move(postprocess))
  {
    init_thread_pool();
  }

  WorkDispatcher(WorkScheduler & scheduler,
                 bool async,
                 std::function<O(I &&, Device)> && do_work,
                 std::function<Of(std::vector<Op> &&)> && reduce,
                 std::function<I(Ip &&, Device)> && preprocess,
                 std::function<Op(O &&)> && postprocess,
                 std::function<void(Device)> && thread_init)
    : _scheduler(scheduler),
      _devices(scheduler.devices()),
      _async(async),
      _do_work(std::move(do_work)),
      _reduce(std::move(reduce)),
      _preprocess(std::move(preprocess)),
      _postprocess(std::move(postprocess)),
      _thread_init(std::move(thread_init))
  {
    if (_thread_init)
      neml_assert_dbg(
          _async, "Custom thread initialization functor is only supported in asynchronous mode");

    init_thread_pool();
  }

  WorkDispatcher() = delete;
  WorkDispatcher(const WorkDispatcher &) = delete;
  WorkDispatcher(WorkDispatcher &&) = delete;
  WorkDispatcher & operator=(const WorkDispatcher &) = delete;
  WorkDispatcher & operator=(WorkDispatcher &&) = delete;
  ~WorkDispatcher() { stop_thread_pool(); }

  /// Run the dispatching loop (calls run_sync or run_async based on the async flag)
  Of run(WorkGenerator<Ip> &);
protected:
  /// Initialize the thread pool
  void init_thread_pool();

  /// Thread pool main function
  void thread_pool_main(const Device &);

  /// Run the dispatching loop synchronously
  Of run_sync(WorkGenerator<Ip> &);

  /// Run the dispatching loop asynchronously
  Of run_async(WorkGenerator<Ip> &);

  /// Stop the thread pool
  void stop_thread_pool();

  /// Helper function to validate that the dispatcher is properly configured
  void validate() const;

  /// Reference to the work scheduler
  WorkScheduler & _scheduler;

  /// Device pool requested by the scheduler
  const std::vector<Device> _devices;

  /// Flag to enable asynchronous execution
  const bool _async;

  /// Function to perform the work and return the result
  std::function<O(I &&, Device)> _do_work;

  /// Function to reduce the results
  std::function<Of(std::vector<Op> &&)> _reduce;

  /// Function to preprocess the work
  std::function<I(Ip &&, Device)> _preprocess;

  /// Function to postprocess the result
  std::function<Op(O &&)> _postprocess;

  /// Function to initialize the thread
  std::function<void(Device)> _thread_init;

  /// Results to be reduced
  std::vector<Op> _results;

  std::mutex _qmutex;
  /// Condition variable for the tasks queue
  std::condition_variable _thread_condition;
  /// Flag to stop the thread pool
  bool _stop = false;

  // As it turns out it's undefined behavior to initialize this before the mutex and condition
  // variable
  std::vector<std::thread> _thread_pool;
  /// Task queue for the thread pool
  std::unordered_map<Device, std::queue<std::function<void()>>> _tasks;
};

// Implementation

template <typename I, typename O, typename Of, typename Ip, typename Op>
void
WorkDispatcher<I, O, Of, Ip, Op>::init_thread_pool()
{
  if (!_async)
    return;

  // Setup the task queue
  for (const auto & device : _devices)
    _tasks[device] = std::queue<std::function<void()>>();

  auto nthread = _devices.size();
  _thread_pool.reserve(nthread);
  for (std::size_t i = 0; i < nthread; ++i)
  {
    // This is necessary to initialize the torch linear algebra library prior to threaded calls
    // See: https://github.com/pytorch/pytorch/issues/90613
    auto res = R2::identity().to(_devices[i]).inverse();
    _thread_pool.emplace_back([this, i] { thread_pool_main(_devices[i]); });
  }

  // Initialize the thread
  if (_thread_init)
  {
    for (std::size_t i = 0; i < nthread; ++i)
    {
      auto device = _devices[i];
      auto task = [this, device = device]() mutable
      {
        _thread_init(device);
        _scheduler.completed_work(device, 1);
      };
      _scheduler.dispatched_work(device, 1);
      {
        std::lock_guard<std::mutex> lock(_qmutex);
        _tasks.at(device).push(task);
      }
      _thread_condition.notify_all();
    }
    _scheduler.wait_for_completion();
  }
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
void
WorkDispatcher<I, O, Of, Ip, Op>::thread_pool_main(const Device & device)
{
  while (true)
  {
    std::function<void()> task;
    {
      std::unique_lock<std::mutex> lock(_qmutex);
      _thread_condition.wait(lock, [this, &device] { return _stop || !_tasks.at(device).empty(); });
      if (_stop && _tasks.at(device).empty())
        break;
      task = std::move(_tasks.at(device).front());
      _tasks.at(device).pop();
    }
    task();
  }
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
void
WorkDispatcher<I, O, Of, Ip, Op>::stop_thread_pool()
{
  if (!_async)
    return;
  {
    std::unique_lock<std::mutex> lock(_qmutex);
    _stop = true;
  }
  _thread_condition.notify_all();
  for (auto & thread : _thread_pool)
    thread.join();
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
Of
WorkDispatcher<I, O, Of, Ip, Op>::run(WorkGenerator<Ip> & generator)
{
  if (_async)
    return run_async(generator);
  return run_sync(generator);
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
void
WorkDispatcher<I, O, Of, Ip, Op>::validate() const
{
  if (!_do_work)
    throw NEMLException("Do-work function is not set");

  if constexpr (!std::is_same_v<I, Ip>)
    if (!_preprocess)
      throw NEMLException("Preprocess function is not set");

  if constexpr (!std::is_same_v<O, Op>)
    if (!_postprocess)
      throw NEMLException("Postprocess function is not set");

  if constexpr (!std::is_same_v<Of, std::vector<Op>>)
    if (!_reduce)
      throw NEMLException("Reduce function is not set");
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
Of
WorkDispatcher<I, O, Of, Ip, Op>::run_sync(WorkGenerator<Ip> & generator)
{
  validate();

  Device device = kCPU;
  std::size_t n = 0;
  _results.clear();
  while (generator.has_more())
  {
    _scheduler.schedule_work(device, n);
    if (n <= 0)
      throw NEMLException("Scheduler returned a batch size of " + std::to_string(n));
    // Generate work
    auto && [m, work] = generator.next(n);
    // Preprocess
    if (_preprocess)
      work = _preprocess(std::move(work), device);
    // Do work. Since there is no asynchronous execution, we do not notify the scheduler (this also
    // avoids potential parallel communication incurred by the scheduler)
    auto result = _do_work(std::move(work), device);
    // Postprocess
    if (_postprocess)
      result = _postprocess(std::move(result));
    _results.push_back(result);
  }

  if (_reduce)
    return _reduce(std::move(_results));

  if constexpr (std::is_same<Of, std::vector<Op>>::value)
    return _results;

  throw NEMLException("Internal error: unreachable code");
}

template <typename I, typename O, typename Of, typename Ip, typename Op>
Of
WorkDispatcher<I, O, Of, Ip, Op>::run_async(WorkGenerator<Ip> & generator)
{
  validate();

  Device device = kCPU;
  std::size_t n = 0;
  _results.clear();

  // Keep asking the scheduler for an available device
  // - If the generator has no more work, we break out of the loop
  // - If the scheduler schedules work, we dispatch the work and continue with the dispatching loop
  while (generator.has_more())
  {
    _scheduler.schedule_work(device, n);
    if (n <= 0)
      throw NEMLException("Scheduler returned a batch size of " + std::to_string(n));
    // Generate work
    auto && [m, work] = generator.next(n);
    // Reserve space for the result
    _results.resize(_results.size() + 1);
    auto i = _results.size() - 1;
    // Create the task
    auto task = [this, work = std::move(work), device = device, m = m, i = i]() mutable
    {
      // Preprocess
      if (_preprocess)
        work = _preprocess(std::move(work), device);
      // Do work
      auto result = _do_work(std::move(work), device);
      // Postprocess
      if (_postprocess)
        result = _postprocess(std::move(result));
      // Collect result
      _results[i] = std::move(result);
      // Tell the scheduler that we have completed m batches
      _scheduler.completed_work(device, m);
    };
    // Tell the scheduler that we have dispatched m batches
    _scheduler.dispatched_work(device, m);
    // Enqueue the task
    {
      std::lock_guard<std::mutex> lock(_qmutex);
      _tasks.at(device).push(task);
    }
    // Notify the thread pool
    // Note: We notify_all instead of notify_one because we want the thread that's bound to the
    // target device to pick up the task
    _thread_condition.notify_all();
  }

  // Wait for all tasks to complete
  _scheduler.wait_for_completion();

  if (_reduce)
    return _reduce(std::move(_results));

  if constexpr (std::is_same<Of, std::vector<Op>>::value)
    return _results;

  throw NEMLException("Internal error: unreachable code");
}
} // namespace neml2
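
A minimal usage sketch, assuming a concrete WorkScheduler and a WorkGenerator<Tensor> have been
set up elsewhere by the application; the lambda body is a placeholder for the real per-batch
work, and Of is left at its default of std::vector<O>, so no reduce functor is required:

#include "neml2/dispatchers/WorkDispatcher.h"
#include "neml2/tensors/Tensor.h"

using namespace neml2;

std::vector<Tensor>
dispatch_example(WorkScheduler & scheduler, WorkGenerator<Tensor> & generator)
{
  // I = O = Tensor; asynchronous dispatch over the scheduler's device pool
  WorkDispatcher<Tensor, Tensor> dispatcher(
      scheduler,
      /*async=*/true,
      [](Tensor && work, Device device)
      {
        // ... evaluate one batch of work on the assigned device ...
        return work.to(device);
      });

  // Each entry of the returned vector is the result of one dispatched batch
  return dispatcher.run(generator);
}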