runtime/api/scheduler_8c_source.html

/* -*- C -*-

 * Serene programming language

 * Copyright (C) 2019-2026 Sameer Rahmani <[email protected]>

 *

 * This library is free software: you can redistribute it and/or modify

 * it under the terms of the GNU Lesser General Public License as published by

 * the Free Software Foundation, either version 3 of the License, or

 * (at your option) any later version.

 *

 * This library is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU Lesser General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public License

 * along with this library.  If not, see <https://www.gnu.org/licenses/>.

 */


#include <inttypes.h>

#include <stdatomic.h>

#include <stdio.h>


#include "serene/rt/context.h"

#include "serene/rt/engine.h"

#include "serene/rt/fiber.h"

#include "serene/rt/fiber/thread.h"

#include "serene/rt/mm/interface.h"

#include "serene/rt/reactor.h"

#include "serene/utils.h"


/// Scheduler log line. Forwards to `DBG` with "SCHED" as its first argument,
/// followed by the format string and its arguments. `__VA_OPT__` emits the
/// separating comma only when arguments follow FMT, so a bare
/// `SCHED_LOG("text")` still expands to a well-formed call.
#define SCHED_LOG(FMT, ...) DBG("SCHED", FMT __VA_OPT__(, ) __VA_ARGS__)


/// Per-operation deque and queue tracing (push, pop, steal, wake). This fires

/// on the hot path, so it floods a debug build and is off unless

/// SRN_SCHED_TRACE is defined. SCHED_LOG (scheduler lifecycle, stop, shutdown

/// reap) stays on in a debug build. Both are silent in release, since DBG is.

#ifdef SRN_SCHED_TRACE
#  define SCHED_TRACE(FMT, ...) DBG("SCHED", FMT __VA_OPT__(, ) __VA_ARGS__)
#else
// Tracing disabled. Expand to an empty statement that still requires the
// trailing semicolon and swallows its arguments unevaluated. The definition is
// kept on one line on purpose: a backslash continuation followed by a blank
// line ends the macro early and leaves `do { } while (0)` at file scope.
#  define SCHED_TRACE(...) do { } while (0)
#endif


// -----------------------------------------------------------------------------

// Model

// -----------------------------------------------------------------------------

// The scheduler decides what runs next and owns the global queue and the

// registry. It does not run fibers. The os threads do. Each os thread runs the

// worker routine over its own worker, which holds a local run queue. The

// routine finds a fiber -- the worker's local queue first, then the global

// queue, then stealing one from a peer -- switches into it, and handles how it

// gives up control. A fiber yields, suspends, or finishes by switching to its

// worker's loop, which is therefore the resumer. The single scheduler is shared

// by every worker.

//

// Migration hygiene. A fiber may run on any worker and resume on a different

// one than it last ran on, so it carries no thread identity. The only

// thread-bound state is the `current_worker` thread-local, read fresh at each

// use and never stored on a fiber across a switch. Allocating through a fiber's

// context is safe from any worker, since the memory manager locks its own

// blocks, and a context is the shared engine arena rather than a per-thread

// one. The one rule the IO layer must keep is to read `errno` before any

// suspend or yield, since the fiber may resume on another thread where `errno`

// differs.


// -----------------------------------------------------------------------------

// Work-stealing

// -----------------------------------------------------------------------------

// Each worker owns a Chase-Lev work-stealing deque (a fixed ring of ready

// fibers, see srn_worker_t), and the scheduler keeps one shared global queue

// besides. Two signed, monotonically increasing counters index a worker's ring.

// `bottom` is the owner's end and `top` is the thieves' end. The live slot for

// an index is `index & (SRN_FIBER_LOCAL_RING_CAP - 1)`.

//

//   top (thieves take the oldest)      bottom (owner pushes/pops the newest)

//    |                                  |

//    [ f3 ][ f4 ][ f5 ][ f6 ][ f7 ]

//

// The owner pushes and pops `bottom`, and since only it touches that end the

// common path is lock free and uncontended. Thieves take from `top` with a

// compare-and-swap, so several can race and exactly one wins. The owner and the

// thieves touch opposite ends, so they collide only over the last remaining

// element, a race `local_pop` and `local_steal` settle with a seq-cst fence and

// the CAS on `top`. The counters are signed so the transient `bottom - 1` on an

// empty deque reads as -1 ("empty") rather than wrapping.

//

// Finding work. A worker drains its own deque, then the global queue, then

// steals one fiber from each peer in turn (`find_work`). Work made while a

// fiber runs (a yield, spawn, or wake) goes onto the running worker's own

// deque, keeping it local. Work from off a worker, or work that overflows a

// full deque, goes to the global queue. So the global lock stays off the hot

// path while every os thread is busy -- it is taken only for the global queue

// and to wake a parked os thread (when one exists).

//

// The deque follows the weak memory correct formulation, so the fences are

// right on ARM and the like, not only on x86:

//   - Chase & Lev, "Dynamic Circular Work-Stealing Deque", SPAA 2005.

//     https://doi.org/10.1145/1073970.1073974

//   - Le, Pop, Cohen & Zappa Nardelli, "Correct and Efficient Work-Stealing for

//     Weak Memory Models", PPoPP 2013. https://doi.org/10.1145/2442516.2442524

//     (PDF: https://fzn.fr/readings/ppopp13.pdf)


/// Defined here, not in fiber.h, which only forward declares `srn_scheduler_t`.

/// Consumers hold a `srn_scheduler_t *` and never see the layout. Two reasons:

///

///  1. The layout can evolve without recompiling or disturbing consumers. Going

///     M:N this struct grows per-thread local queues, a worker array, a reactor

///     handle, steal state -- none of which should ripple into every

///     translation unit that includes `fiber.h`.

///

///  2. It makes the decide/execute boundary physical. The scheduler owns the

///     ready queue and the picking policy. The worker routine and fibers must

///     reach it only through enqueue/yield/ready. Keeping the fields private
///     means no caller *can* poke the queue directly -- the encapsulation is
///     the contract, enforced by the compiler rather than by convention.

/// Forward declaration of the worker type. `srn_scheduler_t` holds a
/// `srn_worker_t *` array before `struct srn_worker_t` is defined further down
/// in this file.
typedef struct srn_worker_t srn_worker_t;


/// The scheduler's lifecycle as one atomic value. `RUNNING` means a run is

/// servicing the queues. `DRAINING` is the graceful wind down set by

/// `srn_sched_drain`, workers keep running every runnable fiber and let

/// in-flight IO finish, but new IO submissions are fenced (see

/// `srn_sched_accepting_submissions`) so fibers unwind instead of parking on

/// fresh ops, and the pool converges to quiescence. `STOPPING` tells the

/// workers to wind down at once, set at natural quiescence or abruptly by

/// `srn_sched_stop`. `IDLE` is the resting state before a run starts. Workers

/// read it without the lock, so it is atomic.

///

/// The order matters, a state at or past `DRAINING` no longer accepts new IO,

/// which `srn_sched_accepting_submissions` relies on.


typedef enum srn_sched_state_t {
  /// Resting state before a run starts. `srn_sched_init` leaves the scheduler
  /// here.
  SRN_SCHED_IDLE,
  /// A run is servicing the queues.
  SRN_SCHED_RUNNING,
  /// Graceful wind down. Runnable fibers keep running and in-flight IO
  /// finishes, but new IO submissions are fenced.
  SRN_SCHED_DRAINING,
  /// Wind down at once. `worker_run` leaves its loop when it observes this
  /// value between fibers.
  SRN_SCHED_STOPPING,
} srn_sched_state_t;


/// The single scheduler shared by every worker. Its layout is private to this
/// file, other translation units only hold a `srn_scheduler_t *`.
struct srn_scheduler_t {
  /// The engine this scheduler was created for (the argument of
  /// `srn_sched_init`). This file reaches the memory manager (`engine->mm`)
  /// and the reactor (`engine->reactor`) through it.
  srn_engine_t *engine;
  /// Global lock. Guards the global/overflow queue, the registry, and the
  /// worker coordination fields below. It does NOT guard the per-worker local
  /// deques. Those are lock free and synchronize through their own atomics
  /// (see `srn_worker_t`, which has no lock field), so there is no local lock
  /// to order against this one.
  srn_mutex_t lock;
  /// Global / overflow queue. Holds fibers enqueued with no current worker --
  /// an external waker, or the initial fibers made before the run -- and
  /// fibers that did not fit a full local deque. A worker drains its own local
  /// deque first, then this. FIFO through the intrusive `link`.
  srn_fiber_t *ready_head;
  srn_fiber_t *ready_tail;
  /// Registry, head of the doubly-linked list (through `reg_prev`/`reg_next`)
  /// of every live fiber, on a different axis from the run queues. A fiber
  /// joins at srn_fiber_make and leaves when reaped, so the list is every fiber
  /// the scheduler is still responsible for, including SUSPENDED ones that sit
  /// on no run queue. It is how the scheduler accounts for and cleans them up.
  srn_fiber_t *registry;

  /// Worker coordination. Parked os threads wait on `work`. `idle` counts
  /// parked os threads and `runnable` counts fibers waiting in ANY queue (local
  /// or global). Both are atomic because a push reads `idle`, and the park path
  /// reads `runnable`, without holding the lock the other side updates them
  /// under. The "idle++ then read runnable" park ordering against the
  /// "runnable++ then read idle" push ordering is what makes a lost wakeup
  /// impossible.
  ///
  /// WARNING: that pairing is correct ONLY because all four of those operations
  /// are seq_cst (the default for `atomic_fetch_add` and `atomic_load`).
  /// `announce_work` does its `runnable++` and its `idle` read with NO lock
  /// held, so the single seq_cst total order is the only thing tying it to the
  /// park path. Do NOT weaken these to acq_rel or relaxed for "speed". Weaken
  /// them and a push and a park can each fail to see the other, so an os thread
  /// sleeps on `work` forever while a runnable fiber sits in a queue. That is a
  /// lost wakeup, a hang. If these ever must be relaxed, the whole
  /// `runnable`/`idle` handshake has to move under the lock first, the way
  /// `global_enqueue` already does it, so the lock supplies the ordering the
  /// weaker atomics would not.
  ///
  /// `nworkers` is fixed for a run. `state` drives termination. An os thread
  /// stops once it observes `SRN_SCHED_STOPPING`, set at quiescence (idle ==
  /// nworkers and runnable == 0) or by `srn_sched_stop`.
  srn_cond_t work;
  atomic_size_t idle;
  atomic_size_t runnable;
  size_t nworkers;

  _Atomic srn_sched_state_t state;
  /// `srn_sched_run` allocates these two arrays and `srn_sched_shutdown` frees
  /// them. They live until shutdown, so shutdown can join the threads and reap.
  /// The scheduler struct itself is immortal, but these arrays are not.
  ///
  /// `workers` holds all `nworkers` workers in one array, so each worker can
  /// find the others to steal from. `os_threads` holds the OS threads the
  /// scheduler started. There is not one thread per worker. The caller's own
  /// thread runs worker 0, so the scheduler never starts a thread for it. Only
  /// workers 1 through nworkers-1 get a thread, so slot 0 of `os_threads` is
  /// unused and shutdown joins slots 1 through nworkers-1. A thread belongs
  /// here, not in `srn_worker_t`, because it marks a thread the scheduler
  /// started and must join, which is not the same as being a worker.
  srn_worker_t *workers;
  srn_thread_t *os_threads;
  /// True for the duration of an `srn_sched_run` call. `srn_sched_shutdown`
  /// reads it to reject being called while a run is still in flight (it must
  /// run after `run` has returned). Atomic because shutdown may read it from a
  /// different thread than the one inside `run`.
  _Atomic bool run_active;
  /// Set once `srn_sched_shutdown` has torn the scheduler down. The scheduler
  /// is not usable afterwards, a further `run` panics, and a further `shutdown`
  /// is a no-op. This is a plain, non-atomic bool, read and written by
  /// `srn_sched_shutdown` without taking `lock`.
  bool destroyed;
};


/// Capacity of each worker's local work-stealing deque. Must be a power of two:

/// the live slot for a deque index is `index & (cap - 1)`. 256 matches the

/// common choice (Go, Tokio). A fiber that does not fit overflows to the global

/// queue. This is the single source of truth for the size, so a configuration

/// layer can later drive it.

#define SRN_FIBER_LOCAL_RING_CAP 256
// Zero also satisfies `x & (x - 1) == 0`, so positivity is checked explicitly.
// A zero capacity would declare a zero-length ring and turn the index mask
// `cap - 1` into all ones.
static_assert(
  SRN_FIBER_LOCAL_RING_CAP > 0 &&
    (SRN_FIBER_LOCAL_RING_CAP & (SRN_FIBER_LOCAL_RING_CAP - 1)) == 0,
  "SRN_FIBER_LOCAL_RING_CAP must be a non-zero power of two"
);


/// The state one os thread uses to run fibers. The worker's loop (`loop`)

/// represents the os thread itself and is the resumer for every fiber this

/// worker runs. Each worker owns a lock free Chase-Lev work stealing deque. The

/// owner pushes and pops the `bottom` end, while thieves take from the `top`

/// end. So the common push and pop touch no lock, and only a steal contends.


struct srn_worker_t {
  /// The shared scheduler. `find_work` and `worker_run` reach the global
  /// queue, the peer array, and the counters through it.
  srn_scheduler_t *sched;
  /// The worker's own loop, embedded as a fiber. It stands for the os thread
  /// itself and is the resumer for every fiber this worker runs.
  srn_fiber_t loop;
  /// NOTE(review): presumably the fiber this worker is running right now, or
  /// null between fibers. No use is visible in this part of the file, confirm
  /// against `worker_run`.
  srn_fiber_t *current;
  /// This worker's id. `find_work` treats it as the worker's position in
  /// `sched->workers` when it walks the peers to steal from, and the traces
  /// print it.
  srn_worker_id_t id;

  /// Count of fibers this worker has created so far. Feeds the autogenerated
  /// fiber names, worker local so no synchronization is needed.
  uint64_t spawned;

  /// Chase-Lev deque. `top` and `bottom` are signed and monotonically
  /// increasing (signed so the transient `bottom - 1` on an empty deque does
  /// not wrap), and the live slot for an index is `index &
  /// (SRN_FIBER_LOCAL_RING_CAP - 1)`. Only the owner stores to `bottom`
  /// (`local_push`, `local_pop`). `top` is advanced by compare-and-swap, by a
  /// thief in `local_steal` or by the owner taking the last element in
  /// `local_pop`.
  /// TODO(lxsameer): Make the ring capacity configurable via CLI args.
  atomic_intptr_t top;
  atomic_intptr_t bottom;
  _Atomic(srn_fiber_t *) ring[SRN_FIBER_LOCAL_RING_CAP];
  // TODO(lxsameer): a per-thread ring of free stacks to recycle could live
  // here.
};


/// The worker the calling os thread is running, or null when this os thread is

/// not running the worker routine (srn_sched_run is not active on it). This

/// thread-local is the seam that resolves the resumer, the current fiber, and

/// "are we in a fiber?" -- all per os thread state that cannot live in the

/// single shared scheduler.

/// Assigned by `worker_run` on entry. `push_ready` reads it to choose the
/// calling worker's local deque over the global queue, and
/// `srn_sched_shutdown` panics when it is non-null.
static _Thread_local srn_worker_t *current_worker = nullptr;


// -----------------------------------------------------------------------------

// Lifecycle

// -----------------------------------------------------------------------------


/// Create the scheduler for `engine`, in its resting `SRN_SCHED_IDLE` state.
///
/// The scheduler outlives every context and fiber, so it is allocated from the
/// memory manager's immortal region rather than a releasable block. Every field
/// is initialised here. The queues and the registry start empty, and no worker
/// or os thread exists yet (`nworkers` is 0 and both arrays are null).
///
/// `engine` must not be null. A null engine, a null allocation, or a failure to
/// initialise the lock or the condition goes through `PANIC_IF_NULL` /
/// `PANIC_IF`. Returns the new scheduler, the caller (`srn_engine_make`) stores
/// it in the engine.
srn_scheduler_t *srn_sched_init(srn_engine_t *engine) {
  PANIC_IF_NULL(engine);
  srn_scheduler_t *sched = srn_mm_immortal_allocate(engine->mm, srn_scheduler_t);
  PANIC_IF_NULL(sched);

  sched->engine     = engine;
  sched->ready_head = nullptr;
  sched->ready_tail = nullptr;
  sched->registry   = nullptr;
  sched->nworkers   = 0;
  sched->workers    = nullptr;
  sched->os_threads = nullptr;
  sched->destroyed  = false;

  // The struct is not shared with any other thread yet, so the atomics are
  // set up with `atomic_init`, the dedicated initialiser, rather than with
  // seq_cst stores. All four atomic fields are handled the same way.
  atomic_init(&sched->idle, 0);
  atomic_init(&sched->runnable, 0);
  atomic_init(&sched->state, SRN_SCHED_IDLE);
  atomic_init(&sched->run_active, false);

  PANIC_IF(
    srn_mutex_init(&sched->lock) != SRN_THREAD_OK, "failed to initialise the scheduler lock"
  );

  PANIC_IF(
    srn_cond_init(&sched->work) != SRN_THREAD_OK, "failed to initialise the scheduler condition"
  );

  // srn_engine_make will store this scheduler in the engine
  return sched;
}


/// Link `fiber` in as the new first entry of the registry list. The caller
/// must already hold `sched->lock`.
static void registry_add(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  srn_fiber_t *old_head = sched->registry;

  fiber->reg_next = old_head;
  fiber->reg_prev = nullptr;
  if (old_head != nullptr) {
    // The previous head now has a predecessor.
    old_head->reg_prev = fiber;
  }

  sched->registry = fiber;
}


/// Take `fiber` out of the registry list and clear its registry links. The
/// back pointer makes this constant time, with no walk of the list. The caller
/// must already hold `sched->lock`.
static void registry_remove(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  srn_fiber_t *before = fiber->reg_prev;
  srn_fiber_t *after  = fiber->reg_next;

  if (before == nullptr) {
    // `fiber` was the head, so its successor becomes the new head.
    sched->registry = after;
  } else {
    before->reg_next = after;
  }

  if (after != nullptr) {
    after->reg_prev = before;
  }

  fiber->reg_prev = nullptr;
  fiber->reg_next = nullptr;
}


/// Add `fiber` to the scheduler's registry of live fibers. Takes `sched->lock`
/// around the insert, so the caller must NOT already hold it. Both arguments
/// must be non-null. The fiber is only recorded here, it is not put on any run
/// queue.
void srn_sched_register(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  PANIC_IF_NULL(sched);
  PANIC_IF_NULL(fiber);

  srn_mutex_lock(&sched->lock);
  registry_add(sched, fiber);
  srn_mutex_unlock(&sched->lock);
}


/// Tear the scheduler down once its run has returned. Joins the os threads the
/// scheduler started, releases the stack of every fiber still in the registry,
/// frees the run-scoped arrays, and destroys the lock and the condition. The
/// scheduler struct itself is immortal-allocated and is not freed.
///
/// It must be called from outside the worker pool and only after
/// `srn_sched_run` has returned, either violation panics. Calling it on a
/// scheduler that is already destroyed is a no-op.
void srn_sched_shutdown(srn_scheduler_t *sched) {
  PANIC_IF_NULL(sched);

  // A second shutdown has nothing left to do.
  if (sched->destroyed) {
    return;
  }

  // A worker, or a fiber (which runs on a worker), would be tearing down the
  // scheduler it is itself running on. `current_worker` is null only off a
  // worker, so it is the test for "outside the pool".
  PANIC_IF(
    current_worker != nullptr, "srn_sched_shutdown must be called from outside the worker pool"
  );

  // Shutdown comes after the run, never during it. A running pool is stopped
  // with `srn_sched_stop`, and `srn_sched_run` has to return first.
  PANIC_IF(
    atomic_load(&sched->run_active),
    "srn_sched_shutdown called while srn_sched_run is active; call "
    "srn_sched_stop and let the run return first"
  );

  // Worker 0 ran inline on the caller's thread and has stopped with the run.
  // The spawned os threads may still be winding down (`srn_sched_run` does not
  // join them), so join them here. Slot 0 of `os_threads` was never spawned,
  // which leaves slots 1..nworkers-1.
  for (size_t slot = 1; slot < sched->nworkers; slot++) {
    (void)srn_thread_join(&sched->os_threads[slot]);
  }

  // From here on this runs single threaded, every worker is gone.
  //
  // A fiber still in the registry never finished. It was left parked in
  // SRN_FIBER_SUSPENDED with no party able to wake it (a deadlock), or was left
  // queued when the run stopped early. Its stack is released so it does not
  // leak. The fiber structs live in context blocks and are reclaimed with those
  // blocks, not here.
  //
  // Unlike the reap path, this unmaps for good. The per-thread stack ring from
  // the fiber.h TODO no longer exists after the workers are gone, so there is
  // nothing to recycle into. (Draining that ring, when it exists, also belongs
  // here.)
  for (srn_fiber_t *fiber = sched->registry, *next = nullptr; fiber != nullptr; fiber = next) {
    // Read the successor before the links are cleared below.
    next = fiber->reg_next;

    SCHED_LOG(
      "shutdown reaping unfinished fiber '%s' (never scheduled, or suspended with no waker?)",
      fiber->name
    );
    // TODO(lxsameer): Free up the ring here as well
    FIBER_TRACEPOINT(fiber_stack_free, (void *)fiber, fiber->name);
    srn_fiber_stack_free(fiber->stack);
    srn_fiber_on_reap(fiber);

    fiber->reg_prev = nullptr;
    fiber->reg_next = nullptr;
  }
  sched->registry = nullptr;

  // Give back the run-scoped arrays. srn_mm_free tolerates null, so a
  // scheduler that never ran is fine.
  srn_mm_free(sched->engine->mm, sched->os_threads);
  srn_mm_free(sched->engine->mm, sched->workers);
  sched->os_threads = nullptr;
  sched->workers    = nullptr;
  sched->nworkers   = 0;

  // The scheduler is unusable after this, so the primitives are destroyed and
  // not re-initialised. The join above guarantees no worker holds or waits on
  // them.
  (void)srn_cond_destroy(&sched->work);
  (void)srn_mutex_destroy(&sched->lock);

  sched->destroyed = true;
}


/// Account for a fiber that has just joined a queue and, if an os thread is
/// parked, wake one. "Parked" means the os thread is asleep in `srn_cond_wait`
/// because it found nothing runnable anywhere. That is the whole os thread
/// blocked, not a fiber suspending, and the notify makes it look again.
///
/// The order is `runnable++` first, `idle` read second. The park path does the
/// mirror image (`idle++`, then read `runnable`), so the two sides can never
/// both miss each other and no wakeup is lost. The global lock (the
/// condition's lock) is taken only when an os thread is actually parked, so
/// the common all-busy case never touches it.
///
/// WARNING: the `runnable++` and the `idle` read run with NO lock held. The
/// seq_cst total order is their only tie to the park path, so both must stay
/// seq_cst. RELAX EITHER AND THE WAKEUP CAN BE LOST (an os thread parked with a
/// runnable fiber queued). The coordination comment on `srn_scheduler_t` has
/// the full reasoning.
static void announce_work(srn_scheduler_t *sched) {
  PANIC_IF_NULL(sched);

  // Count the fiber before looking for sleepers. Do not reorder.
  atomic_fetch_add(&sched->runnable, 1);

  if (atomic_load(&sched->idle) == 0) {
    // Every os thread is busy, nobody to wake.
    return;
  }

  // At least one os thread is parked on `work`. Wake one of them.
  srn_mutex_lock(&sched->lock);
  SCHED_TRACE("waking a parked os thread (runnable=%ld)", (long)atomic_load(&sched->runnable));
  srn_cond_notify_one(&sched->work);
  srn_mutex_unlock(&sched->lock);
}


// The local deque is a Chase-Lev work-stealing deque. The fence placement

// follows Le, Pop, Cohen and Zappa Nardelli, "Correct and Efficient

// Work-Stealing for Weak Memory Models" (PPoPP 2013), so it is correct on

// weakly-ordered CPUs, not just on x86's strong model. The buffer is fixed, so

// there is no resize and no buffer reclamation. The `runnable` count is

// adjusted by the enqueue path (push) and by find_work (the only taker).


/// Owner only. Push `fiber` on the bottom of `w`'s deque. Returns true when
/// the fiber was stored, or false when the deque already holds
/// SRN_FIBER_LOCAL_RING_CAP fibers, so the caller can overflow it to the global
/// queue. The caller has already set the fiber's `state`. The `runnable` count
/// is not touched here.
static bool local_push(srn_worker_t *w, srn_fiber_t *fiber) {
  PANIC_IF_NULL(w);
  PANIC_IF_NULL(fiber);

  // `bottom` is written only by the owner, so a relaxed load is enough. `top`
  // is advanced by thieves, hence the acquire.
  intptr_t b = atomic_load_explicit(&w->bottom, memory_order_relaxed);
  intptr_t t = atomic_load_explicit(&w->top, memory_order_acquire);
  if (b - t >= (intptr_t)SRN_FIBER_LOCAL_RING_CAP) {
    return false; // full
  }

  atomic_store_explicit(&w->ring[b & (SRN_FIBER_LOCAL_RING_CAP - 1)], fiber, memory_order_relaxed);
  // At this point the slot is not visible to thieves yet. They decide what is
  // live by reading `bottom`, which has not been bumped.

  // Publish the slot. The release fence orders the slot write above before the
  // `bottom` store below, which is what exposes the slot to a thief. Paired
  // with the thief's acquire load of `bottom` in `local_steal`, it guarantees
  // that a thief which sees the new `bottom` also sees the fiber just written,
  // never a stale slot.
  atomic_thread_fence(memory_order_release);
  atomic_store_explicit(&w->bottom, b + 1, memory_order_relaxed);

  SCHED_TRACE("worker %zu local-push fiber %p", w->id, (void *)fiber);
  return true;
}


/// Owner only. Pop the newest fiber from the bottom of `w`'s deque. Returns
/// null when the deque is empty, or when a thief won the race for the last
/// remaining element. The seq_cst fence and the compare-and-swap on `top`
/// settle that race. The `runnable` count is not touched here, `find_work`
/// adjusts it.
static srn_fiber_t *local_pop(srn_worker_t *w) {
  PANIC_IF_NULL(w);

  // `bottom` has a single writer, the owner, so a load followed by a store is
  // enough and no `atomic_fetch_sub` is needed. Lowering `bottom` first
  // reserves the bottom slot before `top` is examined.
  intptr_t b = atomic_load_explicit(&w->bottom, memory_order_relaxed) - 1;
  atomic_store_explicit(&w->bottom, b, memory_order_relaxed);

  // Pairs with the seq_cst fence in `local_steal`. It orders the `bottom`
  // store above before the `top` load below.
  atomic_thread_fence(memory_order_seq_cst);
  intptr_t t = atomic_load_explicit(&w->top, memory_order_relaxed);

  srn_fiber_t *fiber = nullptr;
  if (t <= b) {
    // Non-empty.
    fiber =
      atomic_load_explicit(&w->ring[b & (SRN_FIBER_LOCAL_RING_CAP - 1)], memory_order_relaxed);
    if (t == b) {
      // Last element. The owner and a thief can race for it, so settle it with
      // the CAS on `top`. Exactly one of them wins.
      if (
        atomic_compare_exchange_strong_explicit(
          &w->top, &t, t + 1, memory_order_seq_cst, memory_order_relaxed
        )
      ) {
        SCHED_TRACE("worker %zu popped the last fiber %p", w->id, (void *)fiber);

      } else {
        SCHED_TRACE("worker %zu lost the last fiber %p to a thief", w->id, (void *)fiber);
        fiber = nullptr; // the thief won
      }
      // Win or lose, `top` has moved one past the element, so `bottom` goes
      // back up to meet it and the deque reads as empty.
      atomic_store_explicit(&w->bottom, b + 1, memory_order_relaxed);
    }
  } else {
    // Empty. Restore bottom.
    atomic_store_explicit(&w->bottom, b + 1, memory_order_relaxed);
  }
  return fiber;
}


/// Thief side. Take the oldest fiber from the top of `victim`'s deque. Returns
/// null when the deque is empty or when a concurrent take (another thief, or
/// the owner popping the last element) won the race. There is no retry here,
/// the caller just moves on to the next victim. The `runnable` count is not
/// touched here, `find_work` adjusts it.
static srn_fiber_t *local_steal(srn_worker_t *victim) {
  PANIC_IF_NULL(victim);

  intptr_t t = atomic_load_explicit(&victim->top, memory_order_acquire);
  // Pairs with the `seq_cst` fence in `local_pop`. The two fences force a
  // single total order in which the owner (lowering bottom, fence, reading top)
  // and this thief (reading top, fence, reading bottom) cannot both decide they
  // got the last element. In short, this fence handles the steal race against
  // `local_pop`.
  // Note: do not confuse this `memory_order_seq_cst` fence with the
  // `memory_order_acquire` used to load the victim's `bottom` on the next
  // line. They solve different problems.
  atomic_thread_fence(memory_order_seq_cst);
  // This acquire on `bottom` handles slot visibility against `local_push`.
  intptr_t b = atomic_load_explicit(&victim->bottom, memory_order_acquire);

  srn_fiber_t *fiber = nullptr;
  if (t < b) {
    // Read the candidate first, then claim it. The read only counts if the CAS
    // below moves `top` from `t` to `t + 1`.
    fiber =
      atomic_load_explicit(&victim->ring[t & (SRN_FIBER_LOCAL_RING_CAP - 1)], memory_order_relaxed);
    if (!atomic_compare_exchange_strong_explicit(
          &victim->top, &t, t + 1, memory_order_seq_cst, memory_order_relaxed
        )) {
      fiber = nullptr; // lost the race
    }
  }
  return fiber;
}


/// Append `fiber` to the tail of the global/overflow queue and wake a parked
/// os thread if there is one. The caller has already set the fiber's `state`.
///
/// Unlike `announce_work`, the push, the `runnable++`, the `idle` read, and
/// the notify all happen under the global lock. This path is therefore
/// serialized against the park path by mutual exclusion and does not lean on
/// the seq_cst ordering argument the lockless path needs.
static void global_enqueue(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  srn_mutex_lock(&sched->lock);

  // Link the fiber in behind the current tail of the intrusive FIFO.
  srn_fiber_t *old_tail = sched->ready_tail;
  fiber->link           = nullptr;
  if (old_tail != nullptr) {
    old_tail->link = fiber;
  } else {
    // The queue was empty, so the fiber is the head as well.
    sched->ready_head = fiber;
  }
  sched->ready_tail = fiber;

  atomic_fetch_add(&sched->runnable, 1);
  SCHED_TRACE(
    "global-push fiber %p (runnable=%ld)", (void *)fiber, (long)atomic_load(&sched->runnable)
  );

  if (atomic_load(&sched->idle) != 0) {
    srn_cond_notify_one(&sched->work);
  }

  srn_mutex_unlock(&sched->lock);
}


/// Detach and return the fiber at the head of the global queue, or null when
/// the queue is empty. Runs under the global lock. The `runnable` count is
/// left alone, `find_work` (the only taker) adjusts it.
static srn_fiber_t *global_take(srn_scheduler_t *sched) {
  srn_mutex_lock(&sched->lock);

  srn_fiber_t *head = sched->ready_head;
  if (head == nullptr) {
    srn_mutex_unlock(&sched->lock);
    return nullptr;
  }

  srn_fiber_t *rest = head->link;
  sched->ready_head = rest;
  if (rest == nullptr) {
    // That was the only fiber, so the queue is empty again.
    sched->ready_tail = nullptr;
  }
  head->link = nullptr;

  srn_mutex_unlock(&sched->lock);
  return head;
}


/// Queue a runnable fiber whose `state` is already `READY`. When the caller is
/// running on a worker the fiber goes onto that worker's local deque, which
/// keeps the work local. When the caller is off a worker (the initial fibers
/// made before the run, or an external waker), or the local deque is full, the
/// fiber goes to the global queue.
static void push_ready(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  srn_worker_t *w = current_worker;

  // `local_push` publishes the fiber before `announce_work` bumps `runnable`.
  // A thief that takes the fiber in between decrements first and transiently
  // drives `runnable` to SIZE_MAX. Every check of the counter is `== 0`, which
  // the wrap cannot satisfy, so this is benign. Do not add `> 0` style or
  // signed comparisons on `runnable` without fixing the ordering here.

  // Fast path. On a worker whose own deque has room.
  if (w != nullptr && local_push(w, fiber)) {
    announce_work(sched);
    return;
  }

  // Still on a worker here means its deque was full. Record the overflow.
  if (w != nullptr) {
    FIBER_TRACEPOINT(sched_overflow, w->id, (void *)fiber);
    SCHED_TRACE("worker %zu local deque full, overflow fiber %p to global", w->id, (void *)fiber);
  }

  // Off a worker, or the deque was full, the global queue takes it.
  global_enqueue(sched, fiber);
}


/// Mark `fiber` as `SRN_FIBER_READY` and put it on a run queue. `push_ready`
/// picks the queue: the calling worker's local deque when there is one with
/// room, the global queue otherwise. Both arguments must be non-null.
///
/// The state is stored unconditionally. There is no compare-and-swap guard like
/// the one in `srn_fiber_schedule` (NEW to READY) or `ready_fiber` (SUSPENDED
/// to READY), so nothing here rejects a fiber that is already queued.
/// NOTE(review): presumably the caller guarantees the fiber sits on no queue,
/// confirm at the call sites.
void srn_sched_enqueue(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  PANIC_IF_NULL(sched);
  PANIC_IF_NULL(fiber);

  fiber->state = SRN_FIBER_READY;
  push_ready(sched, fiber);
}


/// Schedule a freshly made fiber for its first run. The fiber must be in
/// `SRN_FIBER_NEW`. It is flipped to `SRN_FIBER_READY` and queued on the
/// scheduler returned by `srn_fiber_get_scheduler_m`.
///
/// A fiber is scheduled exactly once. A second call, or a call on a fiber in
/// any other state, panics. A suspended fiber is woken with `srn_fiber_ready`
/// instead. `fiber` must not be null.
void srn_fiber_schedule(srn_fiber_t *fiber) {
  PANIC_IF_NULL(fiber);

  // The NEW to READY flip admits exactly one scheduler of this fiber, the
  // same guard ready_fiber uses for SUSPENDED, so a double schedule panics
  // at the losing call site instead of double enqueuing.
  //
  // The compare-and-swap changes state, so it runs as its own statement and
  // only its result is handed to PANIC_IF. The transition then does not depend
  // on how that macro evaluates its condition.
  srn_fiber_state_t expected = SRN_FIBER_NEW;
  const bool admitted =
    atomic_compare_exchange_strong(&fiber->state, &expected, SRN_FIBER_READY);
  PANIC_IF(
    !admitted,
    "srn_fiber_schedule needs a NEW fiber. A fiber is scheduled exactly once, "
    "and a suspended one is woken with srn_fiber_ready"
  );

  push_ready(srn_fiber_get_scheduler_m(fiber), fiber);
}


/// Wake a parked fiber. The wake is a compare-and-swap of its state from
/// `SUSPENDED` to `READY`, and only the caller that wins the swap enqueues the
/// fiber. Racing wakers therefore cannot double-enqueue it, and a fiber that is
/// not parked is left untouched. The scheduler does not check the condition
/// the fiber was waiting for. A fiber woken early resumes, re-checks, and
/// parks again.
static void ready_fiber(srn_scheduler_t *sched, srn_fiber_t *fiber) {
  srn_fiber_state_t expected = SRN_FIBER_SUSPENDED;

  if (!atomic_compare_exchange_strong(&fiber->state, &expected, SRN_FIBER_READY)) {
    // Not parked, or another waker got there first.
    return;
  }

  push_ready(sched, fiber);
}


/// Pick the next fiber for worker `w` to run. The search order is the worker's
/// own deque, then the global queue, then one steal attempt against each peer
/// in turn. Returns null when none of those yields a fiber. `runnable` is
/// decremented once for a fiber that is returned.
static srn_fiber_t *find_work(srn_worker_t *w) {
  srn_scheduler_t *sched = w->sched;

  srn_fiber_t *fiber = local_pop(w);
  if (fiber == nullptr) {
    fiber = global_take(sched);
  }

  // Steal only when both queues came up empty. The walk starts from the
  // right side neighbour in the workers array and, as `hop` grows, wraps
  // around to end at the left side neighbour. It stops at the first hit.
  for (size_t hop = 1; fiber == nullptr && hop < sched->nworkers; hop++) {
    size_t slot          = (w->id + hop) % sched->nworkers;
    srn_worker_t *victim = &sched->workers[slot];

    fiber = local_steal(victim);
    if (fiber != nullptr) {
      FIBER_TRACEPOINT(sched_steal, w->id, victim->id, (void *)fiber);
      SCHED_TRACE("worker %zu stole fiber %p from worker %zu", w->id, (void *)fiber, victim->id);
    }
  }

  if (fiber == nullptr) {
    return nullptr;
  }

  // Whatever the source, the fiber has left a queue.
  atomic_fetch_sub(&sched->runnable, 1);
  return fiber;
}


/// Run the worker routine over `worker` on the calling os thread. Find a fiber,

/// run it, handle how it gave up control, and park when nothing is runnable,

/// until the pool is quiescent. Owns the `current_worker` thread-local for its

/// duration. The hot path (`find_work` hitting the local deque, run) touches

/// only this worker's own lock free deque. The global lock is reached to park,

/// for the global queue, and on every yield, a yielded fiber goes to the

/// global tail so its peers get a turn, which is the point of yielding.


static void worker_run(srn_worker_t *worker) {
  srn_scheduler_t *sched = worker->sched;
  // Publish this worker as the one the calling os thread is running. It is
  // cleared again right before the routine returns.
  current_worker         = worker;

  for (;;) {
    // We check for termination here, between fibers, so an os thread stops at a
    // clean boundary even while its worker's deque still holds work. Whatever
    // is left unrun is reclaimed by `srn_sched_shutdown` through the registry.
    if (atomic_load(&sched->state) == SRN_SCHED_STOPPING) {
      break;
    }

    // Drain this worker's reactor completions before looking for fibers. Runs
    // on the worker, so srn_fiber_ready pushes the woken fibers onto its own
    // deque, keeping them local.
    srn_reactor_consume(sched->engine->reactor, worker->id);

    srn_fiber_t *fiber = find_work(worker);
    if (fiber == nullptr) {
      // Nothing runnable here or in any peer to steal from, so park this os
      // thread. Parking while every other os thread is already parked and
      // nothing is queued means the pool is quiescent. Nothing running can
      // produce more work, so the run ends. Move to STOPPING and wake every os
      // thread to exit. The `runnable` re-check in the loop closes the race
      // with a fiber enqueued between find_work and taking the lock, and
      // absorbs spurious wakeups.
      srn_mutex_lock(&sched->lock);
      atomic_fetch_add(&sched->idle, 1);

      // Quiescence needs all three: every worker counted idle, no runnable
      // fiber, and a reactor that reports itself idle.
      // NOTE(review): the `(int)` cast on the bool result looks redundant.
      if (
        atomic_load(&sched->idle) == sched->nworkers && atomic_load(&sched->runnable) == 0 &&
        (int)srn_reactor_idle(sched->engine->reactor)
      ) {

        // All the os threads are idle. Time to stop.
        atomic_store(&sched->state, SRN_SCHED_STOPPING);
        FIBER_TRACEPOINT(sched_quiesce, worker->id);
        srn_cond_notify_all(&sched->work);
      }

      FIBER_TRACEPOINT(sched_worker_park, worker->id);
      while (atomic_load(&sched->runnable) == 0 &&
             atomic_load(&sched->state) != SRN_SCHED_STOPPING &&
             !srn_reactor_channel_has_completions(sched->engine->reactor, worker->id)) {
        // No runnable fiber around, no completion waiting on this worker's
        // channel, and we are not stopping. A draining pool parks here too,
        // waiting for its in-flight ops to complete and ready their fibers, so
        // they can unwind. DRAINING on its own never ends the park. Runnable
        // work, a completion on this channel, or STOPPING does.
        srn_cond_wait(&sched->work, &sched->lock);
      }

      // The os thread has woken and is no longer parked. Snapshot whether
      // we're stopping while still holding the lock, then drop the lock.
      atomic_fetch_sub(&sched->idle, 1);
      FIBER_TRACEPOINT(sched_worker_unpark, worker->id);
      bool stop = atomic_load(&sched->state) == SRN_SCHED_STOPPING;
      srn_mutex_unlock(&sched->lock);

      if (stop) {
        break;
      }

      // Not stopping, so go round again: consume completions and look for work.
      continue;
    }

    worker->current = fiber;
    // The name is read here, before the switch, because the switch may reap
    // the fiber, after which only the address value survives, not the name.
    FIBER_TRACEPOINT(sched_fiber_running, worker->id, (void *)fiber, fiber->name);

    // The worker routine is the single owner of the RUNNING transition, for a
    // fiber's first run and every resume after a yield.
    fiber->state = SRN_FIBER_RUNNING;

    // Run the fiber. Control comes back here when it yields, suspends or
    // finishes.
    srn_fiber_switch(&worker->loop, fiber);

    // Set by exactly one of the three branches below, then reported by the
    // slice-end tracepoint.
    const char *slice_reason;

    // `fiber` has switched back, and is not on any queue. How it gave up the
    // CPU is read from the fiber. A parked fiber left a commit on itself
    // (`park_commit` set), a finished one is `DONE`, and a yielded one is still
    // `RUNNING`.
    if (fiber->park_commit != nullptr) {
      slice_reason = "suspend";
      // Parked. It is fully off the CPU now, with its context saved, so this is
      // the first moment it is safe to wake (by others). Stamp `SUSPENDED`
      // here, not in `srn_fiber_suspend` before the switch, so the label only
      // ever marks a fiber that is parked and safe to resume. A waker flipping
      // `SUSPENDED` to `READY` can therefore never catch a fiber still parking.
      fiber->state = SRN_FIBER_SUSPENDED;
      FIBER_TRACEPOINT(sched_fiber_suspended, worker->id, (void *)fiber, fiber->name);
      // `park_commit`/`park_arg` are one-shot, carrying the commit across the
      // switch. Clearing them now loses nothing, since the next suspend sets
      // them again and the fiber never reads them on resume. They are copied
      // to locals and cleared before the commit runs.
      srn_fiber_park_fn commit = fiber->park_commit;
      void *park_arg           = fiber->park_arg;
      fiber->park_commit       = nullptr;
      fiber->park_arg          = nullptr;

      // Run the commit now that the fiber is parked. It hands the fiber to its
      // waker (a waiter list, the reactor, and so on), which reschedules it
      // later. A true return means stay parked. A false return means the
      // condition already held, so wake it back up.
      if (!commit(fiber, park_arg)) {
        ready_fiber(sched, fiber);
      }
    } else if (fiber->state == SRN_FIBER_DONE) {
      slice_reason = "done";
      FIBER_TRACEPOINT(sched_fiber_done, worker->id, (void *)fiber, fiber->name);
      // Detach the waiter list (fibers blocked in `srn_fiber_wait_for`) and
      // drop the fiber from the registry under the global lock, which also
      // guards the waiter list against `wait_for_park`. Then wake the waiters
      // and free the stack outside the lock. Each waiter reads this fiber's
      // result, which outlives the reap since only the stack is freed, not the
      // struct.
      srn_mutex_lock(&sched->lock);
      srn_fiber_t *waiters = fiber->waiters;
      fiber->waiters       = nullptr;
      registry_remove(sched, fiber);
      srn_mutex_unlock(&sched->lock);

      while (waiters != nullptr) {
        srn_fiber_t *waiter = waiters;
        // Advance before the wake reuses `link`.
        waiters = waiter->link;
        ready_fiber(sched, waiter);
      }

      // TODO(lxsameer): Instead of freeing the stack, return it to the ring
      // pool
      FIBER_TRACEPOINT(fiber_stack_free, (void *)fiber, fiber->name);
      srn_fiber_stack_free(fiber->stack);
      srn_fiber_on_reap(fiber);
    } else {
      slice_reason = "yield";
      // Yielded. It is fully off the CPU now, with its context saved, so this
      // is the first moment it is safe to put back on a queue, where another
      // worker may take it at once. `srn_fiber_yield` does not enqueue before
      // switching, which would expose a context still being saved to a resuming
      // worker. The fiber goes to the global queue tail, not the local deque:
      // the worker pops its deque LIFO, so a local push would run the same
      // fiber again immediately and starve its peers, making yield a no-op.
      fiber->state = SRN_FIBER_READY;
      FIBER_TRACEPOINT(sched_fiber_ready, worker->id, (void *)fiber, fiber->name);
      global_enqueue(sched, fiber);
    }
    // Slice end, once the reason is known. Presumably pairs with the
    // `sched_fiber_running` tracepoint emitted before the switch, by the fiber
    // address. Passes only the pointer value, never a dereference, so it is
    // safe even after the done branch reaped the fiber's stack.
    FIBER_TRACEPOINT(sched_fiber_finished, worker->id, (void *)fiber, slice_reason);

    worker->current = nullptr;
  }

  // The calling os thread is no longer running the worker routine.
  current_worker = nullptr;
}


/// Start routine of a pool os thread. `arg` is the `srn_worker_t` this thread
/// drives. The worker's loop is initialised here, on the thread that will use
/// it, so the sanitizer captures the right stack bounds. After that the worker
/// routine runs until it returns.
static void worker_main(void *arg) {
  srn_worker_t *self = arg;

  srn_fiber_init_thread(&self->loop);
  FIBER_TRACEPOINT(sched_thread_init, self->id);

  worker_run(self);
}


// Run the scheduler on `nworkers` workers and return once worker 0, which runs
// inline on the calling os thread, has left its worker routine.
//
// `sched` must be a live scheduler that has never run before. `nworkers` is
// the requested worker count; zero means "use the configured count". The
// effective count is resolved below and is always at least one.
//
// Panics on a null or already shut down scheduler, on an overlapping or
// repeated run, on an allocation failure, and on an os thread spawn failure.
void srn_sched_run(srn_scheduler_t *sched, size_t nworkers) {
  PANIC_IF_NULL(sched);
  PANIC_IF(sched->destroyed, "srn_sched_run called on a scheduler that was already shut down");

  // Claiming run_active up front turns an overlapping run into a clean panic
  // instead of two runs clobbering the worker arrays under each other.
  PANIC_IF(
    atomic_exchange(&sched->run_active, true),
    "srn_sched_run called while another run is active on this scheduler"
  );

  // A finished run leaves its worker arrays behind for shutdown to join and
  // free. A second run would replace them while stragglers from the first may
  // still be winding down, reviving those stragglers against the new run's
  // state, and the reactor cannot be activated twice either. A scheduler
  // therefore runs once; re-run support requires reactor reactivation.
  PANIC_IF(
    sched->workers != nullptr,
    "srn_sched_run called on a scheduler that has already run; re-run is not "
    "supported (the reactor cannot be reactivated)"
  );

  // A caller that does not pick a count gets the configured one, and every
  // request is clamped to the configured ceiling. SRN_MAX_WORKERS stays the
  // absolute ceiling above whatever the configuration asks for.
  const srn_configuration_t *config = &sched->engine->config;
  nworkers                          = nworkers == 0 ? config->fiber.workers : nworkers;

  if (nworkers == 0) {
    // A configured count of zero delegates to the machine, one worker per
    // CPU the process may run on.
    nworkers = srn_thread_cpu_count();
  }

  if (nworkers > config->fiber.max_workers) {
    nworkers = config->fiber.max_workers;
  }

  if (nworkers > SRN_MAX_WORKERS) {
    nworkers = SRN_MAX_WORKERS;
  }

  // The clamps above only ever lower the count, so a zero ceiling in the
  // configuration (or a CPU probe that reports zero) would leave it at zero.
  // Worker 0 always runs inline on the caller below, so zero workers would
  // mean zero-sized arrays and a worker routine running on a slot that was
  // never allocated. One worker is the smallest pool that can run.
  if (nworkers == 0) {
    nworkers = 1;
  }

  // Allocate `workers` and `os_threads` on the scheduler so
  // `srn_sched_shutdown` can join the threads and free them later. `workers`
  // has one entry per worker. `os_threads` has one per spawned thread, with
  // slot 0 left empty because the caller runs worker 0 inline (see the struct
  // comment). `runnable` is left alone, it already counts the fibers queued
  // before the run.
  sched->workers = srn_mm_malloc(sched->engine->mm, nworkers * sizeof(srn_worker_t));
  PANIC_IF_NULL(sched->workers);

  sched->os_threads = srn_mm_malloc(sched->engine->mm, nworkers * sizeof(srn_thread_t));
  PANIC_IF_NULL(sched->os_threads);

  for (size_t i = 0; i < nworkers; i++) {
    srn_worker_t *w = &sched->workers[i];
    w->sched        = sched;
    w->id           = i;
    w->current      = nullptr;
    w->spawned      = 0;
    // The deque indices start empty. Its ring slots are written before they are
    // read, and the worker's loop is set up by worker_main on its own os
    // thread.
    atomic_init(&w->top, 0);
    atomic_init(&w->bottom, 0);
  }

  // Publish the coordination state before any os thread starts. `nworkers` must
  // be set first so the quiescence check counts the right total, and the state
  // must be RUNNING before an os thread can observe it. `run_active` was
  // claimed at the top of this call; shutdown reads it to see a run in flight.
  sched->idle     = 0;
  sched->nworkers = nworkers;
  atomic_store(&sched->state, SRN_SCHED_RUNNING);

  // Bring the reactor up with one channel per worker before any worker starts,
  // so a fiber's first IO has a channel to submit on. The notify seam wakes the
  // worker that owns the channel a completion lands on.
  srn_reactor_activate(sched->engine->reactor, nworkers, srn_sched_wake_worker);

  FIBER_TRACEPOINT(sched_run, nworkers);

  // Spawn nworkers - 1 os threads. The calling os thread runs worker 0 inline.
  // A spawn failure at startup is fatal, a partial pool would never reach `idle
  // == nworkers` and so never quiesce.
  for (size_t i = 1; i < nworkers; i++) {
    if (srn_thread_spawn(&sched->os_threads[i], worker_main, &sched->workers[i]) != SRN_THREAD_OK) {
      PANIC("failed to spawn an os thread");
    }
    FIBER_TRACEPOINT(sched_thread_spawn, i);
  }

  worker_main(&sched->workers[0]);

  // Worker 0 has stopped, so the run is over from the caller's point of view.
  // The spawned os threads may still be winding down, so they are NOT joined
  // here. `srn_sched_shutdown` joins them (the `os_threads` live on the
  // scheduler) as part of tearing the subsystem down. Clearing `run_active`
  // lets shutdown proceed. The state stays STOPPING, which keeps any os thread
  // still looping on its way out.
  atomic_store(&sched->run_active, false);
}


void srn_sched_stop(srn_scheduler_t *sched) {
  PANIC_IF_NULL(sched);

  // Exactly one caller moves the state to STOPPING. RUNNING is tried first. If
  // that fails DRAINING is tried, because a drain stalled on an op that never
  // completes must remain abortable, so stop escalates a drain rather than
  // deferring to it.
  srn_sched_state_t from = SRN_SCHED_RUNNING;
  bool flipped           = atomic_compare_exchange_strong(&sched->state, &from, SRN_SCHED_STOPPING);

  if (!flipped) {
    from    = SRN_SCHED_DRAINING;
    flipped = atomic_compare_exchange_strong(&sched->state, &from, SRN_SCHED_STOPPING);
  }

  if (!flipped) {
    // Neither running nor draining (not started, or already stopping), so
    // there is nothing to do.
    return;
  }

  // Os threads that are running see STOPPING at the top of their next turn.
  // The parked ones have to be roused to see it. Notifying under the lock pairs
  // with the park path, so no wakeup is lost.
  srn_mutex_lock(&sched->lock);
  srn_cond_notify_all(&sched->work);
  srn_mutex_unlock(&sched->lock);

  SCHED_LOG("stop requested");
}


void srn_sched_drain(srn_scheduler_t *sched) {
  PANIC_IF_NULL(sched);

  // DRAINING is reachable from RUNNING only. A scheduler that is already
  // draining or stopping, or that is not running at all, keeps its state.
  srn_sched_state_t from = SRN_SCHED_RUNNING;
  const bool draining    = atomic_compare_exchange_strong(&sched->state, &from, SRN_SCHED_DRAINING);

  if (draining) {
    // `srn_sched_accepting_submissions` now returns false, so the next IO a
    // fiber attempts is fenced into a cancelled completion and the fiber
    // unwinds rather than parking on a fresh op. Workers do NOT break on
    // DRAINING: every runnable fiber still runs and every in-flight op still
    // completes, so the pool converges to the same quiescence as a natural
    // finish, which then moves the state to STOPPING. A never-completing
    // in-flight op (an idle recv, a long sleep) stalls this until it finishes.
    // Bounding that needs op CANCEL (E1) and is out of scope here.
    //
    // Wake any os thread that is already parked so it re-checks the state. A
    // worker parked on outstanding IO simply parks again, since DRAINING keeps
    // it parking, which is harmless.
    srn_mutex_lock(&sched->lock);
    srn_cond_notify_all(&sched->work);
    srn_mutex_unlock(&sched->lock);

    SCHED_LOG("drain requested");
  }
}


// -----------------------------------------------------------------------------

// Fiber-facing operations

// -----------------------------------------------------------------------------

// yield  = switch to the loop, which re-enqueues self once the context is saved

// suspend = switch to the loop, which then runs the commit to publish the

//           parked fiber to its waker

// ready  = enqueue(a named fiber), without switching


void srn_fiber_yield(void) {
  // Yielding only makes sense from a fiber running on a pool worker, so both
  // the worker and its current fiber must exist.
  srn_worker_t *worker = current_worker;
  PANIC_IF_NULL(worker);

  srn_fiber_t *self = worker->current;
  PANIC_IF_NULL(self);

  // Nothing is enqueued on this side. The worker routine puts the fiber back on
  // the ready queue after the switch has saved its context. Enqueuing before
  // the switch would let another os thread dequeue and resume the fiber while
  // this os thread is still saving its context. That is two os threads on one
  // fiber stack, which corrupts the switch.
  srn_fiber_t *loop = &worker->loop;
  srn_fiber_switch(self, loop);
}


/// Park the calling fiber. A suspended fiber sits on no scheduler queue, and
/// the scheduler does not track what it is waiting for. That is the job of
/// whoever wakes it. `commit` runs on the worker's loop side after the fiber
/// has switched out, and receives `arg`. It hands the fiber's pointer to the
/// event source the fiber blocks on (a peer fiber, a lock's waiter list, the IO
/// reactor's fd table), so that party can call srn_fiber_ready once the awaited
/// event occurs. Because commit runs only after the suspend has completed, a
/// waker can never observe a half-suspended fiber, which is what makes the
/// hand-off race free. A commit that registers the fiber nowhere loses it for
/// good: a deadlock, like an os thread blocking on a condition nobody signals.
void srn_fiber_suspend(srn_fiber_park_fn commit, void *arg) {
  PANIC_IF_NULL(commit);

  srn_worker_t *worker = current_worker;
  PANIC_IF_NULL(worker);

  srn_fiber_t *self = worker->current;
  PANIC_IF_NULL(self);

  // Leave the commit and its argument on the fiber itself. The worker routine
  // picks them up after the switch, the one safe point to publish a fully
  // suspended fiber to its waker. It also stamps `SUSPENDED` at that point, so
  // the `state` never marks a fiber that is still suspending. Here the `state`
  // stays `RUNNING` and the switch carries the fiber off the os thread.
  self->park_arg    = arg;
  self->park_commit = commit;

  srn_fiber_t *loop = &worker->loop;
  srn_fiber_switch(self, loop);
}


void srn_fiber_ready(srn_fiber_t *fiber) {
  PANIC_IF_NULL(fiber);

  // The scheduler comes from the fiber, not from the calling os thread. That
  // lets the reactor wake fibers too. It is the one legitimate waker outside
  // the worker pool, since quiescence accounts for its in-flight ops. An
  // unrelated os thread must not call this: its pending wake is invisible to
  // quiescence, so the run can end before the wake arrives.
  srn_scheduler_t *sched = srn_fiber_get_scheduler_m(fiber);

  // When several wakers race on one fiber (an IO completion and a timeout
  // firing on it, say), the flip in `ready_fiber` lets exactly one of them
  // enqueue it. The rest find it no longer `SUSPENDED` and do nothing.
  ready_fiber(sched, fiber);
}


srn_fiber_t *srn_fiber_current(void) {
  // No worker on this os thread means no fiber either.
  if (current_worker == nullptr) {
    return nullptr;
  }

  // May itself be null while the worker is between fibers.
  return current_worker->current;
}


srn_fiber_t *srn_fiber_worker_loop(void) {

  PANIC_IF_NULL(current_worker);

  return &current_worker->loop;

}


/// Suspend commit used by `srn_fiber_wait_for`. `arg` is the target fiber. If
/// the target is not finished yet, `self` is pushed onto the head of the
/// target's waiter list and true is returned, meaning stay parked. If the
/// target is already DONE, nothing is linked and false is returned, so the
/// caller resumes at once. The DONE check and the insert happen together under
/// the global lock, the same lock the DONE handler holds while it detaches the
/// list. So `self` either sees the target finished and declines, or joins the
/// list before the drain and is woken by it, never lost in between.
static bool wait_for_park(srn_fiber_t *self, void *arg) {
  srn_fiber_t *target    = arg;
  srn_scheduler_t *sched = srn_fiber_get_scheduler_m(self);

  srn_mutex_lock(&sched->lock);

  const bool must_wait = target->state != SRN_FIBER_DONE;
  if (must_wait) {
    self->link      = target->waiters;
    target->waiters = self;
  }

  srn_mutex_unlock(&sched->lock);

  return must_wait;
}


srn_fiber_result_t srn_fiber_wait_for(srn_fiber_t *target) {
  PANIC_IF_NULL(target);
  PANIC_IF(target == srn_fiber_current(), "srn_fiber_wait_for: a fiber cannot wait for itself");

  // Park until the target has finished. `wait_for_park` puts the caller on the
  // target's waiter list, and the target's DONE handling in the worker routine
  // wakes it again.
  srn_fiber_suspend(wait_for_park, target);

  // The result lives in the fiber struct, which survives the target's reap.
  srn_fiber_result_t outcome = target->result;
  return outcome;
}


void srn_fiber_autoname(srn_engine_t *engine, char *dst, size_t size) {
  PANIC_IF_NULL(engine);
  PANIC_IF_NULL(dst);

  srn_worker_t *creator = current_worker;

  if (creator == nullptr) {
    // Off the pool there is no worker id. The tag is `m`, and the engine wide
    // object id keeps names unique across every creating thread.
    (void)snprintf(dst, size, "f#m:%" PRIu64, srn_allocate_object_id(engine));
    return;
  }

  // On a worker the tag carries the creating worker's id and its bumped spawn
  // count. That is provenance, not placement: the fiber may be stolen and run
  // anywhere.
  (void)snprintf(dst, size, "f#%zu:%" PRIu64, (size_t)creator->id, ++creator->spawned);
}


// Return the id of the worker the calling os thread is running. When the
// thread is running no worker, the result is `(srn_worker_id_t)-1`.
srn_worker_id_t srn_sched_current_worker_id(void) {
  // SIZE_MAX is used as the indicator that the running os thread has no
  // current worker. (size_t)-1 == SIZE_MAX.
  if (current_worker == nullptr) {
    return (srn_worker_id_t)-1;
  }

  return current_worker->id;
}


bool srn_sched_accepting_submissions(srn_scheduler_t *sched) {
  PANIC_IF_NULL(sched);

  // New IO is taken only while the scheduler is RUNNING. From DRAINING or
  // STOPPING on, the IO bridge fences submissions so fibers unwind instead of
  // parking on ops the wind-down would have to wait out.
  const srn_sched_state_t now = atomic_load(&sched->state);
  return now == SRN_SCHED_RUNNING;
}


// Rouse the parked os threads so the owner of `channel` gets to consume its
// completions. `channel` is currently ignored and every thread waiting on the
// scheduler's condition is notified.
void srn_sched_wake_worker(srn_scheduler_t *sched, size_t channel) {
  // Same contract as the other scheduler entry points: a null scheduler is a
  // caller bug and panics here instead of faulting inside the mutex call.
  PANIC_IF_NULL(sched);

  // TODO(lxsameer): Wake up the worker in charge of the given channel. instead
  // of waking all.
  UNUSED(channel);

  // Notify under the lock, like the stop and drain paths do, which pairs with
  // the park path in the worker routine.
  srn_mutex_lock(&sched->lock);
  srn_cond_notify_all(&sched->work);
  srn_mutex_unlock(&sched->lock);
}


worker
static srn_fiber_result_t worker(srn_context_t *ctx, void *arg)
Definition 03_wait_for.c:44

waiter
static srn_fiber_result_t waiter(srn_context_t *ctx, void *arg)
Definition 03_wait_for.c:51

SRN_MAX_WORKERS
#define SRN_MAX_WORKERS
The absolute worker ceiling.
Definition configuration.h:82

context.h

srn_mm_free
void srn_mm_free(srn_mm_t *mm, void *ptr)
Release a pointer previously returned by srn_mm_malloc or srn_mm_reallocate.
Definition default.c:169

srn_mm_malloc
void * srn_mm_malloc(srn_mm_t *mm, size_t size)
Generic allocations that do not participate in the block based pools.
Definition default.c:155

srn_allocate_object_id
srn_object_id_t srn_allocate_object_id(srn_engine_t *engine)
Definition engine.c:172

engine.h

srn_fiber_init_thread
void srn_fiber_init_thread(srn_fiber_t *f)
Represent the calling OS thread as the running fiber ("#0"), so the scheduler or a test can switch aw...
Definition fiber.c:153

srn_fiber_switch
void srn_fiber_switch(srn_fiber_t *from, srn_fiber_t *to)
Compiled without AddressSanitizer instrumentation, in stack-use-after-return mode ASan would place fr...
Definition fiber.c:65

srn_fiber_on_reap
void srn_fiber_on_reap(srn_fiber_t *fiber)
Call when a finished fiber is reaped, after it has switched away for the last time.
Definition fiber.c:132

fiber.h
AI Generated (🤦) Fiber subsystem overview.

srn_fiber_get_scheduler_m
#define srn_fiber_get_scheduler_m(fiber)
Definition fiber.h:163

srn_worker_id_t
size_t srn_worker_id_t
Definition fiber.h:153

FIBER_TRACEPOINT
#define FIBER_TRACEPOINT(...)
Definition fiber.h:146

srn_fiber_stack_free
void srn_fiber_stack_free(srn_fiber_stack_t stack)

SRN_FIBER_NEW
@ SRN_FIBER_NEW
Created, stack mapped, never resumed.
Definition fiber.h:236

SRN_FIBER_RUNNING
@ SRN_FIBER_RUNNING
Currently executing.
Definition fiber.h:240

SRN_FIBER_READY
@ SRN_FIBER_READY
On the run queue, eligible to run.
Definition fiber.h:238

SRN_FIBER_DONE
@ SRN_FIBER_DONE
Entry returned. The result is final.
Definition fiber.h:244

SRN_FIBER_SUSPENDED
@ SRN_FIBER_SUSPENDED
Parked off the run queue, awaits srn_fiber_ready.
Definition fiber.h:242

srn_fiber_park_fn
bool(* srn_fiber_park_fn)(srn_fiber_t *self, void *arg)
Suspend commit callback.
Definition fiber.h:266

srn_fiber_result_t
void * srn_fiber_result_t
What a fiber's entry produces, type-erased.
Definition fiber.h:161

srn_fiber_state_t
enum srn_fiber_state_e srn_fiber_state_t

interface.h
Notes:

srn_mm_immortal_allocate
#define srn_mm_immortal_allocate(mm, T)
Definition interface.h:186

srn_reactor_consume
void srn_reactor_consume(srn_reactor_t *reactor, size_t channel)
Runs on the worker loop who owns the channel.
Definition io.c:123

srn_reactor_activate
void srn_reactor_activate(srn_reactor_t *reactor, size_t nchannels, srn_reactor_notify_fn notify)
Bring the reactor up, allocate nchannels channels (one per worker) and start the reactor thread.
Definition reactor.c:401

srn_reactor_idle
bool srn_reactor_idle(srn_reactor_t *reactor)
Whether the reactor has no operations in flight.
Definition reactor.c:175

srn_reactor_channel_has_completions
bool srn_reactor_channel_has_completions(srn_reactor_t *reactor, size_t channel)
Whether channel's completion queue has unconsumed completions.
Definition reactor.c:246

reactor.h
Reactor overview.

srn_sched_register
void srn_sched_register(srn_scheduler_t *sched, srn_fiber_t *fiber)
Record a fiber in the scheduler's registry of live fibers, where it stays until it is reaped.
Definition scheduler.c:324

srn_fiber_worker_loop
srn_fiber_t * srn_fiber_worker_loop(void)
The worker's loop of the worker running on the calling os thread.
Definition scheduler.c:1101

registry_add
static void registry_add(srn_scheduler_t *sched, srn_fiber_t *fiber)
Insert at the head of the registry. Caller must hold sched->lock.
Definition scheduler.c:296

SCHED_LOG
#define SCHED_LOG(FMT,...)
Definition scheduler.c:31

ready_fiber
static void ready_fiber(srn_scheduler_t *sched, srn_fiber_t *fiber)
Wake a parked fiber by flipping SUSPENDED to READY and enqueuing it.
Definition scheduler.c:658

srn_sched_wake_worker
void srn_sched_wake_worker(srn_scheduler_t *sched, size_t channel)
Rouse parked workers so the owner of channel consumes its completions.
Definition scheduler.c:1171

srn_fiber_ready
void srn_fiber_ready(srn_fiber_t *fiber)
Mark a suspended fiber runnable again, waking it when the event it awaited occurs.
Definition scheduler.c:1083

worker_run
static void worker_run(srn_worker_t *worker)
Run the worker routine over worker on the calling os thread.
Definition scheduler.c:707

srn_sched_current_worker_id
srn_worker_id_t srn_sched_current_worker_id()
Return the id of the worker that the calling os thread is running, or SIZE_MAX when the calling threa...
Definition scheduler.c:1157

srn_fiber_wait_for
srn_fiber_result_t srn_fiber_wait_for(srn_fiber_t *target)
Block the calling fiber until target finishes, then return its result.
Definition scheduler.c:1130

srn_sched_drain
void srn_sched_drain(srn_scheduler_t *sched)
Ask a running scheduler to wind down gracefully.
Definition scheduler.c:1002

local_pop
static srn_fiber_t * local_pop(srn_worker_t *w)
Owner only.
Definition scheduler.c:480

current_worker
static _Thread_local srn_worker_t * current_worker
The worker the calling os thread is running, or null when this os thread is not running the worker ro...
Definition scheduler.c:257

srn_sched_shutdown
void srn_sched_shutdown(srn_scheduler_t *sched)
The one stop tear down of the fiber subsystem, should be called once srn_sched_run has returned.
Definition scheduler.c:333

local_push
static bool local_push(srn_worker_t *w, srn_fiber_t *fiber)
This operation is only for the owner of the ring.
Definition scheduler.c:451

registry_remove
static void registry_remove(srn_scheduler_t *sched, srn_fiber_t *fiber)
Unlink from the registry.
Definition scheduler.c:309

srn_sched_accepting_submissions
bool srn_sched_accepting_submissions(srn_scheduler_t *sched)
Whether the scheduler still accepts new IO submissions.
Definition scheduler.c:1163

push_ready
static void push_ready(srn_scheduler_t *sched, srn_fiber_t *fiber)
Put a runnable fiber on a queue, with its state already set to READY.
Definition scheduler.c:606

SRN_FIBER_LOCAL_RING_CAP
#define SRN_FIBER_LOCAL_RING_CAP
Capacity of each worker's local work-stealing deque.
Definition scheduler.c:219

srn_sched_stop
void srn_sched_stop(srn_scheduler_t *sched)
Ask a running scheduler to stop.
Definition scheduler.c:978

srn_sched_enqueue
void srn_sched_enqueue(srn_scheduler_t *sched, srn_fiber_t *fiber)
Place a fiber on a scheduler's ready queue, making it eligible to run.
Definition scheduler.c:630

announce_work
static void announce_work(srn_scheduler_t *sched)
Wake the os thread of one parked worker after a fiber has joined a queue.
Definition scheduler.c:427

srn_sched_init
srn_scheduler_t * srn_sched_init(srn_engine_t *engine)
Definition scheduler.c:263

srn_fiber_current
srn_fiber_t * srn_fiber_current(void)
The fiber currently running on this os thread, or null when the calling thread is not a worker or the...
Definition scheduler.c:1097

worker_main
static void worker_main(void *arg)
The entry an os thread starts in.
Definition scheduler.c:866

srn_sched_state_t
srn_sched_state_t
The scheduler's lifecycle as one atomic value.
Definition scheduler.c:132

SRN_SCHED_RUNNING
@ SRN_SCHED_RUNNING
Definition scheduler.c:134

SRN_SCHED_STOPPING
@ SRN_SCHED_STOPPING
Definition scheduler.c:136

SRN_SCHED_IDLE
@ SRN_SCHED_IDLE
Definition scheduler.c:133

SRN_SCHED_DRAINING
@ SRN_SCHED_DRAINING
Definition scheduler.c:135

srn_fiber_schedule
void srn_fiber_schedule(srn_fiber_t *fiber)
Schedule a NEW fiber, making it eligible to run.
Definition scheduler.c:638

global_take
static srn_fiber_t * global_take(srn_scheduler_t *sched)
Pop the head of the global queue, or null when empty.
Definition scheduler.c:585

global_enqueue
static void global_enqueue(srn_scheduler_t *sched, srn_fiber_t *fiber)
Append a fiber to the global/overflow queue.
Definition scheduler.c:559

find_work
static srn_fiber_t * find_work(srn_worker_t *w)
Find a fiber to run, the worker's own deque first, then the global queue, then a steal of one fiber f...
Definition scheduler.c:669

srn_sched_run
void srn_sched_run(srn_scheduler_t *sched, size_t nworkers)
Run the scheduler with nworkers os threads draining it, returning once the pool goes quiescent (every...
Definition scheduler.c:875

wait_for_park
static bool wait_for_park(srn_fiber_t *self, void *arg)
Add the calling fiber to the target's waiter list and stay parked, unless the target has already fini...
Definition scheduler.c:1112

srn_fiber_autoname
void srn_fiber_autoname(srn_engine_t *engine, char *dst, size_t size)
Write the autogenerated debug name for a new fiber into dst.
Definition scheduler.c:1141

srn_fiber_suspend
void srn_fiber_suspend(srn_fiber_park_fn commit, void *arg)
A suspended fiber is on no scheduler queue, and the scheduler does not track what it waits on – whoev...
Definition scheduler.c:1063

srn_fiber_yield
void srn_fiber_yield(void)
Yield cooperatively, re-enqueue the running fiber and run the next ready one.
Definition scheduler.c:1039

SCHED_TRACE
#define SCHED_TRACE(...)
Per-operation deque and queue tracing (push, pop, steal, wake).
Definition scheduler.c:40

local_steal
static srn_fiber_t * local_steal(srn_worker_t *victim)
Thief side.
Definition scheduler.c:522

srn_cond_t
Definition thread.h:54

srn_configuration_t
Every runtime knob, in one place.
Definition configuration.h:187

srn_configuration_t::fiber
srn_fiber_config_t fiber
Definition configuration.h:189

srn_engine_t
Engine is a structure to own the long living and main pieces of the compiler.
Definition engine.h:51

srn_engine_t::config
srn_configuration_t config
The runtime's tunable knobs, the single source for every configurable value (see configuration....
Definition engine.h:62

srn_engine_t::mm
srn_mm_t * mm
Memory manager.
Definition engine.h:65

srn_engine_t::reactor
srn_reactor_t * reactor
The I/O reactor, that is in charge of handling everything I/O.
Definition engine.h:78

srn_fiber_config_t::workers
size_t workers
Worker count used when a run does not specify one.
Definition configuration.h:131

srn_fiber_config_t::max_workers
size_t max_workers
Hard ceiling a requested worker count is clamped to.
Definition configuration.h:134

srn_fiber_t
Definition fiber.h:268

srn_fiber_t::name
char name[SRN_FIBER_NAME_MAX]
Debug name, the caller's choice copied at creation, or autogenerated when the caller passed none (see...
Definition fiber.h:333

srn_fiber_t::state
_Atomic srn_fiber_state_t state
The lifecycle state.
Definition fiber.h:277

srn_fiber_t::link
srn_fiber_t * link
Intrusive link threading this fiber onto one of the scheduler's singly-linked lists (the ready run qu...
Definition fiber.h:307

srn_fiber_t::waiters
srn_fiber_t * waiters
Head of the list of fibers blocked in srn_fiber_wait_for on this fiber.
Definition fiber.h:313

srn_fiber_t::park_arg
void * park_arg
Definition fiber.h:289

srn_fiber_t::result
srn_fiber_result_t result
Set when state reaches SRN_FIBER_DONE.
Definition fiber.h:283

srn_fiber_t::park_commit
srn_fiber_park_fn park_commit
While this fiber is suspending, the commit the worker routine runs once the fiber is off the stack,...
Definition fiber.h:288

srn_fiber_t::reg_prev
srn_fiber_t * reg_prev
Registry links.
Definition fiber.h:326

srn_fiber_t::reg_next
srn_fiber_t * reg_next
Definition fiber.h:327

srn_mutex_t
Definition thread.h:50

srn_scheduler_t
Definition scheduler.c:139

srn_scheduler_t::runnable
atomic_size_t runnable
Definition scheduler.c:185

srn_scheduler_t::destroyed
bool destroyed
Set once srn_sched_shutdown has torn the scheduler down.
Definition scheduler.c:211

srn_scheduler_t::engine
srn_engine_t * engine
Definition scheduler.c:140

srn_scheduler_t::nworkers
size_t nworkers
Definition scheduler.c:186

srn_scheduler_t::run_active
_Atomic bool run_active
True for the duration of an srn_sched_run call.
Definition scheduler.c:207

srn_scheduler_t::registry
srn_fiber_t * registry
Registry, head of the doubly-linked list (through reg_prev/reg_next) of every live fiber,...
Definition scheduler.c:158

srn_scheduler_t::idle
atomic_size_t idle
Definition scheduler.c:184

srn_scheduler_t::lock
srn_mutex_t lock
Global lock.
Definition scheduler.c:146

srn_scheduler_t::work
srn_cond_t work
Worker coordination.
Definition scheduler.c:183

srn_scheduler_t::ready_head
srn_fiber_t * ready_head
Global / overflow queue.
Definition scheduler.c:151

srn_scheduler_t::os_threads
srn_thread_t * os_threads
Definition scheduler.c:202

srn_scheduler_t::ready_tail
srn_fiber_t * ready_tail
Definition scheduler.c:152

srn_scheduler_t::state
_Atomic srn_sched_state_t state
Definition scheduler.c:188

srn_scheduler_t::workers
srn_worker_t * workers
srn_sched_run allocates these two arrays and srn_sched_shutdown frees them.
Definition scheduler.c:201

srn_thread_t
Definition thread.h:40

srn_worker_t
The state one os thread uses to run fibers.
Definition scheduler.c:230

srn_worker_t::top
atomic_intptr_t top
Chase-Lev deque.
Definition scheduler.c:245

srn_worker_t::current
srn_fiber_t * current
Definition scheduler.c:233

srn_worker_t::spawned
uint64_t spawned
Count of fibers this worker has created so far.
Definition scheduler.c:238

srn_worker_t::sched
srn_scheduler_t * sched
Definition scheduler.c:231

srn_worker_t::bottom
atomic_intptr_t bottom
Definition scheduler.c:246

srn_worker_t::loop
srn_fiber_t loop
Definition scheduler.c:232

srn_worker_t::id
srn_worker_id_t id
Definition scheduler.c:234

thread.h
srn_thread_t, srn_mutex_t, and srn_cond_t model the thread-level operations the runtime needs,...

srn_mutex_destroy
srn_thread_status_t srn_mutex_destroy(srn_mutex_t *m)
Release a mutex's resources.

srn_mutex_init
srn_thread_status_t srn_mutex_init(srn_mutex_t *m)

srn_thread_cpu_count
size_t srn_thread_cpu_count(void)
The number of CPUs the calling process may run threads on, at least 1.

srn_cond_destroy
srn_thread_status_t srn_cond_destroy(srn_cond_t *c)
Release a condition's resources.

srn_thread_join
srn_thread_status_t srn_thread_join(srn_thread_t *t)
Block until the thread started for t returns.

srn_mutex_unlock
srn_thread_status_t srn_mutex_unlock(srn_mutex_t *m)

SRN_THREAD_OK
@ SRN_THREAD_OK
Definition thread.h:62

srn_cond_wait
srn_thread_status_t srn_cond_wait(srn_cond_t *c, srn_mutex_t *m)
Release m, sleep until notified, then re-acquire m before returning.

srn_mutex_lock
srn_thread_status_t srn_mutex_lock(srn_mutex_t *m)

srn_cond_init
srn_thread_status_t srn_cond_init(srn_cond_t *c)

srn_cond_notify_one
srn_thread_status_t srn_cond_notify_one(srn_cond_t *c)
Wake one waiter.

srn_cond_notify_all
srn_thread_status_t srn_cond_notify_all(srn_cond_t *c)
Wake every waiter.

srn_thread_spawn
srn_thread_status_t srn_thread_spawn(srn_thread_t *t, void(*fn)(void *), void *arg)
Run fn(arg) on a new OS thread.

utils.h

PANIC_IF_NULL
#define PANIC_IF_NULL(ptr)
Definition utils.h:66

PANIC_IF
#define PANIC_IF(cond, msg)
Definition utils.h:59

UNUSED
#define UNUSED(x)
Definition utils.h:45

PANIC
#define PANIC(msg)
Definition utils.h:53