From 642d0583c95c11c2699cedf723cd2e51635ac653 Mon Sep 17 00:00:00 2001
From: David Taylor
Date: Tue, 16 Sep 2025 04:40:50 +0000
Subject: [PATCH] runtime: add runtime.Yield

---
 src/runtime/proc.go     | 144 ++++++++++++++++++++++++++++++++++++++++
 src/runtime/runtime2.go |  15 +++++
 2 files changed, 159 insertions(+)

diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index b1159870902cd3..4b0cd2ec1edabf 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -353,6 +353,106 @@ func Gosched() {
 	mcall(gosched_m)
 }
 
+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// It can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being
+// promptly scheduled.
+//
+// Yield is intended to be very low overhead, particularly in the no-op case
+// where the scheduler is not at capacity, to ensure it can be called often
+// enough in tasks wishing to yield promptly to waiting work when needed.
+// When the scheduler is busy, the yielded goroutine may be parked in a
+// waiting state until the scheduler has idle capacity again to resume it.
+//
+// A goroutine calling Yield may be parked in this yielded state for an
+// arbitrary amount of time as long as the scheduler remains busy; callers
+// should keep this in mind when deciding where to yield and where not to,
+// for example avoiding yield points at which locks contended by other work
+// may be held.
+//
+// Yield will never park if the calling goroutine is locked to an OS thread.
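+//
+// For example, a loop doing low-priority background work might yield between
+// items so that it runs only on spare capacity (an illustrative sketch; items
+// and process are placeholders, not part of this change):
+//
+//	for _, it := range items {
+//		process(it)
+//		runtime.Yield() // near-free unless the scheduler is busy
+//	}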
+//
+//go:nosplit
+func Yield() {
+	// Common/fast case: do nothing if ngqueued is zero. Doing only this check
+	// here and leaving the more detailed decisions to yield_slow keeps this
+	// wrapper inlineable (complexity cost as of writing is 70 out of the
+	// allowed 80).
+	if sched.ngqueued.Load() != 0 {
+		yield_slow()
+	}
+}
+
+// yield_slow decides how to actually yield -- hop to the back of the P's
+// local runq or park in the yieldq -- after a check of ngqueued has suggested
+// that yielding would be appreciated. It is split out so that Yield and its
+// cheap check of ngqueued remain inlineable.
+//
+// If there is work on the local runq, the cheapest option is to just hop
+// behind it in the local runq, letting it run before we pick back up. However,
+// this will thrash if the work we yield to then just yields right back to us.
+// Goroutines are not marked in any way when they yield, so we cannot directly
+// detect that the next goroutine in our local runq got there via a yield and
+// will yield back; instead we use a heuristic: if we have run for less than
+// 100us, we may be thrashing, so we park in the yieldq to let the remaining
+// local runq work drain.
+//
+// If there is no work in the local and global run queues but ngqueued got us
+// here, there is likely work on a different P's local queue: we could
+// immediately park in the yieldq to free this P to go try to steal it, but we
+// would prefer that the work currently running on that P yield to it (or
+// finish, block, or be preempted) instead of us parking this work, stealing
+// that work, and then unparking this work again.
+//
+// At the same time, we *do* want to yield -- that's why we are here -- if
+// there is work waiting for a chance to run. We can balance our preference to
+// let the other P just run its own work against not making that work wait too
+// long with a heuristic: an ideal one might use how long the work has been
+// waiting (either by changing ngqueued to record a time or by locally
+// remembering when/how many times we see it non-zero), but a simple rule that
+// uses only existing fields is to park once we have been running for 1ms. This
+// bounds how long we defer parking (to at most 1ms), and while we might park
+// immediately if we were already running for >1ms before ngqueued was set,
+// having run that long at least means the overhead of parking and unparking is
+// proportionally lower.
+//
+// If the global runq has work, we always park right away: unlike the other-P
+// local runq case, there is no P we think is better suited to running it, so
+// we should just get out of the way.
+func yield_slow() {
+	gp := getg()
+
+	running := nanotime() - gp.lastsched
+	if !runqempty(gp.m.p.ptr()) {
+		if running > 100_000 { // 100us
+			goyield()
+			return
+		}
+	} else if sched.runqsize == 0 && running < 1_000_000 { // 1ms
+		return
+	}
+
+	// Don't park while locked to an OS thread.
+	if gp.lockedm != 0 {
+		return
+	}
+
+	// Eagerly decrement ngqueued; we could leave it for findRunnable to reset
+	// the next time it finds no work, but there could be a thundering herd of
+	// yields in the meantime; since we know we are parking so that some
+	// waiting work can run, we can decrement it by one right away. This
+	// decrement races with the reset in findRunnable, so if we notice it go
+	// negative, just reset it to zero and skip yielding. That in turn races
+	// with a concurrent increment, but that's fine: it is an approximate
+	// signal anyway.
+	if sched.ngqueued.Add(-1) < 0 {
+		sched.ngqueued.Store(0)
+		return
+	}
+
+	checkTimeouts()
+
+	// traceskip=1 so stacks show runtime.Yield.
+	gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
+
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
@@ -3165,6 +3265,7 @@ func wakep() {
 	lock(&sched.lock)
 	pp, _ = pidlegetSpinning(0)
 	if pp == nil {
+		sched.ngqueued.Add(1)
 		if sched.nmspinning.Add(-1) < 0 {
 			throw("wakep: negative nmspinning")
 		}
@@ -3445,6 +3546,29 @@ top:
 		}
 	}
 
+	// We found no local work; clear the saturation signal that Yield checks.
+	sched.ngqueued.Store(0)
+
+	// As a last resort before we give up the P, try yieldq.
+	if sched.yieldqsize != 0 {
+		lock(&sched.lock)
+		bg := sched.yieldq.pop()
+		if bg != nil {
+			sched.yieldqsize--
+		}
+		unlock(&sched.lock)
+		if bg != nil {
+			// Transition from _Gwaiting (yield) to _Grunnable.
+			trace := traceAcquire()
+			casgstatus(bg, _Gwaiting, _Grunnable)
+			if trace.ok() {
+				// Match other ready paths for trace visibility.
+				trace.GoUnpark(bg, 0)
+				traceRelease(trace)
+			}
+			return bg, false, false
+		}
+	}
+
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
@@ -3509,6 +3633,10 @@ top:
 		unlock(&sched.lock)
 		return gp, false, false
 	}
+	if sched.yieldqsize != 0 {
+		// Yielded goroutines are waiting; don't stop, go back and pick one up.
+		unlock(&sched.lock)
+		goto top
+	}
 	if !mp.spinning && sched.needspinning.Load() == 1 {
 		// See "Delicate dance" comment below.
 		mp.becomeSpinning()
@@ -3666,6 +3794,7 @@ top:
 	unlock(&sched.lock)
 	if pp == nil {
 		injectglist(&list)
+		sched.ngqueued.Add(1)
 		netpollAdjustWaiters(delta)
 	} else {
 		acquirep(pp)
@@ -4889,6 +5018,7 @@ func exitsyscall0(gp *g) {
 	var locked bool
 	if pp == nil {
 		globrunqput(gp)
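+		// There is no idle P to run gp; it now waits on the global runq, so
+		// bump ngqueued to let running goroutines that call Yield notice.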
+		sched.ngqueued.Add(1)
 
 		// Below, we stoplockedm if gp is locked. globrunqput releases
 		// ownership of gp, so we must check if gp is locked prior to
@@ -7111,6 +7241,20 @@ func (q *gQueue) popList() gList {
 	return stack
 }
 
+// yield_put is the gopark unlock function for Yield. It enqueues the
+// goroutine onto the global yield queue. Returning true keeps the G parked
+// until another part of the scheduler makes it runnable again; the G remains
+// in _Gwaiting after this returns.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+	lock(&sched.lock)
+	sched.yieldq.pushBack(gp)
+	sched.yieldqsize++
+	unlock(&sched.lock)
+	return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 24e4fe4b07e3e1..fad821cf86b228 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -801,10 +801,23 @@ type schedt struct {
 	nmspinning   atomic.Int32  // See "Worker thread parking/unparking" comment in proc.go.
 	needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.
 
+	// ngqueued roughly approximates the number of goroutines waiting for
+	// scheduler capacity (incremented when an idle P is not found, e.g. during
+	// wakep). It signals scheduler exhaustion for cooperative yield decisions;
+	// it need not be exact as long as it broadly captures saturation.
+	ngqueued atomic.Int32
+
 	// Global runnable queue.
 	runq     gQueue
 	runqsize int32
 
+	// yieldq holds goroutines that voluntarily yielded because the scheduler
+	// reported capacity exhaustion. They remain logically runnable, but are
+	// parked in _Gwaiting while they "block" on spare scheduler capacity
+	// opening up. yieldq does NOT contribute to runqsize.
+	yieldq     gQueue
+	yieldqsize int32
+
 	// disable controls selective disabling of the scheduler.
 	//
 	// Use schedEnableUser to control this.
@@ -1099,6 +1112,7 @@ const (
 	waitReasonTraceProcStatus    // "trace proc status"
 	waitReasonPageTraceFlush     // "page trace flush"
 	waitReasonCoroutine          // "coroutine"
+	waitReasonYield              // "yield"
 	waitReasonGCWeakToStrongWait // "GC weak to strong wait"
 )
@@ -1140,6 +1154,7 @@ var waitReasonStrings = [...]string{
 	waitReasonTraceProcStatus:    "trace proc status",
 	waitReasonPageTraceFlush:     "page trace flush",
 	waitReasonCoroutine:          "coroutine",
+	waitReasonYield:              "yield",
 	waitReasonGCWeakToStrongWait: "GC weak to strong wait",
 }
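
---

Usage sketch (not part of the patch): a runnable illustration of the intended
pattern, in which low-priority work calls runtime.Yield between items so that
it consumes only strictly spare capacity. The busy-spinner setup and the work
loop are hypothetical placeholders for real load and real background work.

	package main

	import (
		"fmt"
		"runtime"
		"time"
	)

	func main() {
		// Saturate every P briefly so the scheduler reports exhaustion
		// (sched.ngqueued becomes non-zero) and Yield actually parks.
		stop := time.Now().Add(100 * time.Millisecond)
		for i := 0; i < runtime.GOMAXPROCS(0); i++ {
			go func() {
				for time.Now().Before(stop) {
				}
			}()
		}

		// Background work that only wants spare capacity: while the spinners
		// run, this goroutine should spend most of its time parked in yieldq,
		// resuming once the scheduler has idle capacity again.
		iters := 0
		for time.Now().Before(stop) {
			iters++ // stand-in for a small unit of low-priority work
			runtime.Yield()
		}
		fmt.Println("low-priority iterations during saturation:", iters)
	}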