@@ -353,6 +353,106 @@ func Gosched() {
 	mcall(gosched_m)
 }
 
+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// This can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being promptly
+// scheduled.
+//
+// Yield is intended to be very low overhead, particularly in the no-op case
+// where the scheduler is not at capacity, to ensure it can be called often
+// enough in tasks wishing to yield promptly to waiting work when needed. When
+// the scheduler is busy, the yielding goroutine can be parked in a waiting
+// state until the scheduler has idle capacity again to resume it.
+//
+// A goroutine calling Yield may be parked in a yielded state for an arbitrary
+// amount of time as long as the scheduler remains busy; callers should consider
+// this when deciding where to yield and where not to, for example while holding
+// locks that other work may contend on.
+//
+// Yield will never park if the calling goroutine is locked to an OS thread.
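+//
+// A sketch of intended use (illustrative only; process and items are
+// hypothetical): a long-running, CPU-bound loop might call Yield periodically
+// so that it only consumes otherwise-idle capacity:
+//
+//	for i, item := range items {
+//		process(item)
+//		if i%1024 == 0 {
+//			runtime.Yield()
+//		}
+//	}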
+//
+//go:nosplit
+func Yield() {
+	// Common/fast case: do nothing if ngqueued is zero. Doing only this check
+	// here and leaving more detailed decisions to yield_slow keeps this wrapper
+	// inlineable (complexity cost as of writing is 70 out of the allowed 80).
+	if sched.ngqueued.Load() != 0 {
+		yield_slow()
+	}
+}
+
+// yield_slow is called after a check of ngqueued suggests that yielding would
+// be appreciated, and decides how to actually yield (to the P's local runq vs
+// parking in the yieldq). It is split out so that Yield and its cheap check of
+// ngqueued remain inlineable.
+//
+// If there is work on the local runq, the cheapest option is to just hop behind
+// it in the local runq to let it run and then pick back up. However, this will
+// end up thrashing if the work we yield to also then yields right back. We
+// don't mark goroutines in any way when they yield, so we cannot directly
+// detect whether the next goroutine in our local runq got there via a
+// yield/will yield back, so we use a heuristic: if we ran for <100us, it is
+// possible we are thrashing, so we go park in the yieldq to let the remaining
+// local runq work drain.
+//
+// If there is no work in the local and global run queues but ngqueued got us
+// here, it is likely there is work on a different P's local queue: we could
+// immediately park in the yieldq to free this P to go try to steal, but we
+// would prefer that the work currently running on that P yield to it (or
+// finish/block/be preempted) instead of parking this work, stealing that work,
+// and then unparking this work again.
+//
+// At the same time, we *do* want to yield -- that's why we are here -- if there
+// is work waiting for a chance to run. We can balance our preference to give
+// the other P a chance to just run it against not making it wait too long with
+// a heuristic: an ideal one might use how long that work has been waiting
+// (either by changing ngqueued to be a time or by locally remembering when/how
+// many times we see it non-zero), but a simple rule that uses the existing
+// fields for now is to go park if we have been running for 1ms: this bounds how
+// long we defer parking (to at most 1ms), and while we might park immediately
+// if we were already running >1ms before ngqueued was set, at least the fact we
+// ran for 1ms means the overhead of parking and unparking may be proportionally
+// lower.
+//
+// If the global runq has work, we always park right away, as unlike the other-P
+// local runq case, there isn't a P we think is better suited to running it, so
+// we should just do it.
+func yield_slow() {
+	gp := getg()
+
+	running := nanotime() - gp.lastsched
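+	// The local runq has work: if we have been running for >100us we are
+	// unlikely to be thrashing with another yielder, so just hop behind it via
+	// goyield; otherwise fall through toward parking in the yieldq.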
+	if !runqempty(gp.m.p.ptr()) {
+		if running > 100_000 { // 100us
+			goyield()
+			return
+		}
+	} else if sched.runqsize == 0 && running < 1_000_000 { // 1ms
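+		// Local and global runqs are empty, so the queued work is presumably on
+		// another P; for up to 1ms of our own run time, prefer letting that P
+		// get to it rather than parking here (see the discussion above).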
+		return
+	}
+
+	// Don't park while locked to an OS thread.
+	if gp.lockedm != 0 {
+		return
+	}
+
+	// Eagerly decrement ngqueued; we could leave it for findRunnable to reset it
+	// next time it finds no work, but there could be a thundering herd of yields
+	// in the meantime; we know we're parking to go find _some_ work so we can
+	// decrement it by one right away. This decrement does race with the reset in
+	// findRunnable, so if we notice it go negative, just reset it and skip the
+	// yield. Of course that too races with a concurrent increment, but that's
+	// fine - it is an approximate signal anyway.
+	if sched.ngqueued.Add(-1) < 0 {
+		sched.ngqueued.Store(0)
+		return
+	}
+
+	checkTimeouts()
+
+	// traceskip=1 so stacks show runtime.Yield.
+	gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
+
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
@@ -3165,6 +3265,7 @@ func wakep() {
 	lock(&sched.lock)
 	pp, _ = pidlegetSpinning(0)
 	if pp == nil {
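+		// No idle P to wake: the newly readied work will have to wait in a
+		// runq, so note it for Yield's fast-path check.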
+		sched.ngqueued.Add(1)
 		if sched.nmspinning.Add(-1) < 0 {
 			throw("wakep: negative nmspinning")
 		}
@@ -3445,6 +3546,29 @@ top:
 		}
 	}
 
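+	// Nothing runnable was found above, so any pending "work is queued" signal
+	// is stale; clear it so Yield callers stop taking the slow path.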
+	sched.ngqueued.Store(0)
+
+	// As a last resort before we give up the P, try yieldq.
+	if sched.yieldqsize != 0 {
+		lock(&sched.lock)
+		bg := sched.yieldq.pop()
+		if bg != nil {
+			sched.yieldqsize--
+		}
+		unlock(&sched.lock)
+		if bg != nil {
+			// Transition from _Gwaiting (yield) to _Grunnable.
+			trace := traceAcquire()
+			casgstatus(bg, _Gwaiting, _Grunnable)
+			if trace.ok() {
+				// Match other ready paths for trace visibility.
+				trace.GoUnpark(bg, 0)
+				traceRelease(trace)
+			}
+			return bg, false, false
+		}
+	}
+
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
@@ -3509,6 +3633,10 @@ top:
 		unlock(&sched.lock)
 		return gp, false, false
 	}
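+	// Don't drop the P while yielded goroutines are waiting; retry from the
+	// top, where the yieldq is checked as a last resort.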
+	if sched.yieldqsize != 0 {
+		unlock(&sched.lock)
+		goto top
+	}
 	if !mp.spinning && sched.needspinning.Load() == 1 {
 		// See "Delicate dance" comment below.
 		mp.becomeSpinning()
@@ -3666,6 +3794,7 @@ top:
 		unlock(&sched.lock)
 		if pp == nil {
 			injectglist(&list)
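+			// No idle P was available to take the netpoll-readied goroutines
+			// directly; flag queued work for Yield's fast-path check.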
+			sched.ngqueued.Add(1)
 			netpollAdjustWaiters(delta)
 		} else {
 			acquirep(pp)
@@ -4889,6 +5018,7 @@ func exitsyscall0(gp *g) {
 	var locked bool
 	if pp == nil {
 		globrunqput(gp)
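+		// gp went onto the global runq with no free P to run it; flag queued
+		// work for Yield's fast-path check.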
+		sched.ngqueued.Add(1)
 
 		// Below, we stoplockedm if gp is locked. globrunqput releases
 		// ownership of gp, so we must check if gp is locked prior to
@@ -7111,6 +7241,20 @@ func (q *gQueue) popList() gList {
 	return stack
 }
 
+// yield_put is the gopark unlock function for Yield. It enqueues the goroutine
+// onto the global yield queue. Returning true keeps the G parked until another
+// part of the scheduler makes it runnable again. The G remains in _Gwaiting
+// after this returns.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+	lock(&sched.lock)
+	sched.yieldq.pushBack(gp)
+	sched.yieldqsize++
+	unlock(&sched.lock)
+	return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {