diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 37ed2c6fbb1310..1d8ad4a0ddcccc 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -389,6 +389,190 @@ func Gosched() {
 	mcall(gosched_m)
 }
 
+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// This can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being promptly
+// scheduled.
+//
+// Yield is intended to have very low overhead, particularly in its no-op case
+// where there is idle capacity in the scheduler and the caller does not need to
+// yield. This should allow it to be called often, such as in the body of tight
+// loops, by any tasks wishing to yield promptly to any waiting work.
+//
+// When there is waiting work, the yielding goroutine may briefly be rescheduled
+// after it, or may, in some cases, be parked in a waiting 'yield' state until
+// the scheduler next has spare capacity to resume it. Yield does not guarantee
+// fairness or starvation-prevention: once a goroutine yields, it may remain
+// parked until the scheduler next has idle capacity. This means Yield can block
+// for unbounded durations in the presence of sustained over-saturation; callers
+// are responsible for deciding where to Yield() to avoid priority inversions.
+//
+// Yield will never park if the calling goroutine is locked to an OS thread.
+func Yield() {
+	// Common/fast case: do nothing if npidle is non-zero, meaning there is an
+	// idle P and thus no reason to yield this one. Doing only this check here
+	// keeps Yield inlineable (cost ~70 of the 80 budget as of writing).
+	if sched.npidle.Load() == 0 {
+		maybeYield()
+	}
+}
+
+// maybeYield is called by Yield if npidle is zero, meaning there are no idle Ps
+// and thus there may be work to which the caller should yield. Such work could
+// be in the local runq of the caller's P, on the global runq, in the runq of
+// some other P, or even in the form of ready conns waiting to be noticed by a
+// netpoll which would then ready runnable goroutines.
+//
+// Keeping this function extremely cheap is essential: it must be cheap enough
+// that callers can call it in very tight loops, as very frequent calls ensure a
+// task wishing to yield when work is waiting will do so promptly. Checking the
+// runq of every P or calling netpoll is too expensive to do in every call, so
+// given the intent is to bound how long work may wait, such checks only need to
+// be performed after some amount of time has elapsed (e.g. 0.25ms). To minimize
+// overhead when called at a higher frequency, this elapsed time is checked with
+// an exponential backoff.
+//
+// runqs are checked directly with non-atomic reads rather than runqempty: being
+// cheap is our top priority and a microsecond of staleness is fine as long as
+// the check does not get optimized out of a calling loop body (hence noinline).
+//
+//go:noinline
+func maybeYield() {
+	gp := getg()
+
+	// Don't park while locked to an OS thread.
+	if gp.lockedm != 0 {
+		return
+	}
+
+	// If the local P's runq ring buffer/next is non-empty, yield to the waiting G.
+	if p := gp.m.p.ptr(); p.runqhead != p.runqtail || p.runnext != 0 {
+		// If there is work in the local P's runq, we can yield by just going to the
+		// back of the local P's runq via goyield: this achieves the same goal of
+		// letting waiting work run instead of us, but without parking on the global
+		// yieldq and potentially switching Ps.
+		// While that's our preferred choice, we want to avoid thrashing back and
+		// forth between multiple Yield-calling goroutines: in such a case it is
+		// better to just park one so the other stops seeing it in the queue and
+		// yielding to it. To detect and break this cycle, we put a 1 in the
+		// yieldchecks field: if the other goroutine yields right back, but is then
+		// still in this runq bringing us here again, we'll see this 1 and park
+		// instead. We can clobber yieldchecks here since we're actively yielding --
+		// we don't need the counter to decide to do so. And our sentinel will in
+		// turn be clobbered the very next time the time is put in the upper bits,
+		// which it will be when they're zero if we don't yield, so this sentinel
+		// should be relatively reliable in indicating thrashing.
+		if gp.yieldchecks == 1 {
+			yieldPark()
+			return
+		}
+		gp.yieldchecks = 1
+		// Go to the back of the local runq.
+		goyield()
+		return
+	}
+
+	// If the global runq is non-empty, park in the global yieldq right away: that
+	// is work someone needs to pick up and it might as well be our P. We could,
+	// potentially, directly claim it here and goyield or similar to try to remain
+	// on this P, but just parking and letting this P go to findRunnable avoids
+	// duplicating its logic and seems good enough.
+	if !sched.runq.empty() {
+		yieldPark()
+		return
+	}
+
+	// We didn't find anything via cheap O(1) checks of our runq or the global
+	// runq, but it is possible there are goroutines waiting in the runqs of other
+	// Ps that are not being stolen by an idle P -- the lack of idle Ps (npidle=0)
+	// is what got us here. Furthermore, given the lack of idle Ps, it is also
+	// possible that ready conns are waiting for a netpoll to notice them and
+	// ready their goroutines, i.e. work to which we should then yield. However,
+	// searching all runqs, and even more so netpoll, is too expensive for every
+	// maybeYield call: being extremely low overhead is essential to allowing
+	// Yield() to be called at a high enough frequency to make the caller respond
+	// to changing load promptly.
+	//
+	// Given our main goal here is to reduce/bound *how long* work waits, we can
+	// do the more extensive/expensive checks that search all runqs / netpoll less
+	// often, but we still need to do them often "enough". Given our goal is to
+	// bound the time that work may wait before a call to Yield detects it, the
+	// time elapsed since the last check would be a good signal, but even checking
+	// nanotime() on each call to measure this would be too expensive. Instead, we
+	// can check nanotime() with an exponential backoff using a simple counter, to
+	// ensure we avoid overly frequent time checks under higher call frequencies
+	// while still checking the time often at lower frequencies.
+	//
+	// To implement such a time-based cap with elapsed time checked on a subset of
+	// calls, we combine a call count and an elapsed-time indicator in a single
+	// uint32 on the G: its 11 lower bits store a counter while the remaining 21
+	// bits store nanos quantized to 0.25ms "epochs" by discarding the lower 18
+	// bits of an int64 nanotime() value. For counter values that reach 2^k-1
+	// after increment, we check whether the time -- quantized to 0.25ms -- has
+	// changed, and if so move on to the more thorough check for waiting work.
+	//
+	// Choosing 11 bits for the counter allows backing off to a rate of checking
+	// the clock once every 1k calls if called extremely frequently; it seems
+	// unlikely a caller would be able to call this at a frequency high enough to
+	// desire a higher backoff. The remaining 21 bits allow ~9 minutes between
+	// rollovers of the epoch: the slim chance of a false negative is quite
+	// acceptable, as if we hit it we just delay one check of the runqs by a
+	// quarter millisecond.
+	const yieldCountBits, yieldCountMask = 11, (1 << 11) - 1
+	const yieldEpochShift = 18 - yieldCountBits // only need to shift by the difference, then mask.
+	gp.yieldchecks++
+	// Exponential backoff: only check the clock when the counter has the form 2^k-1.
+	if count := gp.yieldchecks & yieldCountMask; (count & (count + 1)) == 0 {
+		prev := gp.yieldchecks &^ yieldCountMask
+		now := uint32(nanotime()>>yieldEpochShift) &^ yieldCountMask
+		if now != prev {
+			// Set yieldchecks to just the new high timestamp bits, clearing the counter.
+			gp.yieldchecks = now
+
+			// Check the runqs of all Ps; if we find anything, park to free this P to steal it.
+			for i := range allp {
+				// We don't need the extra accuracy (and cost) of runqempty here either;
+				// worst case we'll yield a check later or maybe park and unpark.
+				if allp[i].runqhead != allp[i].runqtail || allp[i].runnext != 0 {
+					yieldPark()
+					return
+				}
+			}
+
+			// Check netpoll; a ready conn is basically a runnable goroutine which we
+			// would yield to if we saw it, but the lack of idle Ps may mean nobody is
+			// checking this as often right now and there may be ready conns waiting.
+			if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+				var found bool
+				systemstack(func() {
+					if list, delta := netpoll(0); !list.empty() {
+						injectglist(&list)
+						netpollAdjustWaiters(delta)
+						found = true
+					}
+				})
+				if found {
+					goyield()
+				}
+			}
+		} else if count == yieldCountMask {
+			// Counter overflowed before the epoch changed; reset it halfway back.
+			gp.yieldchecks = prev | (yieldCountMask / 2)
+		}
+	}
+}
+
+// yieldPark parks the current goroutine in a waiting state with reason yield
+// and puts it in the yieldq queue for findRunnable. A goroutine that has to
+// park to Yield is considered "waiting" rather than "runnable" as it is blocked
+// in this state until there is strictly spare execution capacity available to
+// resume it, unlike runnable goroutines, which generally get turns to run at
+// regular intervals. A parked, yielded goroutine is more akin to one blocked on
+// a cond var or lock that will be signaled when we next detect spare capacity.
+func yieldPark() {
+	checkTimeouts()
+	gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
+
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
@@ -3546,6 +3730,23 @@ top:
 		}
 	}
 
+	// Nothing runnable, so check for yielded goroutines parked in the yieldq.
+	if !sched.yieldq.empty() {
+		lock(&sched.lock)
+		bg := sched.yieldq.pop()
+		unlock(&sched.lock)
+		if bg != nil {
+			trace := traceAcquire()
+			casgstatus(bg, _Gwaiting, _Grunnable)
+			if trace.ok() {
+				// Match other ready paths for trace visibility.
+				trace.GoUnpark(bg, 0)
+				traceRelease(trace)
+			}
+			return bg, false, false
+		}
+	}
+
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
@@ -3616,6 +3817,12 @@ top:
 		}
 		return gp, false, false
 	}
+
+	// Re-check the yieldq, this time while holding sched.lock.
+	if !sched.yieldq.empty() {
+		unlock(&sched.lock)
+		goto top
+	}
 	if !mp.spinning && sched.needspinning.Load() == 1 {
 		// See "Delicate dance" comment below.
 		mp.becomeSpinning()
@@ -7416,6 +7623,20 @@ func (q *gQueue) popList() gList {
 	return stack
 }
 
+// yield_put is the gopark unlock function for Yield. It enqueues the goroutine
+// onto the global yield queue. Returning true keeps the G parked until another
+// part of the scheduler makes it runnable again. The G remains in _Gwaiting
+// after this returns. Nothing else will find or ready this G in the interim,
+// since it is not on any runq until this puts it on the yieldq for findRunnable.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+	lock(&sched.lock)
+	sched.yieldq.pushBack(gp)
+	unlock(&sched.lock)
+	return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 3b606f62e4320c..f76fb5269a69b7 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -103,6 +103,25 @@ func TestYieldLocked(t *testing.T) {
 	<-c
 }
 
+func TestYield(t *testing.T) {
+	var wg sync.WaitGroup
+	start := make(chan struct{})
+	for i := 0; i < runtime.GOMAXPROCS(0)*2; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start
+			for j := 0; j < 1000; j++ {
+				if i%2 == 0 || j == 999 {
+					runtime.Yield()
+				}
+			}
+		}()
+	}
+	close(start)
+	wg.Wait()
+}
+
 func TestGoroutineParallelism(t *testing.T) {
 	if runtime.NumCPU() == 1 {
 		// Takes too long, too easy to deadlock, etc.
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 8cf1fad3b5340d..5b4e17877745fd 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -485,6 +485,8 @@ type g struct {
 	sleepWhen  int64         // when to sleep until
 	selectDone atomic.Uint32 // are we participating in a select and did someone win the race?
 
+	yieldchecks uint32 // a packed approximate time and count of maybeYield checks; see Yield().
+
 	// goroutineProfiled indicates the status of this goroutine's stack for the
 	// current in-progress goroutine profile
 	goroutineProfiled goroutineProfileStateHolder
@@ -797,6 +799,10 @@ type schedt struct {
 	// Global runnable queue.
 	runq gQueue
 
+	// Global background-yield queue: goroutines that voluntarily yielded
+	// while the scheduler was busy. Does NOT contribute to runqsize.
+	yieldq gQueue
+
 	// disable controls selective disabling of the scheduler.
 	//
 	// Use schedEnableUser to control this.
@@ -1094,6 +1100,7 @@ const (
 	waitReasonTraceProcStatus    // "trace proc status"
 	waitReasonPageTraceFlush     // "page trace flush"
 	waitReasonCoroutine          // "coroutine"
+	waitReasonYield              // "yield"
 	waitReasonGCWeakToStrongWait // "GC weak to strong wait"
 	waitReasonSynctestRun        // "synctest.Run"
 	waitReasonSynctestWait       // "synctest.Wait"
@@ -1144,6 +1151,7 @@ var waitReasonStrings = [...]string{
 	waitReasonTraceProcStatus:    "trace proc status",
 	waitReasonPageTraceFlush:     "page trace flush",
 	waitReasonCoroutine:          "coroutine",
+	waitReasonYield:              "yield",
 	waitReasonGCWeakToStrongWait: "GC weak to strong wait",
 	waitReasonSynctestRun:        "synctest.Run",
 	waitReasonSynctestWait:       "synctest.Wait",
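
Two illustrative sketches follow. Neither is part of the patch above; the helper names
in them (processItem, the packed/thorough variables, the loop bounds) are hypothetical
and exist only to show the intended usage and the yieldchecks arithmetic.

First, a usage sketch of how a background task might call the proposed runtime.Yield in
the body of a tight loop so that it only consumes strictly spare capacity (requires a
toolchain with this patch applied):

package main

import (
	"runtime"
	"sync"
)

// processItem stands in for one small unit of low-priority background work.
func processItem(i int) int {
	return i * i
}

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		sum := 0
		for i := 0; i < 1<<20; i++ {
			sum += processItem(i)
			// Cheap enough to call every iteration: a no-op while any P is
			// idle; yields or parks only when other runnable work is waiting.
			runtime.Yield()
		}
		_ = sum
	}()
	wg.Wait()
}

Second, a standalone sketch (outside the runtime, with the wall clock standing in for
nanotime) of the packed counter/epoch scheme maybeYield keeps in g.yieldchecks: an
11-bit call counter in the low bits, a 0.25ms-quantized epoch in the high 21 bits, and
a clock read only at counter values of the form 2^k-1:

package main

import (
	"fmt"
	"time"
)

const (
	yieldCountBits  = 11
	yieldCountMask  = (1 << yieldCountBits) - 1
	yieldEpochShift = 18 - yieldCountBits // epoch = nanos>>18, kept in the high 21 bits
)

func main() {
	var packed uint32 // mirrors g.yieldchecks
	thorough := 0
	for i := 0; i < 1_000_000; i++ {
		packed++
		count := packed & yieldCountMask
		if count&(count+1) != 0 {
			continue // not of the form 2^k-1: skip the clock read entirely
		}
		prev := packed &^ yieldCountMask
		now := uint32(time.Now().UnixNano()>>yieldEpochShift) &^ yieldCountMask
		if now != prev {
			// Epoch changed: store the new epoch with a zeroed counter; this is
			// where maybeYield would do its thorough runq/netpoll checks.
			packed = now
			thorough++
		} else if count == yieldCountMask {
			// Counter about to overflow into the epoch bits: back off halfway.
			packed = prev | yieldCountMask/2
		}
	}
	fmt.Println("thorough checks:", thorough)
}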