@@ -353,6 +353,190 @@ func Gosched() {
 	mcall(gosched_m)
 }
 
+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// This can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being
+// promptly scheduled.
+//
+// Yield is intended to have very low overhead, particularly in its no-op case
+// where there is idle capacity in the scheduler and the caller does not need
+// to yield. This should allow it to be called often, such as in the body of
+// tight loops, by any task wishing to yield promptly to any waiting work.
+//
+// When there is waiting work, the yielding goroutine may briefly be
+// rescheduled after it, or may, in some cases, be parked in a waiting 'yield'
+// state until the scheduler next has spare capacity to resume it. Yield does
+// not guarantee fairness or starvation-prevention: once a goroutine calls
+// Yield, it may remain parked until the scheduler next has idle capacity.
+// This means Yield can block for unbounded durations in the presence of
+// sustained over-saturation; callers are responsible for deciding where to
+// call Yield to avoid priority inversions.
+//
+// Yield never parks if the calling goroutine is locked to an OS thread.
+func Yield() {
+	// Common/fast case: do nothing if npidle is non-zero, meaning there is an
+	// idle P and thus no reason to yield this one. Doing only this check here
+	// keeps Yield inlineable (~70 of 80 as of writing).
+	if sched.npidle.Load() == 0 {
+		maybeYield()
+	}
+}
+
+// maybeYield is called by Yield if npidle is zero, meaning there are no idle Ps
+// and thus there may be work to which the caller should yield. Such work could
+// be on the local runq of the caller's P, on the global runq, in the runq of
+// some other P, or even in the form of ready conns waiting to be noticed by a
+// netpoll, which would then ready runnable goroutines.
+//
+// Keeping this function extremely cheap is essential: it must be cheap enough
+// that callers can call it in very tight loops, as very frequent calls ensure a
+// task wishing to yield when work is waiting will do so promptly. Checking the
+// runq of every P or calling netpoll is too expensive to do on every call, so
+// given that the intent is to bound how long work may wait, such checks only
+// need to be performed after some amount of time has elapsed (e.g. 0.25ms). To
+// minimize overhead when called at a higher frequency, this elapsed time is
+// checked with an exponential backoff.
+//
+// runqs are checked directly with non-atomic reads rather than runqempty: being
+// cheap is our top priority and a microsecond of staleness is fine as long as
+// the check does not get optimized out of a calling loop body (hence noinline).
+//
+//go:noinline
+func maybeYield() {
+	gp := getg()
+
+	// Don't park while locked to an OS thread.
+	if gp.lockedm != 0 {
+		return
+	}
+
+	// If the local P's runq ring buffer/next is non-empty, yield to the waiting G.
+	if p := gp.m.p.ptr(); p.runqhead != p.runqtail || p.runnext != 0 {
+		// If there is work in the local P's runq, we can yield by just going to the
+		// back of the local P's runq via goyield: this achieves the same goal of
+		// letting waiting work run instead of us, but without parking on the global
+		// yieldq and potentially switching Ps. While that's our preferred choice,
+		// we want to avoid thrashing back and forth between multiple Yield-calling
+		// goroutines: in such a case it is better to just park one so the other
+		// stops seeing it in the queue and yielding to it. To detect and break this
+		// cycle, we put a 1 in the yieldchecks field: if the other goroutine yields
+		// right back, but is then still in this runq bringing us here again, we'll
+		// see this 1 and park instead. We can clobber yieldchecks here since we're
+		// actively yielding -- we don't need the counter to decide to do so. And
+		// our sentinel will in turn be clobbered the next time the time is put in
+		// the upper bits, which -- since they are zero -- will happen if we do not
+		// yield, so this sentinel should be relatively reliable in indicating
+		// thrashing.
+		if gp.yieldchecks == 1 {
+			yieldPark()
+			return
+		}
+		gp.yieldchecks = 1
+		// Go to the back of the local runq.
+		goyield()
+		return
+	}
+
+	// If the global runq is non-empty, park in the global yieldq right away: that
+	// is work someone needs to pick up and it might as well be our P. We could,
+	// potentially, claim it directly here and goyield, or equivalent, to try to
+	// remain on this P, but just parking and letting this P go to findRunnable
+	// avoids duplicating its logic and seems good enough.
+	if !sched.runq.empty() {
+		yieldPark()
+		return
+	}
+
+	// We didn't find anything via cheap O(1) checks of our runq or the global
+	// runq, but it is possible there are goroutines waiting in the runqs of other
+	// Ps that are not being stolen by an idle P -- the lack of idle Ps (npidle=0)
+	// is what got us here. Furthermore, given the lack of idle Ps, it is also
+	// possible that ready conns are waiting for a netpoll to notice them and
+	// ready their goroutines, i.e. work to which we should then yield. However,
+	// searching all runqs, and even more so netpoll, is too expensive for every
+	// maybeYield call: being extremely low overhead is essential to allowing
+	// Yield() to be called at a high enough frequency to make the caller respond
+	// to changing load promptly.
+	//
+	// Given our main goal here is to reduce/bound *how long* work waits, we can
+	// do the more extensive/expensive checks of all runqs / netpoll less often,
+	// but we still need to do them often "enough". Given our goal is to bound the
+	// time that work may wait before a call to Yield detects it, the time elapsed
+	// since the last check would be a good signal, but even checking nanotime()
+	// on each call to measure this would be too expensive. Instead, we can check
+	// nanotime() with an exponential backoff using a simple counter, to ensure we
+	// avoid overly frequent time checks under higher call frequencies while still
+	// checking the time often at lower frequencies.
+	//
+	// To implement such a time-based cap with elapsed time checked on a subset of
+	// calls, we combine a call count and an elapsed-time indicator into a single
+	// uint32 on the G: its 11 lower bits store a counter while the remaining 21
+	// bits store nanos quantized to 0.25ms "epochs" by discarding the lower 18
+	// bits of an int64 nanotime() value. For counter values, after increment, of
+	// the form 2^k-1, we check whether the time -- quantized to 0.25ms -- has
+	// changed, and if so move on to the more thorough check for waiting work.
+	//
+	// Choosing 11 bits for the counter allows backing off to a rate of checking
+	// the clock once every 1k calls if called extremely frequently; it seems
+	// unlikely a caller would be able to call this at a frequency high enough to
+	// desire a higher backoff. The 21 remaining bits allow ~9mins between
+	// rollovers of the epoch: the slim chance of a false negative is quite
+	// acceptable, as if we hit it, we just delay one check of the runqs by a
+	// quarter millisecond.
+	const yieldCountBits, yieldCountMask = 11, (1 << 11) - 1
+	const yieldEpochShift = 18 - yieldCountBits // only need to shift by the difference, then mask.
+	gp.yieldchecks++
+	// Exponential backoff: only check the clock at counter values of the form 2^k-1.
+	if count := gp.yieldchecks & yieldCountMask; (count&(count+1)) == 0 {
+		prev := gp.yieldchecks &^ yieldCountMask
+		now := uint32(nanotime()>>yieldEpochShift) &^ yieldCountMask
+		if now != prev {
+			// Set yieldchecks to just the new high timestamp bits, clearing the counter.
+			gp.yieldchecks = now
+
+			// Check the runqs of all Ps; if we find anything, park to free this P to steal it.
+			for i := range allp {
+				// We don't need the extra accuracy (and cost) of runqempty here either;
+				// worst case we yield one check later or needlessly park and unpark.
+				if allp[i].runqhead != allp[i].runqtail || allp[i].runnext != 0 {
+					yieldPark()
+					return
+				}
+			}
+
+			// Check netpoll; a ready conn is basically a runnable goroutine which we
+			// would yield to if we saw it, but the lack of idle Ps may mean nobody is
+			// checking this as often right now and there may be ready conns waiting.
+			if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+				var found bool
+				systemstack(func() {
+					if list, delta := netpoll(0); !list.empty() {
+						injectglist(&list)
+						netpollAdjustWaiters(delta)
+						found = true
+					}
+				})
+				if found {
+					goyield()
+				}
+			}
+		} else if count == yieldCountMask {
+			// Counter overflowed before the time advanced; reset it halfway back.
+			gp.yieldchecks = prev | (yieldCountMask / 2)
+		}
+	}
+}
+
+// yieldPark parks the current goroutine in a waiting state with reason yield
+// and puts it on the yieldq queue for findRunnable. A goroutine that has to
+// park to Yield is considered "waiting" rather than "runnable" as it is blocked
+// in this state until there is strictly spare execution capacity available to
+// resume it, unlike runnable goroutines, which generally take turns running at
+// regular intervals. A parked, yielded goroutine is more like one blocked on a
+// cond var or lock that will be signaled when we next detect spare capacity.
+func yieldPark() {
+	checkTimeouts()
+	gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
539+ 
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
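The Yield doc comment in the hunk above describes calling it from the body of tight loops of best-effort work. A minimal usage sketch follows; it assumes this patch is applied so that runtime.Yield is exported as shown, and the checksum workload is purely illustrative.

package main

import (
	"hash/crc32"
	"runtime"
)

// checksumAll is best-effort work that only wants strictly spare capacity.
// Calling runtime.Yield in the tight loop is a near-free no-op while an idle P
// exists; otherwise it briefly reschedules behind, or parks until, waiting work.
func checksumAll(chunks [][]byte) uint32 {
	var sum uint32
	for _, c := range chunks {
		runtime.Yield() // assumed to exist per this patch
		sum ^= crc32.ChecksumIEEE(c)
	}
	return sum
}

func main() {
	chunks := make([][]byte, 256)
	for i := range chunks {
		chunks[i] = make([]byte, 64<<10)
	}
	_ = checksumAll(chunks)
}
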
@@ -3445,6 +3629,23 @@ top:
 		}
 	}
 
+	// Nothing runnable, so check for yielded goroutines parked in yieldq.
+	if !sched.yieldq.empty() {
+		lock(&sched.lock)
+		bg := sched.yieldq.pop()
+		unlock(&sched.lock)
+		if bg != nil {
+			trace := traceAcquire()
+			casgstatus(bg, _Gwaiting, _Grunnable)
+			if trace.ok() {
+				// Match other ready paths for trace visibility.
+				trace.GoUnpark(bg, 0)
+				traceRelease(trace)
+			}
+			return bg, false, false
+		}
+	}
+
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
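As an aside on the yieldchecks scheme from the previous hunk (an 11-bit call counter in the low bits of a uint32, a ~0.25ms-quantized timestamp in the high bits, the clock read only at counter values of the form 2^k-1): below is a standalone sketch of that packing technique. The helper names are hypothetical and it is not part of this change.

package main

import (
	"fmt"
	"time"
)

const (
	countBits  = 11
	countMask  = (1 << countBits) - 1
	epochShift = 18 - countBits // quantizes nanos to 2^18ns (~0.26ms) epochs
)

// shouldRecheck bumps the packed counter and reports whether an expensive
// check should run now: only at counter values of the form 2^k-1, and only if
// the quantized epoch has advanced past the one stored in the high bits.
func shouldRecheck(state *uint32, nowNanos int64) bool {
	*state++
	count := *state & countMask
	if count&(count+1) != 0 {
		return false // back off: not of the form 2^k-1, skip even the clock read
	}
	prev := *state &^ countMask
	now := uint32(nowNanos>>epochShift) &^ countMask
	if now != prev {
		*state = now // record the new epoch, clearing the counter
		return true
	}
	if count == countMask {
		*state = prev | (countMask / 2) // keep the counter from overflowing into the epoch bits
	}
	return false
}

func main() {
	var state uint32
	start := time.Now()
	checks := 0
	for i := 0; i < 1_000_000; i++ {
		if shouldRecheck(&state, time.Since(start).Nanoseconds()) {
			checks++
		}
	}
	fmt.Println("expensive checks performed:", checks)
}
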
@@ -3509,6 +3710,12 @@ top:
 		unlock(&sched.lock)
 		return gp, false, false
 	}
+
+	// Re-check the yieldq, this time while holding sched.lock.
+	if !sched.yieldq.empty() {
+		unlock(&sched.lock)
+		goto top
+	}
 	if !mp.spinning && sched.needspinning.Load() == 1 {
 		// See "Delicate dance" comment below.
 		mp.becomeSpinning()
@@ -7111,6 +7318,20 @@ func (q *gQueue) popList() gList {
 	return stack
 }
 
+// yield_put is the gopark unlock function for Yield. It enqueues the goroutine
+// onto the global yield queue. Returning true keeps the G parked until another
+// part of the scheduler makes it runnable again. The G remains in _Gwaiting
+// after this returns. Nothing else will find/ready this G in the interim, since
+// it isn't on a runq; it becomes visible only once we put it on the yieldq for
+// findRunnable to find.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+	lock(&sched.lock)
+	sched.yieldq.pushBack(gp)
+	unlock(&sched.lock)
+	return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {
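
Finally, one way to observe the park-until-idle behavior that the Yield doc comment and the findRunnable changes above describe: saturate every P with busy goroutines, let one goroutine Yield in a loop, and compare its progress before and after the load stops. This is an illustrative sketch that assumes this patch (runtime.Yield, waitReasonYield, and the yieldq); exact counts will vary by machine.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"time"
)

func main() {
	var stop, progress atomic.Int64

	// Saturate all Ps with spinning goroutines so npidle stays at zero.
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for stop.Load() == 0 {
			}
		}()
	}

	// A best-effort goroutine: while every P is busy, its Yield calls should
	// park it on the yieldq, so it makes little progress until the load stops.
	go func() {
		for {
			runtime.Yield()
			progress.Add(1)
		}
	}()

	time.Sleep(100 * time.Millisecond)
	fmt.Println("progress while saturated:   ", progress.Load())

	stop.Store(1) // the spinners exit, Ps go idle, and the yielded goroutine resumes
	time.Sleep(100 * time.Millisecond)
	fmt.Println("progress with idle capacity:", progress.Load())
}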