runtime: let sysmon sleep in netpoll if possible

hzzb · hzzb · commit d38fc4ec00aa · 2022-03-14T19:46:14.000+08:00
Currently, the scheduler picks up next to-run g in the following
delicate order:

1 locked g.
2 trace reader g.
3 GC backgroud mark worker g.
4 global runq if schedtick%61 == 0.
5 local runq including timers.
6 findrunnable loop.
  6.1 finalizer g.
  6.2 local runq including timers.
  6.3 global runq, no schedtick checked.
  6.4 non-blocking netpoll(0) if netpollWaiters &gt; 0 and no other M polling.
  6.5 steal any timers and runnable g from other P. spinning state.
  6.6 GC idle mark worker g.
  6.7 recheck all other P's local runq, idle GC marker and timers.
      non-spinning state, P is idle.
  6.8 blocking netpoll(delay) until next earliest timer or IO ready.
      if any other M already polling, no blocking netpoll. M parked.

If found a runnable g in any step, all steps following it will be skipped.

Suppose there are many runnable gs, scheduler would almost find a g before
trying findrunnable, so netpoll left untouched. Normal Ms have no opportunity
to unpark netpoll waiters g even though IO readied. The special M sysmon polls
network if not polled for more than 10ms, which is the only opportunity to
ready any netpoll waiters g into the global runq. This causes a false positive
timeout problem.

I did a simple test to demostrate this problem. Dialing a localhost TCP
listening port costs about 10ms. This is obviously not credible. See file
runtime/sysmon_test.go for detail test code.

Normal Ms will sleep in netpoll waiting for timer and IO ready if no g
found at last. The special M sysmon simply sleeps 20us~10ms and does
a non-blocking netpoll if needed. This CL let sysmon sleep in netpoll,
do a blocking netpoll like what normal Ms do. So netpoll waiters g will
be readied into runq as soon as IO readied.

With this change, the above mentioned test shows that dialing to localhost
TCP listening port costs mostly less than 1ms. Its much more reasonable than
10ms. Detailed test results for the old and new versions are as follows:

$ ../../bin/go test -v -run TestSysmonReadyNetpollWaitersASAP
=== RUN   TestSysmonReadyNetpollWaitersASAP
    sysmon_test.go:105: dialed 85 times within 1.000029211s
    sysmon_test.go:106: timeBucket      count   percent
    sysmon_test.go:108: [ 0, 1)ms       1       1.18%
    sysmon_test.go:108: [ 1, 2)ms       0       0.00%
    sysmon_test.go:108: [ 2, 3)ms       0       0.00%
    sysmon_test.go:108: [ 3, 4)ms       0       0.00%
    sysmon_test.go:108: [ 4, 5)ms       0       0.00%
    sysmon_test.go:108: [ 5, 6)ms       0       0.00%
    sysmon_test.go:108: [ 6, 7)ms       0       0.00%
    sysmon_test.go:108: [ 7, 8)ms       0       0.00%
    sysmon_test.go:108: [ 8, 9)ms       1       1.18%
    sysmon_test.go:108: [ 9,10)ms       2       2.35%
    sysmon_test.go:108: [10,11)ms       32      37.65%
    sysmon_test.go:108: [11,12)ms       49      57.65%
--- FAIL: TestSysmonReadyNetpollWaitersASAP (1.11s)

$ ../../bin/go test -v -run TestSysmonReadyNetpollWaitersASAP
=== RUN   TestSysmonReadyNetpollWaitersASAP
    sysmon_test.go:105: dialed 2369 times within 1.000021368s
    sysmon_test.go:106: timeBucket      count   percent
    sysmon_test.go:108: [ 0, 1)ms       2209    93.25%
    sysmon_test.go:108: [ 1, 2)ms       13      0.55%
    sysmon_test.go:108: [ 2, 3)ms       33      1.39%
    sysmon_test.go:108: [ 3, 4)ms       68      2.87%
    sysmon_test.go:108: [ 4, 5)ms       8       0.34%
    sysmon_test.go:108: [ 5, 6)ms       9       0.38%
    sysmon_test.go:108: [ 6, 7)ms       6       0.25%
    sysmon_test.go:108: [ 7, 8)ms       4       0.17%
    sysmon_test.go:108: [ 8, 9)ms       7       0.30%
    sysmon_test.go:108: [ 9,10)ms       4       0.17%
    sysmon_test.go:108: [10,11)ms       5       0.21%
    sysmon_test.go:108: [11,12)ms       3       0.13%
--- PASS: TestSysmonReadyNetpollWaitersASAP (1.11s)
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
@@ -48,6 +48,10 @@ var MemclrNoHeapPointers = memclrNoHeapPointers
 
 var LockPartialOrder = lockPartialOrder
 
+var Goyield = goyield
+
+var NeedSysmonWorkaround = needSysmonWorkaround
+
 type LockRank lockRank
 
 func (l LockRank) String() string {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
@@ -5050,20 +5050,26 @@ func sysmon() {
 	checkdead()
 	unlock(&sched.lock)
 
-	lasttrace := int64(0)
-	idle := 0 // how many cycles in succession we had not wokeup somebody
-	delay := uint32(0)
+	if netpollInited == 0 {
+		netpollGenericInit()
+	}
+	tracedelay := int64(debug.schedtrace) * 1000000
+	nexttrace := int64(maxWhen) // when should we output next schedtrace. much far future if disabled.
+	if tracedelay > 0 {
+		nexttrace = nanotime()
+	}
+	idle := 0         // how many cycles in succession we had not wokeup somebody
+	delay := int64(0) // ns
 
 	for {
 		if idle == 0 { // start with 20us sleep...
-			delay = 20
+			delay = 20 * 1000
 		} else if idle > 50 { // start doubling the sleep after 1ms...
 			delay *= 2
 		}
-		if delay > 10*1000 { // up to 10ms
-			delay = 10 * 1000
+		if delay > 10*1000*1000 { // up to 10ms
+			delay = 10 * 1000 * 1000
 		}
-		usleep(delay)
 
 		// sysmon should not enter deep sleep if schedtrace is enabled so that
 		// it can print that information at the right time.
@@ -5109,7 +5115,7 @@ func sysmon() {
 				}
 				if syscallWake {
 					idle = 0
-					delay = 20
+					delay = 20 * 1000
 				}
 			}
 			unlock(&sched.lock)
@@ -5124,23 +5130,62 @@ func sysmon() {
 		if *cgo_yield != nil {
 			asmcgocall(*cgo_yield, nil)
 		}
-		// poll network if not polled for more than 10ms
-		lastpoll := int64(atomic.Load64(&sched.lastpoll))
-		if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
-			atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
-			list := netpoll(0) // non-blocking - returns list of goroutines
-			if !list.empty() {
-				// Need to decrement number of idle locked M's
-				// (pretending that one more is running) before injectglist.
-				// Otherwise it can lead to the following situation:
-				// injectglist grabs all P's but before it starts M's to run the P's,
-				// another M returns from syscall, finishes running its G,
-				// observes that there is no work to do and no other running M's
-				// and reports deadlock.
-				incidlelocked(-1)
-				injectglist(&list)
-				incidlelocked(1)
+
+		if delay < 1000*1000 || (GOOS == "netbsd" && needSysmonWorkaround) {
+			// netpoll() will convert (0, 999us] to 1ms on some platforms.
+			// to let retake() happen as often as want, using usleep if delay is less than 1ms.
+			// issue 42515 reports netbsd may sometimes miss netpoll wake-ups, so skip it.
+			usleep(uint32(delay / 1000))
+
+			// non-blocking poll if no other M polling, not polled for more than 2ms and there is any waiter.
+			lastpoll := int64(atomic.Load64(&sched.lastpoll))
+			if lastpoll != 0 && lastpoll+2*1000*1000 < now && atomic.Load(&netpollWaiters) > 0 {
+				atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
+				list := netpoll(0) // non-blocking - returns list of goroutines
+				if !list.empty() {
+					// Need to decrement number of idle locked M's
+					// (pretending that one more is running) before injectglist.
+					// Otherwise it can lead to the following situation:
+					// injectglist grabs all P's but before it starts M's to run the P's,
+					// another M returns from syscall, finishes running its G,
+					// observes that there is no work to do and no other running M's
+					// and reports deadlock.
+					incidlelocked(-1)
+					injectglist(&list)
+					incidlelocked(1)
+				}
+			}
+		} else {
+			// poll network until earliest timer, next retake or next schedtrace, may blocking.
+			sleep := delay
+			pollUntil, _ := timeSleepUntil()
+			if nexttrace < pollUntil {
+				pollUntil = nexttrace
 			}
+			if pollUntil-now < sleep {
+				sleep = pollUntil - now
+			}
+			if sleep < 0 || faketime != 0 {
+				sleep = 0
+			}
+			pollUntil = now + sleep
+
+			// sysmon pretends to be a normal M waiting for timer and IO ready.
+			// so need to decrement number of idle locked M's.
+			incidlelocked(-1)
+			if atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+				atomic.Store64(&sched.pollUntil, uint64(pollUntil))
+				list := netpoll(sleep)
+				atomic.Store64(&sched.pollUntil, 0)
+				atomic.Store64(&sched.lastpoll, uint64(nanotime()))
+				if !list.empty() {
+					injectglist(&list)
+				} else {
+					// may wake up by timer.
+					wakep()
+				}
+			}
+			incidlelocked(1)
 		}
 		if GOOS == "netbsd" && needSysmonWorkaround {
 			// netpoll is responsible for waiting for timer
@@ -5182,8 +5227,8 @@ func sysmon() {
 			injectglist(&list)
 			unlock(&forcegc.lock)
 		}
-		if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now {
-			lasttrace = now
+		if nexttrace <= now {
+			nexttrace += tracedelay
 			schedtrace(debug.scheddetail > 0)
 		}
 		unlock(&sched.sysmonlock)
diff --git a/src/runtime/sysmon_test.go b/src/runtime/sysmon_test.go
@@ -0,0 +1,134 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test netpoll waiters G's will be unparked as soon as IO readied
+// even though M's are busy with G's in local runq.
+
+package runtime_test
+
+import (
+	"net"
+	"runtime"
+	"testing"
+	"time"
+)
+
+type busysrv struct {
+	l           net.Listener
+	bucket      []int
+	bucketTotal int
+	exit        chan struct{}
+	start, end  time.Time
+}
+
+func (srv *busysrv) stop() {
+	close(srv.exit)
+}
+
+func (srv *busysrv) startListening() {
+	l, _ := net.Listen("tcp4", "localhost:0")
+	bucket := make([]int, 12)
+	exit := make(chan struct{})
+	srv.l = l
+	srv.bucket = bucket
+	srv.exit = exit
+	go func() {
+		for {
+			select {
+			case _, ok := <-exit:
+				if !ok {
+					l.Close()
+					return
+				}
+			default:
+			}
+
+			if con, _ := l.Accept(); con != nil {
+				con.Close()
+			}
+		}
+	}()
+}
+
+func (srv *busysrv) startDialing() {
+	srv.start = time.Now()
+	defer func() {
+		srv.end = time.Now()
+	}()
+	network, addr := srv.l.Addr().Network(), srv.l.Addr().String()
+	for {
+		select {
+		case _, ok := <-srv.exit:
+			if !ok {
+				return
+			}
+		default:
+		}
+
+		start := time.Now()
+		con, _ := net.Dial(network, addr)
+		ms := int(time.Since(start) / 1000000)
+		if ms >= len(srv.bucket) {
+			ms = len(srv.bucket) - 1
+		}
+		srv.bucket[ms]++
+		srv.bucketTotal++
+		if con != nil {
+			con.Close()
+		}
+	}
+}
+
+func (srv *busysrv) busy() {
+	for {
+		select {
+		case _, ok := <-srv.exit:
+			if !ok {
+				return
+			}
+		default:
+		}
+		runtime.Goyield() // simulate many runnable G's in local runq.
+	}
+}
+
+func (srv *busysrv) expect(bucket int, percent float64) bool {
+	count := 0
+	for i := 0; i < bucket && i < len(srv.bucket); i++ {
+		count += srv.bucket[i]
+	}
+	return float64(count)/float64(srv.bucketTotal)*100.0 > percent
+}
+
+func (srv *busysrv) printf(ffn func(format string, args ...interface{})) {
+	ffn("dialed %d times within %v\n", srv.bucketTotal, srv.end.Sub(srv.start))
+	ffn("timeBucket\tcount\tpercent\n")
+	for ms, cnt := range srv.bucket {
+		ffn("[%2d,%2d)ms\t%d\t%.2f%%\n", ms, ms+1, cnt, float64(cnt)/float64(srv.bucketTotal)*100.0)
+	}
+}
+
+func TestSysmonReadyNetpollWaitersASAP(t *testing.T) {
+	if runtime.GOOS == "netbsd" && runtime.NeedSysmonWorkaround {
+		t.Skip("netbsd 9.2 earlier")
+	}
+
+	srv := &busysrv{}
+	srv.startListening()
+	np := runtime.GOMAXPROCS(0)
+	for i := 0; i < np*5; i++ {
+		go srv.busy()
+	}
+	go srv.startDialing()
+
+	time.Sleep(time.Second)
+	srv.stop()
+	time.Sleep(time.Millisecond * 100)
+
+	// expect more than 80% dialings accomplished within 2ms.
+	if !srv.expect(2, 80.0) {
+		t.Fail()
+	}
+	srv.printf(t.Logf)
+}