
Commit 8873e8b

VladSaiocUber authored and mknyszek committed
runtime,runtime/pprof: clean up goroutine leak profile writing
Cleaned up goroutine leak profile extraction:
- removed the acquisition of the goroutineProfile semaphore
- inlined the call to saveg when recording stacks instead of using doRecordGoroutineProfile, which had side effects on goroutineProfile fields.

Added regression tests for the goroutine leak profiling frontend for the binary and debug=1 profile formats. Added stress tests for concurrent goroutine and goroutine leak profile requests.

Change-Id: I55c1bcef11e9a7fb7699b4c5a2353e594d3e7173
GitHub-Last-Rev: 5e9eb3b
GitHub-Pull-Request: #76045
Reviewed-on: https://go-review.googlesource.com/c/go/+/714580
Reviewed-by: Michael Pratt <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Junyang Shao <[email protected]>
1 parent b8b84b7 commit 8873e8b

File tree

3 files changed (+253 lines, -16 lines)


src/runtime/mprof.go

Lines changed: 10 additions & 15 deletions

@@ -1261,7 +1261,7 @@ func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.P
 
 //go:linkname pprof_goroutineLeakProfileWithLabels
 func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	return goroutineLeakProfileWithLabelsConcurrent(p, labels)
+	return goroutineLeakProfileWithLabels(p, labels)
 }
 
 // labels may be nil. If labels is non-nil, it must have the same length as p.

@@ -1323,38 +1323,33 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, lab
 		return work.goroutineLeak.count, false
 	}
 
-	// Use the same semaphore as goroutineProfileWithLabelsConcurrent,
-	// because ultimately we still use goroutine profiles.
-	semacquire(&goroutineProfile.sema)
-
-	// Unlike in goroutineProfileWithLabelsConcurrent, we don't need to
-	// save the current goroutine stack, because it is obviously not leaked.
-
 	pcbuf := makeProfStack() // see saveg() for explanation
 
 	// Prepare a profile large enough to store all leaked goroutines.
 	n = work.goroutineLeak.count
 
 	if n > len(p) {
-		// There's not enough space in p to store the whole profile, so (per the
-		// contract of runtime.GoroutineProfile) we're not allowed to write to p
-		// at all and must return n, false.
-		semrelease(&goroutineProfile.sema)
+		// There's not enough space in p to store the whole profile, so
+		// we're not allowed to write to p at all and must return n, false.
 		return n, false
 	}
 
 	// Visit each leaked goroutine and try to record its stack.
+	var offset int
 	forEachGRace(func(gp1 *g) {
-		if readgstatus(gp1) == _Gleaked {
-			doRecordGoroutineProfile(gp1, pcbuf)
+		if readgstatus(gp1)&^_Gscan == _Gleaked {
+			systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &p[offset], pcbuf) })
+			if labels != nil {
+				labels[offset] = gp1.labels
+			}
+			offset++
 		}
 	})
 
 	if raceenabled {
 		raceacquire(unsafe.Pointer(&labelSync))
 	}
 
-	semrelease(&goroutineProfile.sema)
	return n, true
 }
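The buffer-size contract kept here is the same one documented for the public runtime.GoroutineProfile API: when the slice is too small, nothing is written and (n, false) is returned so the caller can grow the buffer and retry. A minimal caller-side sketch of that grow-and-retry pattern, using the public API rather than the runtime-internal linkname above:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Grow-and-retry loop: when ok is false, n reports how many records are
	// needed and the buffer has not been written to.
	p := make([]runtime.StackRecord, 8)
	for {
		n, ok := runtime.GoroutineProfile(p)
		if ok {
			p = p[:n]
			break
		}
		// Allow some headroom in case more goroutines start before the retry.
		p = make([]runtime.StackRecord, n+8)
	}
	fmt.Printf("captured %d goroutine records\n", len(p))
}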

src/runtime/pprof/pprof.go

Lines changed: 34 additions & 0 deletions

@@ -228,6 +228,31 @@ var mutexProfile = &Profile{
 	write: writeMutex,
 }
 
+// goroutineLeakProfileLock ensures that the goroutine leak profile writer observes the
+// leaked goroutines discovered during the goroutine leak detection GC cycle
+// that was triggered by the profile request.
+// This prevents a race condition between the garbage collector and the profile writer
+// when multiple profile requests are issued concurrently: the status of leaked goroutines
+// is reset to _Gwaiting at the beginning of a leak detection cycle, which may lead the
+// profile writer of another concurrent request to produce an incomplete profile.
+//
+// Example trace:
+//
+// G1                    | GC                          | G2
+// ----------------------+-----------------------------+---------------------
+// Request profile       | .                           | .
+// .                     | .                           | Request profile
+// .                     | [G1] Resets leaked g status | .
+// .                     | [G1] Leaks detected         | .
+// .                     | <New cycle>                 | .
+// .                     | [G2] Resets leaked g status | .
+// Write profile         | .                           | .
+// .                     | [G2] Leaks detected         | .
+// .                     | .                           | Write profile
+// ----------------------+-----------------------------+---------------------
+// Incomplete profile    |+++++++++++++++++++++++++++++| Complete profile
+var goroutineLeakProfileLock sync.Mutex
+
 func lockProfiles() {
 	profiles.mu.Lock()
 	if profiles.m == nil {

@@ -763,6 +788,15 @@ func writeGoroutine(w io.Writer, debug int) error {
 // writeGoroutineLeak first invokes a GC cycle that performs goroutine leak detection.
 // It then writes the goroutine profile, filtering for leaked goroutines.
 func writeGoroutineLeak(w io.Writer, debug int) error {
+	// Acquire the goroutine leak detection lock and release
+	// it after the goroutine leak profile is written.
+	//
+	// While the critical section is long, this is needed to prevent
+	// a race condition between the garbage collector and the goroutine
+	// leak profile writer when multiple profile requests are issued concurrently.
+	goroutineLeakProfileLock.Lock()
+	defer goroutineLeakProfileLock.Unlock()
+
 	// Run the GC with leak detection first so that leaked goroutines
 	// may transition to the leaked state.
 	runtime_goroutineLeakGC()
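The essence of the fix is that leak detection and profile writing now form a single critical section per request, so a second concurrent request cannot reset the detected goroutine state mid-write. A minimal sketch of that pattern outside the runtime (the names below are illustrative, not the pprof package's internals):

package main

import (
	"fmt"
	"io"
	"os"
	"sync"
)

// serializedWrite is a hypothetical stand-in for writeGoroutineLeak: the detect
// step and the write step share one critical section, so a concurrent request
// cannot reset the detected state before this request finishes writing.
var writerMu sync.Mutex

func serializedWrite(w io.Writer, detect func(), write func(io.Writer) error) error {
	writerMu.Lock()
	defer writerMu.Unlock()
	detect()        // e.g. trigger the leak-detecting GC cycle
	return write(w) // observe exactly the leaks found by this request's cycle
}

func main() {
	_ = serializedWrite(os.Stdout,
		func() { fmt.Println("detect: run leak-detection cycle") },
		func(w io.Writer) error {
			_, err := fmt.Fprintln(w, "write: emit profile")
			return err
		})
}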

src/runtime/pprof/pprof_test.go

Lines changed: 209 additions & 1 deletion

@@ -774,7 +774,11 @@ func TestMorestack(t *testing.T) {
 	for {
 		go func() {
 			growstack1()
-			c <- true
+			// NOTE(vsaioc): This goroutine may leak without this select.
+			select {
+			case c <- true:
+			case <-time.After(duration):
+			}
 		}()
 		select {
 		case <-t:

@@ -1565,6 +1569,210 @@ func containsCountsLabels(prof *profile.Profile, countLabels map[int64]map[strin
 	return true
 }
 
+func goroutineLeakExample() {
+	<-make(chan struct{})
+	panic("unreachable")
+}
+
+func TestGoroutineLeakProfileConcurrency(t *testing.T) {
+	const leakCount = 3
+
+	testenv.MustHaveParallelism(t)
+	regexLeakCount := regexp.MustCompile("goroutineleak profile: total ")
+	whiteSpace := regexp.MustCompile("\\s+")
+
+	// Regular goroutine profile. Used to check that there is no interference between
+	// the two profile types.
+	goroutineProf := Lookup("goroutine")
+	goroutineLeakProf := goroutineLeakProfile
+
+	// Check that a profile with debug information contains
+	includesLeak := func(t *testing.T, name, s string) {
+		if !strings.Contains(s, "runtime/pprof.goroutineLeakExample") {
+			t.Errorf("%s profile does not contain expected leaked goroutine (runtime/pprof.goroutineLeakExample): %s", name, s)
+		}
+	}
+
+	checkFrame := func(i int, j int, locations []*profile.Location, expectedFunctionName string) {
+		if len(locations) <= i {
+			t.Errorf("leaked goroutine stack locations out of range at %d of %d", i+1, len(locations))
+			return
+		}
+		location := locations[i]
+		if len(location.Line) <= j {
+			t.Errorf("leaked goroutine stack location lines out of range at %d of %d", j+1, len(location.Line))
+			return
+		}
+		if location.Line[j].Function.Name != expectedFunctionName {
+			t.Errorf("leaked goroutine stack expected %s as the location[%d].Line[%d] but found %s (%s:%d)", expectedFunctionName, i, j, location.Line[j].Function.Name, location.Line[j].Function.Filename, location.Line[j].Line)
+		}
+	}
+
+	// We use this helper to count the total number of leaked goroutines in the profile.
+	//
+	// NOTE(vsaioc): This value should match for the number of leaks produced in this test,
+	// but other tests could also leak goroutines, in which case we would have a mismatch
+	// when bulk-running tests.
+	//
+	// The two mismatching outcomes are therefore:
+	//	- More leaks than expected, which is a correctness issue with other tests.
+	//	  In this case, this test effectively checks other tests wrt
+	//	  goroutine leaks during bulk executions (e.g., running all.bash).
+	//
+	//	- Fewer leaks than expected; this is an unfortunate symptom of scheduling
+	//	  non-determinism, which may occur once in a blue moon. We make
+	//	  a best-effort attempt to allow the expected leaks to occur, by yielding
+	//	  the main thread, but it is never a guarantee.
+	countLeaks := func(t *testing.T, number int, s string) {
+		// Strip the profile header
+		parts := regexLeakCount.Split(s, -1)
+		if len(parts) < 2 {
+			t.Fatalf("goroutineleak profile does not contain 'goroutineleak profile: total ': %s\nparts: %v", s, parts)
+			return
+		}
+
+		parts = whiteSpace.Split(parts[1], -1)
+
+		count, err := strconv.ParseInt(parts[0], 10, 64)
+		if err != nil {
+			t.Fatalf("goroutineleak profile count is not a number: %s\nerror: %v", s, err)
+		}
+
+		// Check that the total number of leaked goroutines is exactly the expected number.
+		if count != int64(number) {
+			t.Errorf("goroutineleak profile does not contain exactly %d leaked goroutines: %d", number, count)
+		}
+	}
+
+	checkLeakStack := func(t *testing.T) func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+		return func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+			if pc != leakCount {
+				t.Errorf("expected %d leaked goroutines with specific stack configurations, but found %d", leakCount, pc)
+				return
+			}
+			switch len(locations) {
+			case 4:
+				// We expect a receive operation. This is the typical stack.
+				checkFrame(0, 0, locations, "runtime.gopark")
+				checkFrame(1, 0, locations, "runtime.chanrecv")
+				checkFrame(2, 0, locations, "runtime.chanrecv1")
+				switch len(locations[3].Line) {
+				case 2:
+					// Running `go func() { goroutineLeakExample() }()` will produce a stack with 2 lines.
+					// The anonymous function will have the call to goroutineLeakExample inlined.
+					checkFrame(3, 1, locations, "runtime/pprof.TestGoroutineLeakProfileConcurrency.func5")
+					fallthrough
+				case 1:
+					// Running `go goroutineLeakExample()` will produce a stack with 1 line.
+					checkFrame(3, 0, locations, "runtime/pprof.goroutineLeakExample")
+				default:
+					t.Errorf("leaked goroutine stack location expected 1 or 2 lines in the 4th location but found %d", len(locations[3].Line))
+					return
+				}
+			default:
+				message := fmt.Sprintf("leaked goroutine stack expected 4 or 5 locations but found %d", len(locations))
+				for _, location := range locations {
+					for _, line := range location.Line {
+						message += fmt.Sprintf("\n%s:%d", line.Function.Name, line.Line)
+					}
+				}
+				t.Errorf("%s", message)
+			}
+		}
+	}
+	// Leak some goroutines that will feature in the goroutine leak profile
+	for i := 0; i < leakCount; i++ {
+		go goroutineLeakExample()
+		go func() {
+			// Leak another goroutine that will feature a slightly different stack.
+			// This includes the frame runtime/pprof.TestGoroutineLeakProfileConcurrency.func1.
+			goroutineLeakExample()
+			panic("unreachable")
+		}()
+		// Yield several times to allow the goroutines to leak.
+		runtime.Gosched()
+		runtime.Gosched()
+	}
+
+	// Give all goroutines a chance to leak.
+	time.Sleep(time.Second)
+
+	t.Run("profile contains leak", func(t *testing.T) {
+		var w strings.Builder
+		goroutineLeakProf.WriteTo(&w, 0)
+		parseProfile(t, []byte(w.String()), checkLeakStack(t))
+	})
+
+	t.Run("leak persists between sequential profiling runs", func(t *testing.T) {
+		for i := 0; i < 2; i++ {
+			var w strings.Builder
+			goroutineLeakProf.WriteTo(&w, 0)
+			parseProfile(t, []byte(w.String()), checkLeakStack(t))
+		}
+	})
+
+	// Concurrent calls to the goroutine leak profiler should not trigger data races
+	// or corruption.
+	t.Run("overlapping profile requests", func(t *testing.T) {
+		ctx := context.Background()
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+
+		var wg sync.WaitGroup
+		for i := 0; i < 2; i++ {
+			wg.Add(1)
+			Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineLeakProf.WriteTo(&w, 1)
+						countLeaks(t, 2*leakCount, w.String())
+						includesLeak(t, "goroutineleak", w.String())
+					}
+				}()
+			})
+		}
+		wg.Wait()
+	})
+
+	// Concurrent calls to the goroutine leak profiler should not trigger data races
+	// or corruption, or interfere with regular goroutine profiles.
+	t.Run("overlapping goroutine and goroutine leak profile requests", func(t *testing.T) {
+		ctx := context.Background()
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+
+		var wg sync.WaitGroup
+		for i := 0; i < 2; i++ {
+			wg.Add(2)
+			Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineLeakProf.WriteTo(&w, 1)
+						countLeaks(t, 2*leakCount, w.String())
+						includesLeak(t, "goroutineleak", w.String())
+					}
+				}()
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineProf.WriteTo(&w, 1)
+						// The regular goroutine profile should see the leaked
+						// goroutines. We simply check that the goroutine leak
+						// profile does not corrupt the goroutine profile state.
+						includesLeak(t, "goroutine", w.String())
+					}
+				}()
+			})
+		}
+		wg.Wait()
+	})
+}
+
 func TestGoroutineProfileConcurrency(t *testing.T) {
 	testenv.MustHaveParallelism(t)
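For reference, outside of this test (which reaches the profile through the package-internal goroutineLeakProfile variable), a client would request the same data through the public pprof frontend. A minimal sketch, assuming the profile is registered under the name "goroutineleak", as the debug=1 header checked above suggests:

package main

import (
	"os"
	"runtime/pprof"
)

func main() {
	// Look up the goroutine leak profile by name and write it in the
	// human-readable debug=1 format. Lookup returns nil if no profile
	// with this name is registered in the running Go version.
	if p := pprof.Lookup("goroutineleak"); p != nil {
		p.WriteTo(os.Stdout, 1)
	}
}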
