Skip to content

Commit cab7dda

Browse files
committed
feat: add metrics for total_active_file and total_inactive_file memory
The goal of this PR is to add cAdvisor metrics that expose total_active_file and total_inactive_file. Today working_set_bytes subtracts total_inactive_file in its calculation, but there are situations where exposing these metrics directly is valuable. For example, two containers sharing files in an emptyDir increase total_active_file over time. This increase is not broken out separately in the working_set memory. Exposing total_active_file and total_inactive_file to users allows them to subtract total_active_file or total_inactive_file in their alerts if they so choose. In the case of Prometheus with a Thanos sidecar, working_set can give a false sense of high memory usage: the kernel counts Thanos reading Prometheus-written files as "active_file" memory. In that situation, a user may want to exclude active_file from their ContainerLowOnMemory alert. Relates to: kubernetes/kubernetes#43916
1 parent 04006e5 commit cab7dda

File tree

13 files changed

+126
-18
lines changed

13 files changed

+126
-18
lines changed

cmd/internal/storage/bigquery/bigquery.go

+20
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ const (
5050
colMemoryUsage string = "memory_usage"
5151
// Working set size
5252
colMemoryWorkingSet string = "memory_working_set"
53+
// Total active file size
54+
colMemoryTotalActiveFile string = "memory_total_active_file"
55+
// Total inactive file size
56+
colMemoryTotalInactiveFile string = "memory_total_inactive_file"
5357
// Container page fault
5458
colMemoryContainerPgfault string = "memory_container_pgfault"
5559
// Container major page fault
@@ -133,6 +137,16 @@ func (s *bigqueryStorage) GetSchema() *bigquery.TableSchema {
133137
Name: colMemoryWorkingSet,
134138
}
135139
i++
140+
fields[i] = &bigquery.TableFieldSchema{
141+
Type: typeInteger,
142+
Name: colMemoryTotalActiveFile,
143+
}
144+
i++
145+
fields[i] = &bigquery.TableFieldSchema{
146+
Type: typeInteger,
147+
Name: colMemoryTotalInactiveFile,
148+
}
149+
i++
136150
fields[i] = &bigquery.TableFieldSchema{
137151
Type: typeInteger,
138152
Name: colMemoryContainerPgfault,
@@ -226,6 +240,12 @@ func (s *bigqueryStorage) containerStatsToRows(
226240
// Working set size
227241
row[colMemoryWorkingSet] = stats.Memory.WorkingSet
228242

243+
// Total active file size
244+
row[colMemoryTotalActiveFile] = stats.Memory.TotalActiveFile
245+
246+
// Total inactive file size
247+
row[colMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile
248+
229249
// container page fault
230250
row[colMemoryContainerPgfault] = stats.Memory.ContainerData.Pgfault
231251

cmd/internal/storage/influxdb/influxdb.go

+8
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ const (
7070
serMemoryMappedFile string = "memory_mapped_file"
7171
// Working set size
7272
serMemoryWorkingSet string = "memory_working_set"
73+
// Total active file size
74+
serMemoryTotalActiveFile string = "memory_total_active_file"
75+
// Total inactive file size
76+
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
7377
// Number of memory usage hits limits
7478
serMemoryFailcnt string = "memory_failcnt"
7579
// Cumulative count of memory allocation failures
@@ -256,6 +260,10 @@ func (s *influxdbStorage) memoryStatsToPoints(
256260
points = append(points, makePoint(serMemoryMappedFile, stats.Memory.MappedFile))
257261
// Working Set Size
258262
points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet))
263+
// Total Active File Size
264+
points = append(points, makePoint(serMemoryTotalActiveFile, stats.Memory.TotalActiveFile))
265+
// Total Inactive File Size
266+
points = append(points, makePoint(serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile))
259267
// Number of memory usage hits limits
260268
points = append(points, makePoint(serMemoryFailcnt, stats.Memory.Failcnt))
261269

cmd/internal/storage/influxdb/influxdb_test.go

+22-10
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,14 @@ func (self *influxDbTestStorageDriver) StatsEq(a, b *info.ContainerStats) bool {
7575
return false
7676
}
7777

78+
if a.Memory.TotalActiveFile != b.Memory.TotalActiveFile {
79+
return false
80+
}
81+
82+
if a.Memory.TotalInactiveFile != b.Memory.TotalInactiveFile {
83+
return false
84+
}
85+
7886
if !reflect.DeepEqual(a.Network, b.Network) {
7987
return false
8088
}
@@ -253,6 +261,8 @@ func TestContainerStatsToPoints(t *testing.T) {
253261
assertContainsPointWithValue(t, points, serMemoryMappedFile, stats.Memory.MappedFile)
254262
assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage)
255263
assertContainsPointWithValue(t, points, serMemoryWorkingSet, stats.Memory.WorkingSet)
264+
assertContainsPointWithValue(t, points, serMemoryTotalActiveFile, stats.Memory.TotalActiveFile)
265+
assertContainsPointWithValue(t, points, serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile)
256266
assertContainsPointWithValue(t, points, serMemoryFailcnt, stats.Memory.Failcnt)
257267
assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgfault)
258268
assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgmajfault)
@@ -346,16 +356,18 @@ func createTestStats() (*info.ContainerInfo, *info.ContainerStats) {
346356
LoadAverage: int32(rand.Intn(1000)),
347357
},
348358
Memory: info.MemoryStats{
349-
Usage: 26767396864,
350-
MaxUsage: 30429605888,
351-
Cache: 7837376512,
352-
RSS: 18930020352,
353-
Swap: 1024,
354-
MappedFile: 1025327104,
355-
WorkingSet: 23630012416,
356-
Failcnt: 1,
357-
ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97},
358-
HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96},
359+
Usage: 26767396864,
360+
MaxUsage: 30429605888,
361+
Cache: 7837376512,
362+
RSS: 18930020352,
363+
Swap: 1024,
364+
MappedFile: 1025327104,
365+
WorkingSet: 23630012416,
366+
TotalActiveFile: 29459246253,
367+
TotalInactiveFile: 28364536434,
368+
Failcnt: 1,
369+
ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97},
370+
HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96},
359371
},
360372
Hugetlb: map[string]info.HugetlbStats{
361373
"1GB": {Usage: 1234, MaxUsage: 5678, Failcnt: 9},

cmd/internal/storage/statsd/statsd.go

+8
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ const (
5757
serMemoryMappedFile string = "memory_mapped_file"
5858
// Working set size
5959
serMemoryWorkingSet string = "memory_working_set"
60+
// Total active file size
61+
serMemoryTotalActiveFile string = "memory_total_active_file"
62+
// Total inactive file size
63+
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
6064
// Number of memory usage hits limits
6165
serMemoryFailcnt string = "memory_failcnt"
6266
// Cumulative count of memory allocation failures
@@ -159,6 +163,10 @@ func (s *statsdStorage) memoryStatsToValues(series *map[string]uint64, stats *in
159163
(*series)[serMemoryMappedFile] = stats.Memory.MappedFile
160164
// Working Set Size
161165
(*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet
166+
// Total Active File Size
167+
(*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile
168+
// Total Inactive File Size
169+
(*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile
162170
// Number of memory usage hits limits
163171
(*series)[serMemoryFailcnt] = stats.Memory.Failcnt
164172

cmd/internal/storage/stdout/stdout.go

+8
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ const (
5959
serMemoryMappedFile string = "memory_mapped_file"
6060
// Working set size
6161
serMemoryWorkingSet string = "memory_working_set"
62+
// Total active file
63+
serMemoryTotalActiveFile string = "memory_total_active_file"
64+
// Total inactive file
65+
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
6266
// Number of memory usage hits limits
6367
serMemoryFailcnt string = "memory_failcnt"
6468
// Cumulative count of memory allocation failures
@@ -164,6 +168,10 @@ func (driver *stdoutStorage) memoryStatsToValues(series *map[string]uint64, stat
164168
(*series)[serMemoryMappedFile] = stats.Memory.MappedFile
165169
// Working Set Size
166170
(*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet
171+
// Total Active File
172+
(*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile
173+
// Total Inactive File
174+
(*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile
167175
// Number of memory usage hits limits
168176
(*series)[serMemoryFailcnt] = stats.Memory.Failcnt
169177

container/libcontainer/handler.go

+10
Original file line numberDiff line numberDiff line change
@@ -834,8 +834,18 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
834834
inactiveFileKeyName = "inactive_file"
835835
}
836836

837+
activeFileKeyName := "total_active_file"
838+
if cgroups.IsCgroup2UnifiedMode() {
839+
activeFileKeyName = "active_file"
840+
}
841+
842+
if v, ok := s.MemoryStats.Stats[activeFileKeyName]; ok {
843+
ret.Memory.TotalActiveFile = v
844+
}
845+
837846
workingSet := ret.Memory.Usage
838847
if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok {
848+
ret.Memory.TotalInactiveFile = v
839849
if workingSet < v {
840850
workingSet = 0
841851
} else {

info/v1/container.go

+8
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,14 @@ type MemoryStats struct {
393393
// Units: Bytes.
394394
WorkingSet uint64 `json:"working_set"`
395395

396+
// The total amount of active file memory.
397+
// Units: Bytes.
398+
TotalActiveFile uint64 `json:"total_active_file"`
399+
400+
// The total amount of inactive file memory.
401+
// Units: Bytes.
402+
TotalInactiveFile uint64 `json:"total_inactive_file"`
403+
396404
Failcnt uint64 `json:"failcnt"`
397405

398406
// Size of kernel memory allocated in bytes.

info/v2/conversion_test.go

+7-5
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,13 @@ func TestContainerStatsFromV1(t *testing.T) {
137137
v1Stats := v1.ContainerStats{
138138
Timestamp: timestamp,
139139
Memory: v1.MemoryStats{
140-
Usage: 1,
141-
Cache: 2,
142-
RSS: 3,
143-
WorkingSet: 4,
144-
Failcnt: 5,
140+
Usage: 1,
141+
Cache: 2,
142+
RSS: 3,
143+
WorkingSet: 4,
144+
Failcnt: 5,
145+
TotalActiveFile: 6,
146+
TotalInactiveFile: 7,
145147
ContainerData: v1.MemoryStatsMemoryData{
146148
Pgfault: 1,
147149
Pgmajfault: 2,

integration/tests/api/test_utils.go

+2
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ func checkMemoryStats(t *testing.T, stat info.MemoryStats) {
6969

7070
assert.NotEqual(0, stat.Usage, "Memory usage should not be zero")
7171
assert.NotEqual(0, stat.WorkingSet, "Memory working set should not be zero")
72+
assert.NotEqual(0, stat.TotalActiveFile, "Memory total active file should not be zero")
73+
assert.NotEqual(0, stat.TotalInactiveFile, "Memory total inactive file should not be zero")
7274
if stat.WorkingSet > stat.Usage {
7375
t.Errorf("Memory working set (%d) should be at most equal to memory usage (%d)", stat.WorkingSet, stat.Usage)
7476
}

metrics/prometheus.go

+16
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,22 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
431431
return metricValues{{value: float64(s.Memory.WorkingSet), timestamp: s.Timestamp}}
432432
},
433433
},
434+
{
435+
name: "container_memory_total_active_file_bytes",
436+
help: "Current total active file in bytes.",
437+
valueType: prometheus.GaugeValue,
438+
getValues: func(s *info.ContainerStats) metricValues {
439+
return metricValues{{value: float64(s.Memory.TotalActiveFile), timestamp: s.Timestamp}}
440+
},
441+
},
442+
{
443+
name: "container_memory_total_inactive_file_bytes",
444+
help: "Current total inactive file in bytes.",
445+
valueType: prometheus.GaugeValue,
446+
getValues: func(s *info.ContainerStats) metricValues {
447+
return metricValues{{value: float64(s.Memory.TotalInactiveFile), timestamp: s.Timestamp}}
448+
},
449+
},
434450
{
435451
name: "container_memory_failures_total",
436452
help: "Cumulative count of memory allocation failures.",

metrics/prometheus_fake.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -329,9 +329,11 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
329329
LoadAverage: 2,
330330
},
331331
Memory: info.MemoryStats{
332-
Usage: 8,
333-
MaxUsage: 8,
334-
WorkingSet: 9,
332+
Usage: 8,
333+
MaxUsage: 8,
334+
WorkingSet: 9,
335+
TotalActiveFile: 7,
336+
TotalInactiveFile: 6,
335337
ContainerData: info.MemoryStatsMemoryData{
336338
Pgfault: 10,
337339
Pgmajfault: 11,

metrics/testdata/prometheus_metrics

+6
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",container_label_foo_label="bar
180180
# HELP container_memory_swap Container swap usage in bytes.
181181
# TYPE container_memory_swap gauge
182182
container_memory_swap{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000
183+
# HELP container_memory_total_active_file_bytes Current total active file in bytes.
184+
# TYPE container_memory_total_active_file_bytes gauge
185+
container_memory_total_active_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000
186+
# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes.
187+
# TYPE container_memory_total_inactive_file_bytes gauge
188+
container_memory_total_inactive_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000
183189
# HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed
184190
# TYPE container_memory_usage_bytes gauge
185191
container_memory_usage_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000

metrics/testdata/prometheus_metrics_whitelist_filtered

+6
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",id="testcontainer",image="test
180180
# HELP container_memory_swap Container swap usage in bytes.
181181
# TYPE container_memory_swap gauge
182182
container_memory_swap{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000
183+
# HELP container_memory_total_active_file_bytes Current total active file in bytes.
184+
# TYPE container_memory_total_active_file_bytes gauge
185+
container_memory_total_active_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000
186+
# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes.
187+
# TYPE container_memory_total_inactive_file_bytes gauge
188+
container_memory_total_inactive_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000
183189
# HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed
184190
# TYPE container_memory_usage_bytes gauge
185191
container_memory_usage_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000

0 commit comments

Comments
 (0)