diff --git a/cmd/go.mod b/cmd/go.mod index 80c3e1eeb9..3403572577 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -1,8 +1,8 @@ module github.com/google/cadvisor/cmd -go 1.23.0 +go 1.24.0 -toolchain go1.24.0 +toolchain go1.24.2 // Record that the cmd module requires the cadvisor library module. // The github.com/google/cadvisor/cmd module is built using the Makefile @@ -29,6 +29,7 @@ require ( golang.org/x/oauth2 v0.30.0 google.golang.org/api v0.235.0 gopkg.in/olivere/elastic.v2 v2.0.61 + k8s.io/cri-api v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 ) diff --git a/cmd/go.sum b/cmd/go.sum index 8b257ba133..0e71057e56 100644 --- a/cmd/go.sum +++ b/cmd/go.sum @@ -417,6 +417,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.0.2 h1:kG1BFyqVHuQoVQiR1bWGnfz/fmHvvuiSPIV7rvl360E= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= +k8s.io/cri-api v0.34.1 h1:n2bU++FqqJq0CNjP/5pkOs0nIx7aNpb1Xa053TecQkM= +k8s.io/cri-api v0.34.1/go.mod h1:4qVUjidMg7/Z9YGZpqIDygbkPWkg3mkS1PvOx/kpHTE= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg= diff --git a/container/containerd/client.go b/container/containerd/client.go index 34134baf3e..fb29752cd1 100644 --- a/container/containerd/client.go +++ b/container/containerd/client.go @@ -31,6 +31,7 @@ import ( "google.golang.org/grpc/backoff" "google.golang.org/grpc/credentials/insecure" emptypb "google.golang.org/protobuf/types/known/emptypb" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/google/cadvisor/container/containerd/containers" "github.com/google/cadvisor/container/containerd/pkg/dialer" @@ -40,12 +41,14 @@ type client struct { containerService containersapi.ContainersClient taskService tasksapi.TasksClient versionService versionapi.VersionClient + runtimeService runtimeapi.RuntimeServiceClient } type ContainerdClient interface { LoadContainer(ctx context.Context, id string) (*containers.Container, error) TaskPid(ctx context.Context, id string) (uint32, error) Version(ctx context.Context) (string, error) + ContainerStats(ctx context.Context, id string) (*runtimeapi.ContainerStats, error) } var ( @@ -104,6 +107,7 @@ func Client(address, namespace string) (ContainerdClient, error) { containerService: containersapi.NewContainersClient(conn), taskService: tasksapi.NewTasksClient(conn), versionService: versionapi.NewVersionClient(conn), + runtimeService: runtimeapi.NewRuntimeServiceClient(conn), } }) return ctrdClient, retErr @@ -140,6 +144,16 @@ func (c *client) Version(ctx context.Context) (string, error) { return response.Version, nil } +func (c *client) ContainerStats(ctx context.Context, id string) (*runtimeapi.ContainerStats, error) { + resp, err := c.runtimeService.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{ + ContainerId: id, + }) + if err != nil { + return nil, err + } + return resp.Stats, nil +} + func containerFromProto(containerpb *containersapi.Container) *containers.Container { var runtime containers.RuntimeInfo // TODO: is nil check required for containerpb diff --git a/container/containerd/client_test.go b/container/containerd/client_test.go index eaaab0e44e..f978b9ff80 100644 --- a/container/containerd/client_test.go +++ b/container/containerd/client_test.go @@ -19,6 +19,7 @@ import ( "fmt" "github.com/google/cadvisor/container/containerd/containers" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" ) type containerdClientMock struct { @@ -45,6 +46,22 @@ func (c *containerdClientMock) TaskPid(ctx context.Context, id string) (uint32, return 2389, nil } +func (c *containerdClientMock) ContainerStats(ctx context.Context, id string) (*runtimeapi.ContainerStats, error) { + if c.returnErr != nil { + return nil, c.returnErr + } + // Return mock stats with filesystem usage + return &runtimeapi.ContainerStats{ + Attributes: &runtimeapi.ContainerAttributes{ + Id: id, + }, + WritableLayer: &runtimeapi.FilesystemUsage{ + UsedBytes: &runtimeapi.UInt64Value{Value: 1024 * 1024}, // 1MB + InodesUsed: &runtimeapi.UInt64Value{Value: 100}, + }, + }, nil +} + func mockcontainerdClient(cntrs map[string]*containers.Container, returnErr error) ContainerdClient { return &containerdClientMock{ cntrs: cntrs, diff --git a/container/containerd/handler.go b/container/containerd/handler.go index d7f0864a63..00f289052d 100644 --- a/container/containerd/handler.go +++ b/container/containerd/handler.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "strings" + "sync" "time" "github.com/containerd/errdefs" @@ -34,6 +35,26 @@ import ( info "github.com/google/cadvisor/info/v1" ) +// fsUsageCache caches filesystem usage data to avoid excessive disk I/O +type fsUsageCache struct { + timestamp time.Time + usedBytes uint64 + inodesUsed uint64 + cacheDuration time.Duration +} + +// newFsUsageCache creates a new filesystem usage cache with default 30s cache duration +func newFsUsageCache() *fsUsageCache { + return &fsUsageCache{ + cacheDuration: 30 * time.Second, + } +} + +// isValid checks if the cached data is still valid +func (c *fsUsageCache) isValid() bool { + return time.Since(c.timestamp) < c.cacheDuration +} + type containerdContainerHandler struct { machineInfoFactory info.MachineInfoFactory // Absolute path to the cgroup hierarchies of this container. @@ -50,6 +71,16 @@ type containerdContainerHandler struct { includedMetrics container.MetricSet libcontainerHandler *containerlibcontainer.Handler + + // Filesystem usage cache with timestamp to avoid excessive disk I/O + fsUsageCache *fsUsageCache + fsUsageCacheLock sync.RWMutex + + // Container snapshot key for filesystem usage calculation + snapshotKey string + snapshotter string + // CRI client for stats collection + client ContainerdClient } var _ container.ContainerHandler = &containerdContainerHandler{} @@ -143,6 +174,10 @@ func newContainerdContainerHandler( includedMetrics: metrics, reference: containerReference, libcontainerHandler: libcontainerHandler, + fsUsageCache: newFsUsageCache(), + snapshotKey: cntr.SnapshotKey, + snapshotter: cntr.Snapshotter, + client: client, } // Add the name and bare ID as aliases of the container. handler.image = cntr.Image @@ -171,9 +206,8 @@ func (h *containerdContainerHandler) ContainerReference() (info.ContainerReferen } func (h *containerdContainerHandler) GetSpec() (info.ContainerSpec, error) { - // TODO: Since we dont collect disk usage stats for containerd, we set hasFilesystem - // to false. Revisit when we support disk usage stats for containerd - hasFilesystem := false + // Enable filesystem stats collection for containerd containers with disk usage metrics + hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics) && h.canCollectFilesystemStats() hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics) spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem) spec.Labels = h.labels @@ -183,6 +217,92 @@ func (h *containerdContainerHandler) GetSpec() (info.ContainerSpec, error) { return spec, err } +// canCollectFilesystemStats determines if filesystem stats can be collected for this container +func (h *containerdContainerHandler) canCollectFilesystemStats() bool { + // Only collect filesystem stats for regular containers (not pause/sandbox containers) + // and when we have snapshot information + if h.labels["io.cri-containerd.kind"] == "sandbox" { + return false + } + return h.snapshotKey != "" && h.snapshotter != "" +} + +// collectFilesystemUsage collects filesystem usage statistics using CRI stats API +func (h *containerdContainerHandler) collectFilesystemUsage(stats *info.ContainerStats) error { + setStatsFromCache := func() { + stats.Filesystem = []info.FsStats{{ + Device: h.snapshotter + ":" + h.snapshotKey, + Type: "containerd-snapshotter", + Usage: h.fsUsageCache.usedBytes, + HasInodes: true, + Inodes: h.fsUsageCache.inodesUsed, + }} + } + + h.fsUsageCacheLock.RLock() + if h.fsUsageCache.isValid() { + // Use cached data + setStatsFromCache() + h.fsUsageCacheLock.RUnlock() + return nil + } + h.fsUsageCacheLock.RUnlock() + + // Cache miss or expired, collect fresh data using CRI stats API + h.fsUsageCacheLock.Lock() + defer h.fsUsageCacheLock.Unlock() + + // Double-check after acquiring write lock + if h.fsUsageCache.isValid() { + setStatsFromCache() + return nil + } + + // Get filesystem usage from CRI stats API + usedBytes, inodesUsed, err := h.getCRIFilesystemUsage() + if err != nil { + return err + } + + // Update cache + h.fsUsageCache.timestamp = time.Now() + h.fsUsageCache.usedBytes = usedBytes + h.fsUsageCache.inodesUsed = inodesUsed + + // Set filesystem stats + setStatsFromCache() + + return nil +} + +// getCRIFilesystemUsage gets filesystem usage from CRI stats API +func (h *containerdContainerHandler) getCRIFilesystemUsage() (uint64, uint64, error) { + ctx := context.Background() + containerStats, err := h.client.ContainerStats(ctx, h.reference.Id) + if err != nil { + return 0, 0, fmt.Errorf("failed to get CRI container stats: %v", err) + } + + if containerStats == nil { + return 0, 0, fmt.Errorf("container stats is nil") + } + + // Extract filesystem usage from CRI stats + var usedBytes, inodesUsed uint64 + + // Get writable layer usage (container's filesystem usage) + if containerStats.WritableLayer != nil { + if containerStats.WritableLayer.UsedBytes != nil { + usedBytes = containerStats.WritableLayer.UsedBytes.Value + } + if containerStats.WritableLayer.InodesUsed != nil { + inodesUsed = containerStats.WritableLayer.InodesUsed.Value + } + } + + return usedBytes, inodesUsed, nil +} + func (h *containerdContainerHandler) getFsStats(stats *info.ContainerStats) error { mi, err := h.machineInfoFactory.GetMachineInfo() if err != nil { @@ -192,6 +312,16 @@ func (h *containerdContainerHandler) getFsStats(stats *info.ContainerStats) erro if h.includedMetrics.Has(container.DiskIOMetrics) { common.AssignDeviceNamesToDiskStats((*common.MachineInfoNamer)(mi), &stats.DiskIo) } + + // Collect filesystem usage stats if enabled and possible + if h.includedMetrics.Has(container.DiskUsageMetrics) && h.canCollectFilesystemStats() { + if err := h.collectFilesystemUsage(stats); err != nil { + // Log error but don't fail the entire stats collection + // This maintains backward compatibility + return nil + } + } + return nil } diff --git a/go.mod b/go.mod index 24187e5b8b..5c1b2b83c9 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,8 @@ module github.com/google/cadvisor -go 1.23.0 +go 1.24.0 -toolchain go1.24.0 +toolchain go1.24.2 require ( cloud.google.com/go/compute/metadata v0.7.0 @@ -33,6 +33,7 @@ require ( golang.org/x/sys v0.33.0 google.golang.org/grpc v1.72.2 google.golang.org/protobuf v1.36.6 + k8s.io/cri-api v0.34.1 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 ) diff --git a/go.sum b/go.sum index e5f39bce04..0b7e005f2c 100644 --- a/go.sum +++ b/go.sum @@ -224,6 +224,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.0.2 h1:kG1BFyqVHuQoVQiR1bWGnfz/fmHvvuiSPIV7rvl360E= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= +k8s.io/cri-api v0.34.1 h1:n2bU++FqqJq0CNjP/5pkOs0nIx7aNpb1Xa053TecQkM= +k8s.io/cri-api v0.34.1/go.mod h1:4qVUjidMg7/Z9YGZpqIDygbkPWkg3mkS1PvOx/kpHTE= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg=