Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 90 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ jobs:
runtime_container_aarch64_run: ${{ steps.base-container-gate.outputs.runtime_container_aarch64_run }}
runtime_container_aarch64_version: ${{ steps.base-container-gate.outputs.runtime_container_aarch64_version }}
proto_files_changed: ${{ steps.base-container-gate.outputs.proto_files_changed }}
core_rpc_proto_files_changed: ${{ steps.base-container-gate.outputs.core_rpc_proto_files_changed }}
source_files_changed: ${{ steps.base-container-gate.outputs.source_files_changed }}
release_build_args: ${{ steps.release-metadata.outputs.build_args }}
release_build_args_aarch64: ${{ steps.release-metadata.outputs.build_args_aarch64 }}
Expand Down Expand Up @@ -139,6 +140,8 @@ jobs:
- 'dev/docker/Dockerfile.build-artifacts-container-aarch64'
proto_files:
- '**/*.proto'
core_rpc_proto_files:
- 'crates/rpc/proto/**'
source_files:
- 'crates/**'
- 'Cargo.toml'
Expand Down Expand Up @@ -214,6 +217,7 @@ jobs:
BUILD_ARTIFACTS_X86_64_DOCKERFILE_CHANGED: ${{ steps.build-container-changes.outputs.build_artifacts_x86_64 }}
BUILD_ARTIFACTS_AARCH64_DOCKERFILE_CHANGED: ${{ steps.build-container-changes.outputs.build_artifacts_aarch64 }}
PROTO_FILES_CHANGED: ${{ steps.build-container-changes.outputs.proto_files }}
CORE_RPC_PROTO_FILES_CHANGED: ${{ steps.build-container-changes.outputs.core_rpc_proto_files }}
SOURCE_FILES_CHANGED: ${{ steps.build-container-changes.outputs.source_files }}
run: |
build_container_x86_64_run=false
Expand All @@ -223,6 +227,7 @@ jobs:
build_artifacts_container_x86_64_run=false
build_artifacts_container_aarch64_run=false
proto_files_changed=false
core_rpc_proto_files_changed=false
source_files_changed=false

if [[ "${COMMIT_MESSAGE}" =~ ci-rebuild-base-containers ]]; then
Expand All @@ -240,6 +245,10 @@ jobs:
proto_files_changed=true
fi

if [[ "${CORE_RPC_PROTO_FILES_CHANGED}" == "true" ]]; then
core_rpc_proto_files_changed=true
fi

if [[ "${SOURCE_FILES_CHANGED}" == "true" ]]; then
source_files_changed=true
fi
Expand Down Expand Up @@ -271,6 +280,7 @@ jobs:
echo "build_artifacts_container_x86_64_run=${build_artifacts_container_x86_64_run}" >> "$GITHUB_OUTPUT"
echo "build_artifacts_container_aarch64_run=${build_artifacts_container_aarch64_run}" >> "$GITHUB_OUTPUT"
echo "proto_files_changed=${proto_files_changed}" >> "$GITHUB_OUTPUT"
echo "core_rpc_proto_files_changed=${core_rpc_proto_files_changed}" >> "$GITHUB_OUTPUT"
echo "source_files_changed=${source_files_changed}" >> "$GITHUB_OUTPUT"

if [[ "$build_container_x86_64_run" == "true" ]]; then
Expand Down Expand Up @@ -468,11 +478,22 @@ jobs:
# BUILD STAGE - Release Container
# ============================================================================
build-release-container-x86_64:
if: ${{ always() && github.event_name != 'schedule' && needs.prepare.result == 'success' }}
if: >-
${{
always()
&& github.event_name != 'schedule'
&& needs.prepare.result == 'success'
&& contains('success,skipped', needs.build-container-x86_64.result)
&& contains('success,skipped', needs.build-runtime-container-x86_64.result)
&& contains('success,skipped', needs.check-rest-core-proto-sync.result)
&& contains('success,skipped', needs.lint-police.result)
}}
needs:
- prepare
- build-container-x86_64
- build-runtime-container-x86_64
- check-rest-core-proto-sync
- lint-police
uses: ./.github/workflows/docker-build.yml
with:
dockerfile_path: dev/docker/Dockerfile.release-container-x86_64
Expand All @@ -490,11 +511,22 @@ jobs:
secrets: inherit

build-release-container-aarch64:
if: ${{ always() && github.event_name != 'schedule' && needs.prepare.result == 'success' }}
if: >-
${{
always()
&& github.event_name != 'schedule'
&& needs.prepare.result == 'success'
&& contains('success,skipped', needs.build-container-aarch64.result)
&& contains('success,skipped', needs.build-runtime-container-aarch64.result)
&& contains('success,skipped', needs.check-rest-core-proto-sync.result)
&& contains('success,skipped', needs.lint-police.result)
}}
needs:
- prepare
- build-container-aarch64
- build-runtime-container-aarch64
- check-rest-core-proto-sync
- lint-police
uses: ./.github/workflows/docker-build.yml
with:
dockerfile_path: dev/docker/Dockerfile.release-container-aarch64
Expand Down Expand Up @@ -547,11 +579,22 @@ jobs:
nvcr.io/0837451325059433/carbide-dev/nvmetal-carbide-aarch64@${{ needs.build-release-container-aarch64.outputs.image_digest }}

test-release-container-services:
if: ${{ always() && github.event_name != 'schedule' && needs.prepare.result == 'success' }}
if: >-
${{
always()
&& github.event_name != 'schedule'
&& needs.prepare.result == 'success'
&& contains('success,skipped', needs.build-container-x86_64.result)
&& contains('success,skipped', needs.build-runtime-container-x86_64.result)
&& contains('success,skipped', needs.check-rest-core-proto-sync.result)
&& contains('success,skipped', needs.lint-police.result)
}}
needs:
- prepare
- build-container-x86_64
- build-runtime-container-x86_64
- check-rest-core-proto-sync
- lint-police
runs-on: linux-amd64-cpu16
steps:
- name: Checkout code
Expand Down Expand Up @@ -596,6 +639,11 @@ jobs:
cargo make test-release-container-services
'

- name: Final repository clean check
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
bash scripts/check-repo-clean.sh "release-container service tests"

# ============================================================================
# BUILD STAGE - Forge CLI (Multi-arch)
# ============================================================================
Expand Down Expand Up @@ -1008,6 +1056,41 @@ jobs:
--against 'https://github.com/${{ github.repository }}.git#branch=main,subdir=crates/rpc/proto' \
--error-format=github-actions

# Regenerates the REST protobuf outputs from the Core protos and then runs
# scripts/check-repo-clean.sh on the result.
# Runs only when the prepare job reports core_rpc_proto_files_changed == 'true'.
check-rest-core-proto-sync:
  name: Check REST Core Proto Sync
  permissions:
    contents: read
  needs:
    - prepare
  if: ${{ needs.prepare.outputs.core_rpc_proto_files_changed == 'true' }}
  runs-on: linux-amd64-cpu4
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        persist-credentials: false

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: "1.25.11"
        cache: true
        cache-dependency-path: rest-api/go.sum

    # Generator tools are installed at exact, pinned versions.
    - name: Install buf and protoc plugins
      run: |
        go install github.com/bufbuild/buf/cmd/buf@v1.70.0
        go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.36.11
        go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.6.1

    - name: Regenerate REST protos from Core protos
      run: |
        make -C rest-api core-proto
        make -C rest-api/flow gen-nicoapi-pb

    # NOTE(review): presumably fails when regeneration left uncommitted
    # changes in the working tree; confirm against scripts/check-repo-clean.sh.
    - name: Check repository is clean
      run: bash scripts/check-repo-clean.sh "REST Core protobuf sync"

lint-police:
needs:
- prepare
Expand Down Expand Up @@ -1057,6 +1140,9 @@ jobs:
- name: Check isolated package builds
run: cargo xtask check-isolated-package-builds

- name: Check repository is clean
run: bash scripts/check-repo-clean.sh "Core pre-build checks"

# ============================================================================
# BUILD STAGE - Helm Chart
# ============================================================================
Expand Down Expand Up @@ -1583,6 +1669,7 @@ jobs:
- test-release-container-services
- security-secret-scan
- lint-police
- check-rest-core-proto-sync
steps:
- name: Decide pass/fail
env:
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/rest-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
outputs:
run_rest_ci: ${{ steps.gate.outputs.run_rest_ci }}
rest_api_changed: ${{ steps.filter.outputs.rest_api }}
core_proto_changed: ${{ steps.filter.outputs.core_proto }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -38,18 +39,24 @@ jobs:
- 'rest-api/**'
- '.github/workflows/rest-*.yml'
- 'helm/rest/**'
core_proto:
- 'crates/rpc/proto/**'

- name: Decide whether REST CI should run
id: gate
env:
REF: ${{ github.ref }}
COMMIT_MESSAGE: ${{ github.event.head_commit.message || '' }}
REST_API_CHANGED: ${{ steps.filter.outputs.rest_api }}
CORE_PROTO_CHANGED: ${{ steps.filter.outputs.core_proto }}
run: |
run_rest_ci=true

if [[ "${REF}" =~ ^refs/heads/pull-request/[0-9]+$ ]]; then
run_rest_ci="${REST_API_CHANGED:-false}"
run_rest_ci=false
if [[ "${REST_API_CHANGED:-false}" == "true" || "${CORE_PROTO_CHANGED:-false}" == "true" ]]; then
run_rest_ci=true
fi
fi

if [[ "${COMMIT_MESSAGE}" =~ ci-run-complete-pipeline ]]; then
Expand Down
33 changes: 6 additions & 27 deletions .github/workflows/rest-lint-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -179,38 +179,17 @@ jobs:

- name: Install buf and protoc plugins
run: |
go install github.com/bufbuild/buf/cmd/buf@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install github.com/bufbuild/buf/cmd/buf@v1.70.0
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.36.11
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.6.1

- name: Regenerate protobuf code
run: |
cd workflow-schema && buf generate
cd flow && buf generate
make core-proto
make -C flow gen-nicoapi-pb

- name: Check for uncommitted changes
run: |
UNTRACKED=$(git ls-files --others --exclude-standard)

MEANINGFUL_DIFF=$(git diff --unified=0 | grep '^[+-]' | grep -v '^[+-][+-][+-]' | grep -v '^[+-]//.*protoc-gen-go' | grep -v '^[+-]//.*protoc v' || true)

if [ -z "$MEANINGFUL_DIFF" ] && [ -z "$UNTRACKED" ]; then
echo "✓ Protobuf generated code is up to date."
else
echo "::error::Protobuf generated code is out of date. Please run 'make core-protogen' and 'make flow-protogen' and commit the results."
echo ""
echo "Changed files:"
git status --porcelain
echo ""
echo "Diff:"
git -P diff
if [ -n "$UNTRACKED" ]; then
echo ""
echo "Untracked files:"
echo "$UNTRACKED"
fi
exit 1
fi
run: bash ../scripts/check-repo-clean.sh "REST protobuf generation"

test:
name: Test (${{ matrix.module }})
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ crates/dpf/doca-platform

# Generated plans from Cursor
.cursor/

# Generated by GitHub runners; ignored so they do not cause failures in check-repo-clean.sh
/cargo/
/.sccache/
30 changes: 30 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,9 @@ description = "Tasks that run on workspace level (not crate level) in pre-commit
category = "CI"
# clippy-flow is a default target provided by cargo-make.
# NOTE: If you add something here, and you also want it to be checked by CI, make sure to add it to the lint-police job in .github/workflows/ci.yaml too.
# REST proto generation is covered by the dedicated check-rest-core-proto-sync CI job because it needs Go/buf setup.
dependencies = [
"generate-rest-core-proto",
"clippy-flow",
"carbide-lints",
"check-format-nightly",
Expand All @@ -586,6 +588,34 @@ dependencies = [
"check-bans",
]

# Runs the two make targets that regenerate the REST-side protobuf outputs:
# `core-proto` in rest-api, then `gen-nicoapi-pb` in rest-api/flow.
# `set -eu` makes the script stop at the first failing command or unset variable.
[tasks.generate-rest-core-proto]
workspace = false
description = "Regenerate REST protobuf snapshots and Go bindings from crates/rpc/proto."
category = "Generate"
script = '''
set -eu
make -C rest-api core-proto
make -C rest-api/flow gen-nicoapi-pb
'''

# Delegates to scripts/check-repo-clean.sh under REPO_ROOT.
# NOTE(review): the string argument looks like a context label used in the
# script's output; confirm against scripts/check-repo-clean.sh.
# `set -eu` matches the sibling tasks and makes an unset REPO_ROOT fail loudly
# instead of silently resolving the script path to /scripts/check-repo-clean.sh.
[tasks.check-repo-clean]
workspace = false
description = "Fail if the repository has uncommitted generated or lockfile changes."
category = "CI"
script = '''
set -eu
bash "${REPO_ROOT}/scripts/check-repo-clean.sh" "cargo make check-repo-clean"
'''

# Local equivalent of a "regenerate, then verify" check: runs the
# generate-rest-core-proto task followed by the check-repo-clean task, each as
# a nested `cargo make --no-workspace` invocation.
# `set -eu` stops the script as soon as either nested invocation fails.
[tasks.check-rest-core-proto-sync]
workspace = false
description = "Regenerate REST Core protobuf outputs and fail if they were not committed."
category = "CI"
script = '''
set -eu
cargo make --no-workspace generate-rest-core-proto
cargo make --no-workspace check-repo-clean
'''

[tasks.clippy]
workspace = false
description = "Runs clippy code linter."
Expand Down
17 changes: 17 additions & 0 deletions docs/observability/core_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo).
<tr><td>carbide_machine_validation_completed</td><td>gauge</td><td>Count of machine validation that have completed successfully</td></tr>
<tr><td>carbide_machine_validation_failed</td><td>gauge</td><td>Count of machine validation that have failed</td></tr>
<tr><td>carbide_machine_validation_in_progress</td><td>gauge</td><td>Count of machine validation that are in progress</td></tr>
<tr><td>carbide_machine_validation_oldest_active_age_seconds</td><td>gauge</td><td>Age in seconds of the oldest active machine validation run</td></tr>
<tr><td>carbide_machine_validation_stale_runs_count</td><td>gauge</td><td>Count of active machine validation runs considered stale</td></tr>
<tr><td>carbide_machine_validation_tests</td><td>gauge</td><td>The details of machine validation tests</td></tr>
<tr><td>carbide_machines_enqueuer_iteration_latency_milliseconds</td><td>histogram</td><td>The overall time it took to enqueue state handling tasks for all carbide_machines in the system</td></tr>
<tr><td>carbide_machines_handler_latency_in_state_milliseconds</td><td>histogram</td><td>The amount of time it took to invoke the state handler for objects of type carbide_machines in a certain state</td></tr>
Expand Down Expand Up @@ -118,6 +120,7 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo).
<tr><td>carbide_site_explorer_created_power_shelves_count</td><td>gauge</td><td>The amount of Power Shelves that had been created by Site Explorer after being identified</td></tr>
<tr><td>carbide_site_explorer_enabled</td><td>gauge</td><td>Whether site-explorer is enabled (1) or paused (0)</td></tr>
<tr><td>carbide_site_explorer_iteration_latency_milliseconds</td><td>histogram</td><td>The time it took to perform one site explorer iteration</td></tr>
<tr><td>carbide_site_explorer_phase_latency_milliseconds</td><td>histogram</td><td>The time it took to perform one site explorer iteration phase</td></tr>
Comment thread
coderabbitai[bot] marked this conversation as resolved.
<tr><td>carbide_switches_enqueuer_iteration_latency_milliseconds</td><td>histogram</td><td>The overall time it took to enqueue state handling tasks for all carbide_switches in the system</td></tr>
<tr><td>carbide_switches_health_overrides_count</td><td>gauge</td><td>The amount of health overrides that are configured in the site</td></tr>
<tr><td>carbide_switches_health_status_count</td><td>gauge</td><td>The total number of Switches in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts</td></tr>
Expand All @@ -126,4 +129,18 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo).
<tr><td>carbide_switches_total</td><td>gauge</td><td>The total number of carbide_switches in the system</td></tr>
<tr><td>carbide_total_ips_count</td><td>gauge</td><td>The total number of ips in the site</td></tr>
<tr><td>carbide_unavailable_dpu_nic_firmware_update_count</td><td>gauge</td><td>The number of machines in the system that need a firmware update but are unavailable for update.</td></tr>
<tr><td>carbide_vpc_prefixes_enqueuer_iteration_latency_milliseconds</td><td>histogram</td><td>The overall time it took to enqueue state handling tasks for all carbide_vpc_prefixes in the system</td></tr>
<tr><td>carbide_vpc_prefixes_handler_latency_in_state_milliseconds</td><td>histogram</td><td>The amount of time it took to invoke the state handler for objects of type carbide_vpc_prefixes in a certain state</td></tr>
<tr><td>carbide_vpc_prefixes_iteration_latency_milliseconds</td><td>histogram</td><td>The elapsed time in the last state processor iteration to handle objects of type carbide_vpc_prefixes</td></tr>
<tr><td>carbide_vpc_prefixes_object_tasks_completed_total</td><td>counter</td><td>The amount of object handling tasks that have been completed for objects of type carbide_vpc_prefixes</td></tr>
<tr><td>carbide_vpc_prefixes_object_tasks_dispatched_total</td><td>counter</td><td>The number of times that object handling tasks have been dequeued and dispatched for processing for objects of type carbide_vpc_prefixes</td></tr>
<tr><td>carbide_vpc_prefixes_object_tasks_enqueued_total</td><td>counter</td><td>The number of times that object handling tasks have been freshly enqueued for objects of type carbide_vpc_prefixes</td></tr>
<tr><td>carbide_vpc_prefixes_object_tasks_requeued_total</td><td>counter</td><td>The amount of object handling tasks that have been requeued for objects of type carbide_vpc_prefixes</td></tr>
<tr><td>carbide_vpc_prefixes_per_state</td><td>gauge</td><td>The number of carbide_vpc_prefixes in the system with a given state</td></tr>
<tr><td>carbide_vpc_prefixes_per_state_above_sla</td><td>gauge</td><td>The number of carbide_vpc_prefixes in the system which had been longer in a state than allowed per SLA</td></tr>
<tr><td>carbide_vpc_prefixes_state_entered_total</td><td>counter</td><td>The number of times that objects of type carbide_vpc_prefixes have entered a certain state</td></tr>
<tr><td>carbide_vpc_prefixes_state_exited_total</td><td>counter</td><td>The number of times that objects of type carbide_vpc_prefixes have exited a certain state</td></tr>
<tr><td>carbide_vpc_prefixes_time_in_state_seconds</td><td>histogram</td><td>The amount of time objects of type carbide_vpc_prefixes have spent in a certain state</td></tr>
<tr><td>carbide_vpc_prefixes_total</td><td>gauge</td><td>The total number of carbide_vpc_prefixes in the system</td></tr>
<tr><td>carbide_vpc_prefixes_with_state_handling_errors_per_state</td><td>gauge</td><td>The number of carbide_vpc_prefixes in the system with a given state that failed state handling</td></tr>
</table>
7 changes: 6 additions & 1 deletion rest-api/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
.PHONY: build docker-build docker-build-local
.PHONY: test-ipam test-site-agent test-site-manager test-workflow test-db test-api test-auth test-common test-cert-manager test-site-workflow test-flow test-powershelf-manager test-nvswitch-manager migrate core-mock-server-build core-mock-server-start core-mock-server-stop flow-mock-server-build flow-mock-server-start flow-mock-server-stop
.PHONY: validate-openapi preview-openapi generate-client
.PHONY: core-proto core-proto-clean core-proto-fetch core-proto-fmt core-protogen flow-proto flow-protogen
.PHONY: pre-commit-install pre-commit-run pre-commit-update

# Build configuration
Expand Down Expand Up @@ -259,7 +260,11 @@ docker-build:
docker build -t $(IMAGE_REGISTRY)/nico-nsm:$(IMAGE_TAG) -f $(DOCKERFILE_DIR)/Dockerfile.nico-nsm .
docker build -t $(IMAGE_REGISTRY)/nico-mcp:$(IMAGE_TAG) -f $(DOCKERFILE_DIR)/Dockerfile.nico-mcp .

# Rebuild the Core proto outputs from scratch.
# The four steps run as sequential sub-makes inside the recipe rather than as
# prerequisites, so they always execute in this order (clean, fetch, format,
# generate), including under `make -j`. Listing fetch/fmt/protogen as
# prerequisites as well would run them before the clean step and then again.
core-proto:
	$(MAKE) core-proto-clean
	$(MAKE) core-proto-fetch
	$(MAKE) core-proto-fmt
	$(MAKE) core-protogen

core-proto-clean:
@echo "Cleaning up Core proto and protobuf files"
Expand Down
Loading
Loading