Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions cvs/monitors/metrics_exp/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# AMD GPU Fleet Monitoring - Environment Configuration
# Copy this file to .env and customize values.
# The real .env holds credentials: keep it out of version control and never
# commit real secrets back into this example file.

# ============================================
# PostgreSQL Configuration
# ============================================
# Database name, user and password.
# POSTGRES_PASSWORD below is a placeholder - replace it before first start.
POSTGRES_DB=fleet_monitor
POSTGRES_USER=fleet
POSTGRES_PASSWORD=change_this_password

# ============================================
# Prometheus Configuration
# ============================================
# Data retention settings (a time limit and a size limit).
# NOTE(review): presumably forwarded to Prometheus' --storage.tsdb.retention.time
# and --storage.tsdb.retention.size flags - confirm where these are consumed.
PROMETHEUS_RETENTION_TIME=15d
PROMETHEUS_RETENTION_SIZE=50GB

# ============================================
# Grafana Configuration
# ============================================
# Admin login; the password below is a placeholder - replace it before first start.
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=change_this_password
# URL under which Grafana is reached; adjust host/port for non-local deployments.
GRAFANA_ROOT_URL=http://localhost:30030
# Generate API key from Grafana UI: Configuration > API Keys
# Empty in this template; fill in once a key has been generated.
# NOTE(review): newer Grafana releases replace API keys with service account
# tokens - confirm against the Grafana version actually deployed.
GRAFANA_API_KEY=

# ============================================
# Fleet Manager Configuration
# ============================================
# Secret key for JWT tokens and encryption (change in production!)
# Use a long random value, e.g. the output of: openssl rand -hex 32
SECRET_KEY=your-secret-key-change-in-production

# ============================================
# Loki Configuration (via loki-config.yml)
# ============================================
# Loki has no variables in this file; it is configured through its own YAML file.
# Default retention: 168h (7 days)
# Modify server/config/loki/loki-config.yml for custom retention
72 changes: 72 additions & 0 deletions cvs/monitors/metrics_exp/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Environment files (hold real credentials; .env.example is the tracked template)
.env
*.env.local

# Python: bytecode, compiled extensions and virtual environments
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
venv/
env/
.venv/

# Node.js: installed packages, build output, npm cache and log files
node_modules/
dist/
.npm
*.log

# IDE / editor settings, swap files and backup files
.idea/
.vscode/
*.swp
*.swo
*~

# Docker
.docker/

# Secrets: private keys and SSH material must never be committed
*.pem
*.key
id_rsa*
ssh_keys/

# Database files
*.db
*.sqlite

# Volumes (when running locally)
prometheus_data/
loki_data/
grafana_data/
postgres_data/

# Auto-generated target files (contain real IPs/hostnames)
# The .gitkeep exception keeps the otherwise empty targets directory tracked.
server/config/prometheus/targets/*.json
!server/config/prometheus/targets/.gitkeep

# Environment-specific scripts
nuke-and-rebuild.sh

# OS-generated metadata files
.DS_Store
Thumbs.db

# Build artifacts (Python packaging)
*.egg-info/
build/
*.egg

# Testing: coverage data, HTML coverage reports and the pytest cache
.coverage
htmlcov/
.pytest_cache/

# Temporary files
*.tmp
*.temp
tmp/
temp/
60 changes: 60 additions & 0 deletions cvs/monitors/metrics_exp/Dockerfile.fleet-manager
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# AMD GPU Fleet Manager - Docker Image
# Multi-stage build: Node for frontend, Python for backend

# Stage 1: Build React frontend
# This stage only produces the bundle; the final stage copies /app/ui/dist
# out of it, so Node.js and node_modules are not part of the runtime image.
FROM node:20-alpine AS frontend-builder

WORKDIR /app/ui

# Copy only the dependency manifests first so the install layer below stays
# cached until they change. The trailing "*" makes package-lock.json optional.
COPY server/ui/package.json server/ui/package-lock.json* ./

# Install dependencies.
# With a lockfile, `npm ci` installs exactly the locked versions and fails if
# the lockfile is out of sync with package.json, keeping builds reproducible.
# Without one (the COPY above allows that), fall back to `npm install`.
# devDependencies are installed on purpose: the build step below presumably
# needs the bundler/toolchain declared there.
RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi

# Copy UI source
# NOTE(review): make sure .dockerignore excludes server/ui/node_modules and
# server/ui/dist so host artifacts cannot overwrite the clean install above.
COPY server/ui/ ./

# Build production bundle
RUN npm run build

# Stage 2: Python backend with frontend
# Final runtime image: serves the backend with uvicorn on port 8080 and ships
# the frontend bundle built in the frontend-builder stage.
# NOTE(review): there is no USER instruction, so the container runs as root.
# Moving to a non-root user is recommended, but first check ownership of
# /app/ssh_keys, /prometheus-config and /prometheus-targets (and of anything
# mounted over them).
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
# curl is required by the HEALTHCHECK below. openssh-client and unzip are
# presumably used by the application or its scripts - confirm before removing.
# The apt lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        openssh-client \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# Copied and installed before the application code so this layer stays cached
# until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY server/fleet_manager /app/fleet_manager
COPY server/config /app/config
COPY scripts /app/scripts
COPY templates /app/templates

# Copy built frontend from Stage 1
# Must stay after the fleet_manager COPY so the bundle lands inside it.
COPY --from=frontend-builder /app/ui/dist /app/fleet_manager/static

# Create directories
RUN mkdir -p /app/ssh_keys /prometheus-config /prometheus-targets

# Set permissions
# Best effort on purpose: errors (e.g. no *.sh files present) are ignored so
# the build does not fail when there is nothing to mark executable.
RUN chmod +x /app/scripts/*.sh 2>/dev/null || true

# Expose port
EXPOSE 8080

# Health check
# -f fails on HTTP errors, -s suppresses the progress meter and -S still
# prints the error message, so the health log shows only real failures.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -fsS http://localhost:8080/health || exit 1

# Run application
# Exec form: uvicorn runs as PID 1 and receives stop signals directly.
CMD ["uvicorn", "fleet_manager.main:app", "--host", "0.0.0.0", "--port", "8080"]
Loading
Loading