diff --git a/apps/backend/.env.example b/apps/backend/.env.example index b213cf6d..906dd44c 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -28,3 +28,20 @@ CORS_ORIGIN=http://localhost:3000 # News Provider API (CoinDesk) # documentation is https://developers.coindesk.com/documentation/data-api/news_v1_search COINDESK_API_KEY=your_api_key_here + +# ======================== +# AI Metrics / GPU Monitoring +# ======================== + +# Maximum concurrent AI inference requests before throttling (default: 10) +AI_MAX_CONCURRENT_INFERENCES=10 + +# System RAM usage ratio (0-1) that triggers request throttling (default: 0.90) +AI_RAM_THROTTLE_THRESHOLD=0.90 + +# GPU VRAM usage ratio (0-1) that triggers request throttling (default: 0.90) +AI_VRAM_THROTTLE_THRESHOLD=0.90 + +# Resource sampling interval in milliseconds (default: 15000) +AI_METRICS_SAMPLING_MS=15000 + diff --git a/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts b/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts new file mode 100644 index 00000000..5d88b425 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts @@ -0,0 +1,147 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { AiMetricsController } from './ai-metrics.controller'; +import { AiMetricsService, AiHealthReport } from './ai-metrics.service'; + +describe('AiMetricsController', () => { + let controller: AiMetricsController; + let aiMetricsService: Partial; + + const mockReport: AiHealthReport = { + status: 'healthy', + timestamp: '2026-03-26T09:00:00.000Z', + uptime: 12345, + throttling: { + active: false, + reason: null, + currentLoad: 2, + maxConcurrent: 10, + }, + resources: { + totalMemoryBytes: 16e9, + freeMemoryBytes: 8e9, + usedMemoryBytes: 8e9, + memoryUsageRatio: 0.5, + heapUsedBytes: 100e6, + heapTotalBytes: 200e6, + rssBytes: 300e6, + externalBytes: 10e6, + gpuAvailable: false, + vramTotalBytes: null, + vramUsedBytes: null, + vramFreeBytes: null, + 
vramUsageRatio: null, + }, + models: { + totalLoaded: 1, + loadTimes: { 'sentiment-v2': 1200 }, + }, + counters: { + totalInferenceRequests: 42, + totalInferenceErrors: 3, + throttledRequests: 1, + }, + }; + + beforeEach(async () => { + aiMetricsService = { + getHealthReport: jest.fn().mockReturnValue(mockReport), + getPrometheusMetrics: jest + .fn() + .mockResolvedValue('# HELP ai_inference_requests_total\n'), + }; + + const module: TestingModule = await Test.createTestingModule({ + controllers: [AiMetricsController], + providers: [{ provide: AiMetricsService, useValue: aiMetricsService }], + }).compile(); + + controller = module.get(AiMetricsController); + }); + + it('should be defined', () => { + expect(controller).toBeDefined(); + }); + + describe('GET /ai/metrics', () => { + it('should return the health report as JSON', () => { + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status, json } as any; + + controller.getAiMetrics(res); + + expect(status).toHaveBeenCalledWith(200); + expect(json).toHaveBeenCalledWith(mockReport); + }); + + it('should return 500 on error', () => { + (aiMetricsService.getHealthReport as jest.Mock).mockImplementation(() => { + throw new Error('boom'); + }); + + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status, json } as any; + + controller.getAiMetrics(res); + + expect(status).toHaveBeenCalledWith(500); + }); + }); + + describe('GET /ai/metrics/prometheus', () => { + it('should return Prometheus text format', async () => { + const send = jest.fn(); + const set = jest.fn(); + const res = { + set, + send, + status: jest.fn().mockReturnValue({ json: jest.fn() }), + } as any; + + await controller.getPrometheusMetrics(res); + + expect(set).toHaveBeenCalledWith( + 'Content-Type', + 'text/plain; version=0.0.4; charset=utf-8', + ); + expect(send).toHaveBeenCalledWith( + expect.stringContaining('ai_inference_requests_total'), + ); + }); + }); + + 
describe('GET /ai/metrics/health', () => { + it('should return 200 when healthy', () => { + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status } as any; + + controller.getAiHealth(res); + + expect(status).toHaveBeenCalledWith(200); + expect(json).toHaveBeenCalledWith( + expect.objectContaining({ status: 'healthy' }), + ); + }); + + it('should return 503 when critical', () => { + const criticalReport = { + ...mockReport, + status: 'critical' as const, + throttling: { ...mockReport.throttling, active: true }, + }; + (aiMetricsService.getHealthReport as jest.Mock).mockReturnValue( + criticalReport, + ); + + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status } as any; + + controller.getAiHealth(res); + + expect(status).toHaveBeenCalledWith(503); + }); + }); +}); diff --git a/apps/backend/src/ai-metrics/ai-metrics.controller.ts b/apps/backend/src/ai-metrics/ai-metrics.controller.ts new file mode 100644 index 00000000..bb27a434 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.controller.ts @@ -0,0 +1,128 @@ +import { + Controller, + Get, + UseGuards, + Res, + Logger, + HttpStatus, +} from '@nestjs/common'; +import type { Response } from 'express'; +import { AiMetricsService } from './ai-metrics.service'; +import { IpAllowlistGuard } from '../metrics/ip-allowlist.guard'; +import { + ApiTags, + ApiOperation, + ApiResponse, + ApiProduces, +} from '@nestjs/swagger'; + +/** + * Controller that exposes the AI-layer health & performance metrics. 
+ * + * Endpoints: + * GET /ai/metrics — full JSON health report (resource usage, throttling, model stats) + * GET /ai/metrics/prometheus — Prometheus-format text for scraping + * GET /ai/metrics/health — lightweight liveness / readiness check + */ +@ApiTags('ai-metrics') +@Controller('ai/metrics') +@UseGuards(IpAllowlistGuard) +export class AiMetricsController { + private readonly logger = new Logger(AiMetricsController.name); + + constructor(private readonly aiMetricsService: AiMetricsService) {} + + /** + * GET /ai/metrics + * Returns a comprehensive JSON health report including: + * - System status (healthy / degraded / critical) + * - Resource usage (RAM, heap, VRAM) + * - Throttling state & reason + * - Model load times + * - Request & error counters + */ + @Get() + @ApiOperation({ + summary: 'Get AI-layer health & performance metrics', + description: + 'Returns a comprehensive JSON report of the AI subsystem health, ' + + 'including resource utilisation, throttling state, loaded models, and counters.', + }) + @ApiResponse({ + status: 200, + description: 'AI health report in JSON', + }) + @ApiResponse({ + status: 403, + description: 'Forbidden — IP not in allowlist and no valid JWT', + }) + getAiMetrics(@Res() response: Response): void { + try { + const report = this.aiMetricsService.getHealthReport(); + response.status(HttpStatus.OK).json(report); + } catch (error) { + this.logger.error('Error building AI health report:', error); + response + .status(HttpStatus.INTERNAL_SERVER_ERROR) + .json({ error: 'Failed to retrieve AI metrics' }); + } + } + + /** + * GET /ai/metrics/prometheus + * Returns AI-specific metrics in Prometheus text format. 
+ */ + @Get('prometheus') + @ApiOperation({ + summary: 'Get AI metrics in Prometheus format', + description: + 'Returns AI inference, model-load, and resource metrics in Prometheus text format for scraping.', + }) + @ApiProduces('text/plain') + @ApiResponse({ + status: 200, + description: 'Prometheus-format metrics', + }) + async getPrometheusMetrics(@Res() response: Response): Promise { + try { + const metrics = await this.aiMetricsService.getPrometheusMetrics(); + response.set('Content-Type', 'text/plain; version=0.0.4; charset=utf-8'); + response.send(metrics); + } catch (error) { + this.logger.error('Error getting Prometheus AI metrics:', error); + response + .status(HttpStatus.INTERNAL_SERVER_ERROR) + .json({ error: 'Failed to retrieve Prometheus metrics' }); + } + } + + /** + * GET /ai/metrics/health + * Lightweight liveness/readiness check for the AI subsystem. + * Returns 200 when healthy/degraded, 503 when the system should be throttled. + */ + @Get('health') + @ApiOperation({ + summary: 'AI subsystem health check', + description: + 'Returns 200 when the AI layer is operational, 503 when it is under resource pressure and throttling.', + }) + @ApiResponse({ status: 200, description: 'AI layer is healthy or degraded' }) + @ApiResponse({ + status: 503, + description: 'AI layer is in a critical state and throttling requests', + }) + getAiHealth(@Res() response: Response): void { + const report = this.aiMetricsService.getHealthReport(); + const statusCode = + report.status === 'critical' + ? 
HttpStatus.SERVICE_UNAVAILABLE + : HttpStatus.OK; + response.status(statusCode).json({ + status: report.status, + timestamp: report.timestamp, + uptime: report.uptime, + throttling: report.throttling, + }); + } +} diff --git a/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts b/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts new file mode 100644 index 00000000..84066f91 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts @@ -0,0 +1,71 @@ +import { + Injectable, + NestInterceptor, + ExecutionContext, + CallHandler, + Logger, +} from '@nestjs/common'; +import { Observable } from 'rxjs'; +import { tap } from 'rxjs/operators'; +import type { Request } from 'express'; +import { AiMetricsService } from './ai-metrics.service'; + +/** + * Interceptor that automatically instruments AI-related routes with + * inference latency tracking. + * + * Apply it to controllers or individual routes: + * @UseInterceptors(AiMetricsInterceptor) + * + * The interceptor reads the `x-ai-model` header (or falls back to the + * route path) to identify the model being used, then records timing + * via the AiMetricsService. + */ +@Injectable() +export class AiMetricsInterceptor implements NestInterceptor { + private readonly logger = new Logger(AiMetricsInterceptor.name); + + constructor(private readonly aiMetrics: AiMetricsService) {} + + intercept(context: ExecutionContext, next: CallHandler): Observable { + const request = context.switchToHttp().getRequest(); + const modelName = + (request.headers['x-ai-model'] as string | undefined) || + this.extractModelFromRoute(request.path); + + const tracker = this.aiMetrics.startInference(modelName); + + return next.handle().pipe( + tap({ + next: () => { + tracker.end('success'); + }, + error: (error: unknown) => { + const errorType = + error instanceof Error ? error.constructor.name : 'UnknownError'; + tracker.end('error', errorType); + }, + }), + ); + } + + /** + * Derive a model identifier from the route path. 
+ * e.g. /analyze → "sentiment", /retrain → "retraining" + */ + private extractModelFromRoute(path: string): string { + const cleanPath = (path || '').replace(/^\/+|\/+$/g, '').toLowerCase(); + + if (cleanPath.includes('sentiment') || cleanPath.includes('analyze')) { + return 'sentiment'; + } + if (cleanPath.includes('retrain')) { + return 'retraining'; + } + if (cleanPath.includes('predict') || cleanPath.includes('forecast')) { + return 'forecasting'; + } + + return cleanPath || 'unknown'; + } +} diff --git a/apps/backend/src/ai-metrics/ai-metrics.module.ts b/apps/backend/src/ai-metrics/ai-metrics.module.ts new file mode 100644 index 00000000..4b893180 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.module.ts @@ -0,0 +1,36 @@ +import { Module, Global } from '@nestjs/common'; +import { ConfigModule } from '@nestjs/config'; +import { AiMetricsService } from './ai-metrics.service'; +import { AiMetricsController } from './ai-metrics.controller'; +import { AiThrottleGuard } from './ai-throttle.guard'; +import { AiMetricsInterceptor } from './ai-metrics.interceptor'; + +/** + * AI Metrics Module + * + * Global module that provides GPU/resource monitoring and health dashboarding + * for the AI inference layer. 
+ * + * Includes: + * - AI inference request metrics (count, latency, errors) + * - Model load time tracking + * - System RAM and GPU VRAM monitoring + * - Automatic throttling guard for resource pressure + * - GET /ai/metrics endpoint (JSON health report) + * - GET /ai/metrics/prometheus (Prometheus scraping) + * - GET /ai/metrics/health (liveness check) + * + * Environment Variables: + * - AI_MAX_CONCURRENT_INFERENCES: Max concurrent AI requests (default: 10) + * - AI_RAM_THROTTLE_THRESHOLD: RAM usage ratio to trigger throttle (default: 0.90) + * - AI_VRAM_THROTTLE_THRESHOLD: VRAM usage ratio to trigger throttle (default: 0.90) + * - AI_METRICS_SAMPLING_MS: Resource sampling interval in ms (default: 15000) + */ +@Global() +@Module({ + imports: [ConfigModule], + providers: [AiMetricsService, AiThrottleGuard, AiMetricsInterceptor], + controllers: [AiMetricsController], + exports: [AiMetricsService, AiThrottleGuard, AiMetricsInterceptor], +}) +export class AiMetricsModule {} diff --git a/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts b/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts new file mode 100644 index 00000000..04b3f2bf --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts @@ -0,0 +1,310 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { ConfigService } from '@nestjs/config'; +import { register } from 'prom-client'; +import { AiMetricsService } from './ai-metrics.service'; + +/** + * Clear the Prometheus registry between tests to avoid + * "duplicate metric" errors when the service is re-instantiated. 
+ */ +function clearPrometheusRegistry() { + register.clear(); +} + +describe('AiMetricsService', () => { + let service: AiMetricsService; + let configService: ConfigService; + + beforeEach(async () => { + clearPrometheusRegistry(); + + const module: TestingModule = await Test.createTestingModule({ + providers: [ + AiMetricsService, + { + provide: ConfigService, + useValue: { + get: jest.fn((key: string, fallback?: string) => { + const env: Record = { + AI_MAX_CONCURRENT_INFERENCES: '3', + AI_RAM_THROTTLE_THRESHOLD: '0.99', + AI_VRAM_THROTTLE_THRESHOLD: '0.99', + AI_METRICS_SAMPLING_MS: '60000', + }; + return env[key] ?? fallback; + }), + }, + }, + ], + }).compile(); + + service = module.get(AiMetricsService); + configService = module.get(ConfigService); + }); + + afterEach(() => { + // Stop the periodic sampler + service.onModuleDestroy(); + clearPrometheusRegistry(); + }); + + // ── construction ────────────────────────────────────────────── + + it('should be defined', () => { + expect(service).toBeDefined(); + }); + + it('should read configuration values from ConfigService', () => { + expect(configService.get).toHaveBeenCalledWith( + 'AI_MAX_CONCURRENT_INFERENCES', + '10', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_RAM_THROTTLE_THRESHOLD', + '0.90', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_VRAM_THROTTLE_THRESHOLD', + '0.90', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_METRICS_SAMPLING_MS', + '15000', + ); + }); + + // ── model load tracking ─────────────────────────────────────── + + describe('recordModelLoad / recordModelUnload', () => { + it('should track model load times', () => { + service.recordModelLoad('sentiment-v2', 1200); + service.recordModelLoad('forecast-v1', 3500); + + const report = service.getHealthReport(); + expect(report.models.totalLoaded).toBe(2); + expect(report.models.loadTimes['sentiment-v2']).toBe(1200); + expect(report.models.loadTimes['forecast-v1']).toBe(3500); + }); + + 
it('should decrement loaded model count on unload', () => { + service.recordModelLoad('sentiment-v2', 1200); + service.recordModelLoad('forecast-v1', 3500); + service.recordModelUnload('sentiment-v2'); + + const report = service.getHealthReport(); + expect(report.models.totalLoaded).toBe(1); + expect(report.models.loadTimes['sentiment-v2']).toBeUndefined(); + expect(report.models.loadTimes['forecast-v1']).toBe(3500); + }); + }); + + // ── inference tracking ──────────────────────────────────────── + + describe('startInference', () => { + it('should increment and decrement concurrent inferences', () => { + const tracker = service.startInference('sentiment'); + + let report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(1); + + tracker.end('success'); + + report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(0); + }); + + it('should count total inference requests', () => { + const t1 = service.startInference('sentiment'); + const t2 = service.startInference('forecast'); + t1.end('success'); + t2.end('success'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceRequests).toBe(2); + expect(report.counters.totalInferenceErrors).toBe(0); + }); + + it('should count errors and error types', () => { + const t1 = service.startInference('sentiment'); + t1.end('error', 'TimeoutError'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceErrors).toBe(1); + expect(report.counters.totalInferenceRequests).toBe(1); + }); + + it('should handle default error type', () => { + const t1 = service.startInference('sentiment'); + t1.end('error'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceErrors).toBe(1); + }); + + it('should never go below zero concurrent inferences', () => { + const t1 = service.startInference('sentiment'); + t1.end('success'); + // Double-ending should not crash or go negative + t1.end('success'); + + const 
report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(0); + }); + }); + + // ── throttling logic ────────────────────────────────────────── + + describe('shouldThrottle', () => { + it('should throttle when max concurrent inferences reached', () => { + // config has maxConcurrent = 3 + service.startInference('m1'); + service.startInference('m2'); + service.startInference('m3'); + + const result = service.shouldThrottle(); + expect(result.throttle).toBe(true); + expect(result.reason).toContain('Concurrency limit reached'); + }); + + it('should not throttle when under limits', () => { + const t = service.startInference('m1'); + const result = service.shouldThrottle(); + expect(result.throttle).toBe(false); + expect(result.reason).toBeNull(); + t.end('success'); + }); + }); + + describe('recordThrottledRequest', () => { + it('should increment throttled request counter', () => { + service.recordThrottledRequest(); + service.recordThrottledRequest(); + + const report = service.getHealthReport(); + expect(report.counters.throttledRequests).toBe(2); + }); + }); + + // ── resource snapshot ───────────────────────────────────────── + + describe('getResourceSnapshot', () => { + it('should return valid memory information', () => { + const snapshot = service.getResourceSnapshot(); + + expect(snapshot.totalMemoryBytes).toBeGreaterThan(0); + expect(snapshot.freeMemoryBytes).toBeGreaterThanOrEqual(0); + expect(snapshot.usedMemoryBytes).toBeGreaterThan(0); + expect(snapshot.memoryUsageRatio).toBeGreaterThanOrEqual(0); + expect(snapshot.memoryUsageRatio).toBeLessThanOrEqual(1); + + expect(snapshot.heapUsedBytes).toBeGreaterThan(0); + expect(snapshot.heapTotalBytes).toBeGreaterThan(0); + expect(snapshot.rssBytes).toBeGreaterThan(0); + expect(snapshot.externalBytes).toBeGreaterThanOrEqual(0); + + // GPU is unlikely to be available in CI, so just check the field exists + expect(typeof snapshot.gpuAvailable).toBe('boolean'); + }); + }); + + // ── health report 
───────────────────────────────────────────── + + describe('getHealthReport', () => { + it('should return a well-formed report', () => { + const report = service.getHealthReport(); + + expect(report.status).toMatch(/^(healthy|degraded|critical)$/); + expect(report.timestamp).toBeDefined(); + expect(report.uptime).toBeGreaterThanOrEqual(0); + + expect(report.throttling).toEqual( + expect.objectContaining({ + active: expect.any(Boolean), + currentLoad: expect.any(Number), + maxConcurrent: 3, + }), + ); + + expect(report.resources).toEqual( + expect.objectContaining({ + totalMemoryBytes: expect.any(Number), + freeMemoryBytes: expect.any(Number), + usedMemoryBytes: expect.any(Number), + memoryUsageRatio: expect.any(Number), + }), + ); + + expect(report.models).toEqual({ + totalLoaded: 0, + loadTimes: {}, + }); + + expect(report.counters).toEqual({ + totalInferenceRequests: 0, + totalInferenceErrors: 0, + throttledRequests: 0, + }); + }); + + it('should report critical status when throttling is active', () => { + // Fill concurrency to trigger throttle + service.startInference('m1'); + service.startInference('m2'); + service.startInference('m3'); + + const report = service.getHealthReport(); + expect(report.status).toBe('critical'); + expect(report.throttling.active).toBe(true); + }); + }); + + // ── Prometheus output ───────────────────────────────────────── + + describe('getPrometheusMetrics', () => { + it('should return a non-empty Prometheus text payload', async () => { + const output = await service.getPrometheusMetrics(); + expect(typeof output).toBe('string'); + expect(output.length).toBeGreaterThan(0); + + // Should contain our custom metrics + expect(output).toContain('ai_inference_requests_total'); + expect(output).toContain('ai_inference_duration_seconds'); + expect(output).toContain('ai_model_load_duration_seconds'); + expect(output).toContain('ai_system_memory_usage_ratio'); + expect(output).toContain('ai_concurrent_inferences'); + 
expect(output).toContain('ai_throttled_requests_total'); + }); + + it('should include recorded model load metrics', async () => { + service.recordModelLoad('test-model', 500); + + const output = await service.getPrometheusMetrics(); + expect(output).toContain('ai_models_loaded_count'); + }); + + it('should include inference latency after a request', async () => { + const tracker = service.startInference('test-model'); + tracker.end('success'); + + const output = await service.getPrometheusMetrics(); + expect(output).toContain('ai_inference_duration_seconds'); + expect(output).toContain('test-model'); + }); + }); + + // ── lifecycle ───────────────────────────────────────────────── + + describe('onModuleInit', () => { + it('should not throw', () => { + expect(() => service.onModuleInit()).not.toThrow(); + }); + }); + + describe('onModuleDestroy', () => { + it('should stop the sampler without errors', () => { + expect(() => service.onModuleDestroy()).not.toThrow(); + // calling it twice should still be safe + expect(() => service.onModuleDestroy()).not.toThrow(); + }); + }); +}); diff --git a/apps/backend/src/ai-metrics/ai-metrics.service.ts b/apps/backend/src/ai-metrics/ai-metrics.service.ts new file mode 100644 index 00000000..5fd18fb7 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.service.ts @@ -0,0 +1,557 @@ +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { Counter, Histogram, Gauge, Summary, register } from 'prom-client'; +import * as os from 'os'; +import { execSync } from 'child_process'; + +/** + * Snapshot of current system resource utilisation. 
+ */ +export interface ResourceSnapshot { + /** Total system RAM in bytes */ + totalMemoryBytes: number; + /** Free system RAM in bytes */ + freeMemoryBytes: number; + /** Used system RAM in bytes */ + usedMemoryBytes: number; + /** RAM utilisation ratio (0-1) */ + memoryUsageRatio: number; + /** Node.js heap used bytes */ + heapUsedBytes: number; + /** Node.js heap total bytes */ + heapTotalBytes: number; + /** Node.js RSS (resident set size) bytes */ + rssBytes: number; + /** Node.js external memory bytes */ + externalBytes: number; + /** Whether VRAM info is available (GPU detected) */ + gpuAvailable: boolean; + /** Total VRAM bytes (if GPU detected) */ + vramTotalBytes: number | null; + /** Used VRAM bytes (if GPU detected) */ + vramUsedBytes: number | null; + /** Free VRAM bytes (if GPU detected) */ + vramFreeBytes: number | null; + /** VRAM utilisation ratio 0-1 (if GPU detected) */ + vramUsageRatio: number | null; +} + +/** + * Full AI health report returned by GET /ai/metrics + */ +export interface AiHealthReport { + status: 'healthy' | 'degraded' | 'critical'; + timestamp: string; + uptime: number; + throttling: { + active: boolean; + reason: string | null; + currentLoad: number; + maxConcurrent: number; + }; + resources: ResourceSnapshot; + models: { + totalLoaded: number; + loadTimes: Record; + }; + counters: { + totalInferenceRequests: number; + totalInferenceErrors: number; + throttledRequests: number; + }; +} + +/** + * Service for collecting AI-layer performance metrics. 
+ * + * Responsibilities: + * - Track model load times & counts + * - Track inference latency per model / per request + * - Monitor system RAM and (when available) GPU VRAM + * - Expose Prometheus-compatible gauges, counters, histograms + * - Provide a health check that can be used to throttle requests + */ +@Injectable() +export class AiMetricsService implements OnModuleInit { + private readonly logger = new Logger(AiMetricsService.name); + + // ── Prometheus primitives ──────────────────────────────────────── + + /** Total AI inference requests */ + readonly aiRequestCounter: Counter; + + /** Total AI inference errors */ + readonly aiErrorCounter: Counter; + + /** Histogram of inference latency (seconds) per model */ + readonly aiInferenceLatency: Histogram; + + /** Histogram of model load / warm-up time (seconds) */ + readonly aiModelLoadTime: Histogram; + + /** Summary for quick percentile view of inference latency */ + readonly aiInferenceLatencySummary: Summary; + + /** Number of models currently loaded */ + readonly aiModelsLoaded: Gauge; + + /** System RAM usage ratio gauge */ + readonly systemMemoryUsageRatio: Gauge; + + /** System RAM used bytes gauge */ + readonly systemMemoryUsedBytes: Gauge; + + /** Node.js heap used bytes gauge */ + readonly nodeHeapUsedBytes: Gauge; + + /** Node.js RSS bytes gauge */ + readonly nodeRssBytes: Gauge; + + /** GPU VRAM usage ratio gauge (if available) */ + readonly gpuVramUsageRatio: Gauge; + + /** GPU VRAM used bytes gauge (if available) */ + readonly gpuVramUsedBytes: Gauge; + + /** Count of requests throttled due to resource pressure */ + readonly throttledRequestCounter: Counter; + + /** Current concurrent AI inference count */ + readonly aiConcurrentInferences: Gauge; + + // ── Internal state ─────────────────────────────────────────────── + + /** Map of model name → load duration in ms */ + private readonly modelLoadTimes = new Map(); + + /** Current concurrent AI inference count */ + private concurrentInferences = 
0; + + /** Maximum concurrent inferences before throttling */ + private readonly maxConcurrentInferences: number; + + /** RAM usage ratio threshold that triggers throttling */ + private readonly ramThrottleThreshold: number; + + /** VRAM usage ratio threshold that triggers throttling */ + private readonly vramThrottleThreshold: number; + + /** Interval handle for periodic resource sampling */ + private resourceSamplerInterval: ReturnType | null = null; + + /** Resource sampling period in ms */ + private readonly samplingIntervalMs: number; + + /** Most recent GPU probe result (cached to avoid shelling out too often) */ + private cachedGpuInfo: { + available: boolean; + totalBytes: number | null; + usedBytes: number | null; + freeBytes: number | null; + usageRatio: number | null; + } = { + available: false, + totalBytes: null, + usedBytes: null, + freeBytes: null, + usageRatio: null, + }; + + /** Total inference request count (fast in-memory mirror) */ + private totalInferenceRequests = 0; + /** Total inference error count */ + private totalInferenceErrors = 0; + /** Total throttled requests */ + private totalThrottledRequests = 0; + + // ──────────────────────────────────────────────────────────────── + + constructor(private readonly config: ConfigService) { + // Read tunables from env (with sensible defaults) + this.maxConcurrentInferences = Number( + this.config.get('AI_MAX_CONCURRENT_INFERENCES', '10'), + ); + this.ramThrottleThreshold = Number( + this.config.get('AI_RAM_THROTTLE_THRESHOLD', '0.90'), + ); + this.vramThrottleThreshold = Number( + this.config.get('AI_VRAM_THROTTLE_THRESHOLD', '0.90'), + ); + this.samplingIntervalMs = Number( + this.config.get('AI_METRICS_SAMPLING_MS', '15000'), + ); + + // ── Register Prometheus metrics ───────────────────────────── + + this.aiRequestCounter = new Counter({ + name: 'ai_inference_requests_total', + help: 'Total number of AI inference requests', + labelNames: ['model', 'status'], + }); + + this.aiErrorCounter = new 
Counter({ + name: 'ai_inference_errors_total', + help: 'Total number of AI inference errors', + labelNames: ['model', 'error_type'], + }); + + this.aiInferenceLatency = new Histogram({ + name: 'ai_inference_duration_seconds', + help: 'AI inference latency in seconds', + labelNames: ['model'], + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30], + }); + + this.aiModelLoadTime = new Histogram({ + name: 'ai_model_load_duration_seconds', + help: 'Time taken to load / warm-up an AI model (seconds)', + labelNames: ['model'], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120], + }); + + this.aiInferenceLatencySummary = new Summary({ + name: 'ai_inference_latency_summary', + help: 'Summary of AI inference latency with percentiles', + labelNames: ['model'], + percentiles: [0.5, 0.9, 0.95, 0.99], + maxAgeSeconds: 600, + ageBuckets: 5, + }); + + this.aiModelsLoaded = new Gauge({ + name: 'ai_models_loaded_count', + help: 'Number of AI models currently loaded in memory', + }); + + this.systemMemoryUsageRatio = new Gauge({ + name: 'ai_system_memory_usage_ratio', + help: 'System RAM usage ratio (0-1)', + }); + + this.systemMemoryUsedBytes = new Gauge({ + name: 'ai_system_memory_used_bytes', + help: 'System RAM used (bytes)', + }); + + this.nodeHeapUsedBytes = new Gauge({ + name: 'ai_node_heap_used_bytes', + help: 'Node.js V8 heap used (bytes)', + }); + + this.nodeRssBytes = new Gauge({ + name: 'ai_node_rss_bytes', + help: 'Node.js RSS resident set size (bytes)', + }); + + this.gpuVramUsageRatio = new Gauge({ + name: 'ai_gpu_vram_usage_ratio', + help: 'GPU VRAM usage ratio (0-1). -1 when not available.', + }); + + this.gpuVramUsedBytes = new Gauge({ + name: 'ai_gpu_vram_used_bytes', + help: 'GPU VRAM used (bytes). 
-1 when not available.', + }); + + this.throttledRequestCounter = new Counter({ + name: 'ai_throttled_requests_total', + help: 'Number of AI requests rejected/throttled due to resource pressure', + }); + + this.aiConcurrentInferences = new Gauge({ + name: 'ai_concurrent_inferences', + help: 'Number of AI inferences currently running', + }); + + this.logger.log( + `AI metrics service constructed — maxConcurrent=${this.maxConcurrentInferences}, ` + + `ramThreshold=${this.ramThrottleThreshold}, vramThreshold=${this.vramThrottleThreshold}`, + ); + } + + // ── Lifecycle ──────────────────────────────────────────────────── + + onModuleInit(): void { + // Take an initial resource reading + this.sampleResources(); + + // Start periodic sampling + this.resourceSamplerInterval = setInterval( + () => this.sampleResources(), + this.samplingIntervalMs, + ); + + this.logger.log( + `AI metrics resource sampler started (interval=${this.samplingIntervalMs}ms)`, + ); + } + + onModuleDestroy(): void { + if (this.resourceSamplerInterval) { + clearInterval(this.resourceSamplerInterval); + this.resourceSamplerInterval = null; + } + } + + // ── Public API ─────────────────────────────────────────────────── + + /** + * Record a model being loaded / warmed-up. + * @param modelName logical model identifier + * @param durationMs time to load in milliseconds + */ + recordModelLoad(modelName: string, durationMs: number): void { + this.modelLoadTimes.set(modelName, durationMs); + this.aiModelLoadTime.labels(modelName).observe(durationMs / 1000); + this.aiModelsLoaded.set(this.modelLoadTimes.size); + this.logger.log( + `Model "${modelName}" loaded in ${durationMs.toFixed(1)}ms`, + ); + } + + /** + * Record a model being unloaded from memory. + */ + recordModelUnload(modelName: string): void { + this.modelLoadTimes.delete(modelName); + this.aiModelsLoaded.set(this.modelLoadTimes.size); + this.logger.log(`Model "${modelName}" unloaded`); + } + + /** + * Start an inference timing context. 
+   * Returns an `end` callback and increments the concurrent counter.
+   */
+  startInference(modelName: string): {
+    end: (status: 'success' | 'error', errorType?: string) => void;
+  } {
+    this.concurrentInferences++;
+    this.aiConcurrentInferences.set(this.concurrentInferences);
+    this.totalInferenceRequests++;
+
+    const startMs = Date.now();
+
+    return {
+      end: (status: 'success' | 'error', errorType?: string) => {
+        const durationMs = Date.now() - startMs;
+        const durationSec = durationMs / 1000;
+
+        this.concurrentInferences = Math.max(0, this.concurrentInferences - 1);
+        this.aiConcurrentInferences.set(this.concurrentInferences);
+
+        this.aiRequestCounter.labels(modelName, status).inc();
+        this.aiInferenceLatency.labels(modelName).observe(durationSec);
+        this.aiInferenceLatencySummary.labels(modelName).observe(durationSec);
+
+        if (status === 'error') {
+          this.totalInferenceErrors++;
+          this.aiErrorCounter.labels(modelName, errorType ?? 'unknown').inc();
+        }
+
+        this.logger.debug(
+          `Inference [${modelName}] completed in ${durationMs}ms — status=${status}`,
+        );
+      },
+    };
+  }
+
+  /**
+   * Evaluate whether the system should throttle new AI requests.
+   * Returns `{ throttle: boolean; reason?: string }`.
+   */
+  shouldThrottle(): { throttle: boolean; reason: string | null } {
+    // 1. Concurrency limit
+    if (this.concurrentInferences >= this.maxConcurrentInferences) {
+      return {
+        throttle: true,
+        reason: `Concurrency limit reached (${this.concurrentInferences}/${this.maxConcurrentInferences})`,
+      };
+    }
+
+    // 2. System RAM (read fresh, not from the cached sampler)
+    const totalMem = os.totalmem();
+    const freeMem = os.freemem();
+    const usedRatio = (totalMem - freeMem) / totalMem;
+    if (usedRatio >= this.ramThrottleThreshold) {
+      return {
+        throttle: true,
+        reason: `System RAM usage at ${(usedRatio * 100).toFixed(1)}% (threshold ${(this.ramThrottleThreshold * 100).toFixed(0)}%)`,
+      };
+    }
+
+    // 3. VRAM (if available; uses the value cached by probeGpu())
+    if (
+      this.cachedGpuInfo.available &&
+      this.cachedGpuInfo.usageRatio !== null &&
+      this.cachedGpuInfo.usageRatio >= this.vramThrottleThreshold
+    ) {
+      return {
+        throttle: true,
+        reason: `GPU VRAM usage at ${(this.cachedGpuInfo.usageRatio * 100).toFixed(1)}% (threshold ${(this.vramThrottleThreshold * 100).toFixed(0)}%)`,
+      };
+    }
+
+    return { throttle: false, reason: null };
+  }
+
+  /**
+   * Increment the throttled-requests counter.
+   */
+  recordThrottledRequest(): void {
+    this.totalThrottledRequests++;
+    this.throttledRequestCounter.inc();
+  }
+
+  /**
+   * Build a ResourceSnapshot from current system state.
+   */
+  getResourceSnapshot(): ResourceSnapshot {
+    const totalMem = os.totalmem();
+    const freeMem = os.freemem();
+    const usedMem = totalMem - freeMem;
+    const memUsage = process.memoryUsage();
+
+    return {
+      totalMemoryBytes: totalMem,
+      freeMemoryBytes: freeMem,
+      usedMemoryBytes: usedMem,
+      memoryUsageRatio: usedMem / totalMem,
+      heapUsedBytes: memUsage.heapUsed,
+      heapTotalBytes: memUsage.heapTotal,
+      rssBytes: memUsage.rss,
+      externalBytes: memUsage.external,
+      gpuAvailable: this.cachedGpuInfo.available,
+      vramTotalBytes: this.cachedGpuInfo.totalBytes,
+      vramUsedBytes: this.cachedGpuInfo.usedBytes,
+      vramFreeBytes: this.cachedGpuInfo.freeBytes,
+      vramUsageRatio: this.cachedGpuInfo.usageRatio,
+    };
+  }
+
+  /**
+   * Build the full health report object.
+   */
+  getHealthReport(): AiHealthReport {
+    const resources = this.getResourceSnapshot();
+    const throttleCheck = this.shouldThrottle();
+
+    let status: AiHealthReport['status'] = 'healthy';
+    if (throttleCheck.throttle) {
+      status = 'critical';
+    } else if (resources.memoryUsageRatio > 0.75) {
+      status = 'degraded';
+    }
+
+    return {
+      status,
+      timestamp: new Date().toISOString(),
+      uptime: process.uptime(),
+      throttling: {
+        active: throttleCheck.throttle,
+        reason: throttleCheck.reason,
+        currentLoad: this.concurrentInferences,
+        maxConcurrent: this.maxConcurrentInferences,
+      },
+      resources,
+      models: {
+        totalLoaded: this.modelLoadTimes.size,
+        loadTimes: Object.fromEntries(this.modelLoadTimes),
+      },
+      counters: {
+        totalInferenceRequests: this.totalInferenceRequests,
+        totalInferenceErrors: this.totalInferenceErrors,
+        throttledRequests: this.totalThrottledRequests,
+      },
+    };
+  }
+
+  /**
+   * Return all registered AI metrics in Prometheus text format.
+   */
+  async getPrometheusMetrics(): Promise<string> {
+    return register.metrics();
+  }
+
+  // ── Private helpers ──────────────────────────────────────────────
+
+  /**
+   * Sample system resources and update Prometheus gauges.
+   * Called on a timer.
+   */
+  private sampleResources(): void {
+    try {
+      const totalMem = os.totalmem();
+      const freeMem = os.freemem();
+      const usedMem = totalMem - freeMem;
+      const usageRatio = usedMem / totalMem;
+
+      this.systemMemoryUsageRatio.set(usageRatio);
+      this.systemMemoryUsedBytes.set(usedMem);
+
+      const memUsage = process.memoryUsage();
+      this.nodeHeapUsedBytes.set(memUsage.heapUsed);
+      this.nodeRssBytes.set(memUsage.rss);
+
+      // Attempt to probe GPU (nvidia-smi). Result is cached.
+      this.probeGpu();
+    } catch (err) {
+      this.logger.warn(
+        `Resource sampling error: ${err instanceof Error ? err.message : String(err)}`,
+      );
+    }
+  }
+
+  /**
+   * Attempt to read GPU VRAM via nvidia-smi.
+   * If nvidia-smi is not available the GPU is marked as unavailable
+   * and we stop retrying until next sample cycle. NOTE(review): only the
+   * first GPU's line is effectively parsed — confirm single-GPU hosts.
+   */
+  private probeGpu(): void {
+    try {
+      const output = execSync(
+        'nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv,noheader,nounits',
+        { timeout: 5000, encoding: 'utf-8' },
+      );
+
+      const parts = output
+        .trim()
+        .split(',')
+        .map((s: string) => s.trim());
+      if (parts.length >= 3) {
+        const totalMiB = parseFloat(parts[0]);
+        const usedMiB = parseFloat(parts[1]);
+        const freeMiB = parseFloat(parts[2]);
+
+        const totalBytes = totalMiB * 1024 * 1024;
+        const usedBytes = usedMiB * 1024 * 1024;
+        const freeBytes = freeMiB * 1024 * 1024;
+        const usageRatio = totalBytes > 0 ? usedBytes / totalBytes : 0;
+
+        this.cachedGpuInfo = {
+          available: true,
+          totalBytes,
+          usedBytes,
+          freeBytes,
+          usageRatio,
+        };
+
+        this.gpuVramUsageRatio.set(usageRatio);
+        this.gpuVramUsedBytes.set(usedBytes);
+      }
+    } catch {
+      // nvidia-smi not available — mark GPU as absent
+      if (this.cachedGpuInfo.available) {
+        this.logger.debug('GPU not detected (nvidia-smi unavailable)');
+      }
+      this.cachedGpuInfo = {
+        available: false,
+        totalBytes: null,
+        usedBytes: null,
+        freeBytes: null,
+        usageRatio: null,
+      };
+      this.gpuVramUsageRatio.set(-1);
+      this.gpuVramUsedBytes.set(-1);
+    }
+  }
+}
diff --git a/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts b/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts
new file mode 100644
index 00000000..8bb92bad
--- /dev/null
+++ b/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts
@@ -0,0 +1,76 @@
+import { ExecutionContext, HttpException, HttpStatus } from '@nestjs/common';
+import { AiThrottleGuard } from './ai-throttle.guard';
+import { AiMetricsService } from './ai-metrics.service';
+
+describe('AiThrottleGuard', () => {
+  let guard: AiThrottleGuard;
+  let aiMetrics: Partial<AiMetricsService>;
+
+  const mockExecutionContext = (): ExecutionContext => {
+    const setHeader = jest.fn();
+    return {
+      switchToHttp: () => ({
+        getRequest: () => ({}),
+        getResponse: () => ({ setHeader }),
+      }),
+    } as unknown as ExecutionContext;
+  };
+
+  beforeEach(() => {
+    aiMetrics = {
+      shouldThrottle: jest.fn(),
+      recordThrottledRequest: jest.fn(),
+    };
+    guard = new AiThrottleGuard(aiMetrics as AiMetricsService);
+  });
+
+  it('should allow request when not throttling', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: false,
+      reason: null,
+    });
+
+    expect(guard.canActivate(mockExecutionContext())).toBe(true);
+    expect(aiMetrics.recordThrottledRequest).not.toHaveBeenCalled();
+  });
+
+  it('should throw 503 when throttling is active', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: true,
+      reason: 'Concurrency limit reached (10/10)',
+    });
+
+    expect(() => guard.canActivate(mockExecutionContext())).toThrow(
+      HttpException,
+    );
+
+    try {
+      guard.canActivate(mockExecutionContext());
+    } catch (e) {
+      expect(e).toBeInstanceOf(HttpException);
+      expect((e as HttpException).getStatus()).toBe(
+        HttpStatus.SERVICE_UNAVAILABLE,
+      );
+    }
+
+    expect(aiMetrics.recordThrottledRequest).toHaveBeenCalled();
+  });
+
+  it('should set Retry-After header when throttling', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: true,
+      reason: 'High RAM usage',
+    });
+
+    const ctx = mockExecutionContext();
+    const setHeader = ctx.switchToHttp().getResponse().setHeader;
+
+    try {
+      guard.canActivate(ctx);
+    } catch {
+      // expected
+    }
+
+    expect(setHeader).toHaveBeenCalledWith('Retry-After', '30');
+  });
+});
diff --git a/apps/backend/src/ai-metrics/ai-throttle.guard.ts b/apps/backend/src/ai-metrics/ai-throttle.guard.ts
new file mode 100644
index 00000000..6fca1eca
--- /dev/null
+++ b/apps/backend/src/ai-metrics/ai-throttle.guard.ts
@@ -0,0 +1,51 @@
+import {
+  Injectable,
+  CanActivate,
+  ExecutionContext,
+  HttpException,
+  HttpStatus,
+  Logger,
+} from '@nestjs/common';
+import type { Response } from 'express';
+import {
AiMetricsService } from './ai-metrics.service';
+
+/**
+ * Guard that checks system resource pressure before allowing
+ * AI inference requests to proceed.
+ *
+ * When memory (RAM or VRAM) exceeds the configured threshold or
+ * the concurrency limit is reached, the guard rejects the request
+ * with 503 Service Unavailable and a Retry-After header.
+ *
+ * Apply this guard to any controller or route that triggers AI inference:
+ *   @UseGuards(AiThrottleGuard)
+ */
+@Injectable()
+export class AiThrottleGuard implements CanActivate {
+  private readonly logger = new Logger(AiThrottleGuard.name);
+
+  constructor(private readonly aiMetrics: AiMetricsService) {}
+
+  canActivate(context: ExecutionContext): boolean {
+    const { throttle, reason } = this.aiMetrics.shouldThrottle();
+
+    if (throttle) {
+      this.aiMetrics.recordThrottledRequest();
+      this.logger.warn(`AI request throttled — ${reason}`);
+
+      // Typed response so setHeader is checked against the express Response
+      const response = context.switchToHttp().getResponse<Response>();
+      response.setHeader('Retry-After', '30');
+
+      throw new HttpException(
+        {
+          statusCode: HttpStatus.SERVICE_UNAVAILABLE,
+          message: 'AI service is under resource pressure. Please retry later.',
+          reason,
+        },
+        HttpStatus.SERVICE_UNAVAILABLE,
+      );
+    }
+
+    return true;
+  }
+}
diff --git a/apps/backend/src/ai-metrics/index.ts b/apps/backend/src/ai-metrics/index.ts
new file mode 100644
index 00000000..047f08b6
--- /dev/null
+++ b/apps/backend/src/ai-metrics/index.ts
@@ -0,0 +1,5 @@
+export { AiMetricsModule } from './ai-metrics.module';
+export { AiMetricsService } from './ai-metrics.service';
+export type { ResourceSnapshot, AiHealthReport } from './ai-metrics.service';
+export { AiThrottleGuard } from './ai-throttle.guard';
+export { AiMetricsInterceptor } from './ai-metrics.interceptor';
diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts
index aa992021..800e966a 100644
--- a/apps/backend/src/app.module.ts
+++ b/apps/backend/src/app.module.ts
@@ -19,6 +19,7 @@ import { LoggerMiddleware } from './common/middleware/logger.middleware';
 import { TestController } from './test/test.controller';
 import { SnapshotsModule } from './snapshot/snapshot.module';
 import { ModelRetrainingModule } from './model-retraining/model-retraining.module';
+import { AiMetricsModule } from './ai-metrics/ai-metrics.module';
 import { DataSource, DataSourceOptions } from 'typeorm';
 import stellarConfig from './stellar/config/stellar.config';
@@ -71,6 +72,7 @@ const appLogger = new Logger('TypeORM');
     PortfolioModule,
     SnapshotsModule,
     ModelRetrainingModule,
+    AiMetricsModule,
   ],
   controllers: [AppController, TestController, TestExceptionController],
   providers: [
diff --git a/apps/backend/src/model-retraining/model-retraining.controller.ts b/apps/backend/src/model-retraining/model-retraining.controller.ts
index 8d100881..381525a4 100644
--- a/apps/backend/src/model-retraining/model-retraining.controller.ts
+++ b/apps/backend/src/model-retraining/model-retraining.controller.ts
@@ -11,7 +11,11 @@ import { JwtAuthGuard } from '../auth/jwt-auth.guard';
 import { RolesGuard } from '../auth/roles.guard';
 import { Roles } from
'../auth/decorators/auth.decorators';
 import { UserRole } from '../users/entities/user.entity';
-import { ModelRetrainingService, RetrainResult, ModelStatusResult } from './model-retraining.service';
+import {
+  ModelRetrainingService,
+  RetrainResult,
+  ModelStatusResult,
+} from './model-retraining.service';
 
 class TriggerRetrainDto {
   force?: boolean;
diff --git a/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts b/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
index 50112c7b..8e3a3a75 100644
--- a/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
+++ b/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
@@ -53,7 +53,10 @@ export class PortfolioSummaryResponseDto {
   })
   totalValueUsd: string;
 
-  @ApiProperty({ description: 'Individual asset balances', type: [AssetBalanceDto] })
+  @ApiProperty({
+    description: 'Individual asset balances',
+    type: [AssetBalanceDto],
+  })
   assets: AssetBalanceDto[];
 
   @ApiProperty({
@@ -64,7 +67,8 @@ export class PortfolioSummaryResponseDto {
   lastUpdated: Date | null;
 
   @ApiProperty({
-    description: 'Indicates whether the user has a linked Stellar account with snapshots',
+    description:
+      'Indicates whether the user has a linked Stellar account with snapshots',
     example: true,
   })
   hasLinkedAccount: boolean;
diff --git a/prometheus-rules.yml b/prometheus-rules.yml
index 309f6355..d7a2a556 100644
--- a/prometheus-rules.yml
+++ b/prometheus-rules.yml
@@ -177,3 +177,160 @@ groups:
     - record: 'job:http:latency:p99'
       expr: |
         histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
+
+  # ==================================
+  # AI / GPU Resource Monitoring Rules
+  # ==================================
+  - name: lumenpulse_ai_alerts
+    interval: 30s
+    rules:
+      # ── Inference Latency ──────────────────────────────
+
+      - alert: AiHighInferenceLatencyP95
+        expr: |
+          histogram_quantile(0.95, rate(ai_inference_duration_seconds_bucket[5m])) > 5
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference P95 latency'
+          description: '{{ $labels.model }} P95 inference latency > 5s (current: {{ $value | humanizeDuration }})'
+
+      - alert: AiCriticalInferenceLatencyP99
+        expr: |
+          histogram_quantile(0.99, rate(ai_inference_duration_seconds_bucket[5m])) > 15
+        for: 3m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'Critical AI inference P99 latency'
+          description: '{{ $labels.model }} P99 inference latency > 15s (current: {{ $value | humanizeDuration }})'
+
+      # ── Error Rate ─────────────────────────────────────
+
+      - alert: AiHighErrorRate
+        expr: |
+          (rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference error rate'
+          description: 'AI error rate > 5% (current: {{ $value | humanizePercentage }})'
+
+      - alert: AiCriticalErrorRate
+        expr: |
+          (rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])) > 0.15
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'Critical AI inference error rate'
+          description: 'AI error rate > 15% (current: {{ $value | humanizePercentage }})'
+
+      # ── Memory / VRAM Pressure ─────────────────────────
+
+      - alert: AiHighRamUsage
+        expr: ai_system_memory_usage_ratio > 0.85
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI layer RAM usage high'
+          description: 'System RAM usage at {{ $value | humanizePercentage }}'
+
+      - alert: AiCriticalRamUsage
+        expr: ai_system_memory_usage_ratio > 0.95
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'AI layer RAM critically high'
+          description: 'System RAM usage at {{ $value | humanizePercentage }} — throttling likely active'
+
+      - alert: AiHighVramUsage
+        expr: ai_gpu_vram_usage_ratio > 0.85 and ai_gpu_vram_usage_ratio >= 0
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'GPU VRAM usage high'
+          description: 'GPU VRAM usage at {{ $value | humanizePercentage }}'
+
+      - alert: AiCriticalVramUsage
+        expr: ai_gpu_vram_usage_ratio > 0.95 and ai_gpu_vram_usage_ratio >= 0
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'GPU VRAM critically high'
+          description: 'GPU VRAM at {{ $value | humanizePercentage }} — models may OOM'
+
+      # ── Throttling ─────────────────────────────────────
+
+      - alert: AiRequestsThrottled
+        expr: rate(ai_throttled_requests_total[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI requests are being throttled'
+          description: 'Throttle rate: {{ $value | humanize }}/sec — system under resource pressure'
+
+      # ── Concurrency ────────────────────────────────────
+
+      - alert: AiHighConcurrency
+        expr: ai_concurrent_inferences > 8
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference concurrency'
+          description: '{{ $value }} concurrent AI inferences running'
+
+      # ── Model Load Time ────────────────────────────────
+
+      - alert: AiSlowModelLoad
+        expr: |
+          histogram_quantile(0.95, rate(ai_model_load_duration_seconds_bucket[1h])) > 60
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI model load times are slow'
+          description: 'P95 model load time > 60s (current: {{ $value | humanizeDuration }})'
+
+      # ── Recording Rules (pre-computed AI metrics) ──────
+
+      - record: 'job:ai:inference:rate5m'
+        expr: 'rate(ai_inference_requests_total[5m])'
+
+      - record: 'job:ai:errors:rate5m'
+        expr: 'rate(ai_inference_errors_total[5m])'
+
+      - record: 'job:ai:error_rate:ratio'
+        expr: |
+          rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])
+
+      - record: 'job:ai:latency:p95'
+        expr: |
+          histogram_quantile(0.95, rate(ai_inference_duration_seconds_bucket[5m]))
+
+      - record: 'job:ai:latency:p99'
+        expr: |
+          histogram_quantile(0.99, rate(ai_inference_duration_seconds_bucket[5m]))
+
+      - record: 'job:ai:throttle:rate5m'
+        expr: 'rate(ai_throttled_requests_total[5m])'
+
diff --git a/prometheus.yml b/prometheus.yml
index 7486783b..623b209b 100644
--- a/prometheus.yml
+++ b/prometheus.yml
@@ -45,3 +45,19 @@ scrape_configs:
   - job_name: 'prometheus'
     static_configs:
       - targets: ['localhost:9090']
+
+  # LumenPulse AI Metrics
+  - job_name: 'lumenpulse-ai'
+    scrape_interval: 15s
+    scrape_timeout: 10s
+    metrics_path: '/ai/metrics/prometheus'
+
+    static_configs:
+      - targets: ['backend:3000']
+        labels:
+          layer: 'ai'
+
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: instance
+