diff --git a/apps/backend/.env.example b/apps/backend/.env.example index b213cf6d..906dd44c 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -28,3 +28,20 @@ CORS_ORIGIN=http://localhost:3000 # News Provider API (CoinDesk) # documentation is https://developers.coindesk.com/documentation/data-api/news_v1_search COINDESK_API_KEY=your_api_key_here + +# ======================== +# AI Metrics / GPU Monitoring +# ======================== + +# Maximum concurrent AI inference requests before throttling (default: 10) +AI_MAX_CONCURRENT_INFERENCES=10 + +# System RAM usage ratio (0-1) that triggers request throttling (default: 0.90) +AI_RAM_THROTTLE_THRESHOLD=0.90 + +# GPU VRAM usage ratio (0-1) that triggers request throttling (default: 0.90) +AI_VRAM_THROTTLE_THRESHOLD=0.90 + +# Resource sampling interval in milliseconds (default: 15000) +AI_METRICS_SAMPLING_MS=15000 + diff --git a/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts b/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts new file mode 100644 index 00000000..5d88b425 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.controller.spec.ts @@ -0,0 +1,147 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { AiMetricsController } from './ai-metrics.controller'; +import { AiMetricsService, AiHealthReport } from './ai-metrics.service'; + +describe('AiMetricsController', () => { + let controller: AiMetricsController; + let aiMetricsService: Partial; + + const mockReport: AiHealthReport = { + status: 'healthy', + timestamp: '2026-03-26T09:00:00.000Z', + uptime: 12345, + throttling: { + active: false, + reason: null, + currentLoad: 2, + maxConcurrent: 10, + }, + resources: { + totalMemoryBytes: 16e9, + freeMemoryBytes: 8e9, + usedMemoryBytes: 8e9, + memoryUsageRatio: 0.5, + heapUsedBytes: 100e6, + heapTotalBytes: 200e6, + rssBytes: 300e6, + externalBytes: 10e6, + gpuAvailable: false, + vramTotalBytes: null, + vramUsedBytes: null, + vramFreeBytes: null, + 
vramUsageRatio: null, + }, + models: { + totalLoaded: 1, + loadTimes: { 'sentiment-v2': 1200 }, + }, + counters: { + totalInferenceRequests: 42, + totalInferenceErrors: 3, + throttledRequests: 1, + }, + }; + + beforeEach(async () => { + aiMetricsService = { + getHealthReport: jest.fn().mockReturnValue(mockReport), + getPrometheusMetrics: jest + .fn() + .mockResolvedValue('# HELP ai_inference_requests_total\n'), + }; + + const module: TestingModule = await Test.createTestingModule({ + controllers: [AiMetricsController], + providers: [{ provide: AiMetricsService, useValue: aiMetricsService }], + }).compile(); + + controller = module.get(AiMetricsController); + }); + + it('should be defined', () => { + expect(controller).toBeDefined(); + }); + + describe('GET /ai/metrics', () => { + it('should return the health report as JSON', () => { + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status, json } as any; + + controller.getAiMetrics(res); + + expect(status).toHaveBeenCalledWith(200); + expect(json).toHaveBeenCalledWith(mockReport); + }); + + it('should return 500 on error', () => { + (aiMetricsService.getHealthReport as jest.Mock).mockImplementation(() => { + throw new Error('boom'); + }); + + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status, json } as any; + + controller.getAiMetrics(res); + + expect(status).toHaveBeenCalledWith(500); + }); + }); + + describe('GET /ai/metrics/prometheus', () => { + it('should return Prometheus text format', async () => { + const send = jest.fn(); + const set = jest.fn(); + const res = { + set, + send, + status: jest.fn().mockReturnValue({ json: jest.fn() }), + } as any; + + await controller.getPrometheusMetrics(res); + + expect(set).toHaveBeenCalledWith( + 'Content-Type', + 'text/plain; version=0.0.4; charset=utf-8', + ); + expect(send).toHaveBeenCalledWith( + expect.stringContaining('ai_inference_requests_total'), + ); + }); + }); + + 
describe('GET /ai/metrics/health', () => { + it('should return 200 when healthy', () => { + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status } as any; + + controller.getAiHealth(res); + + expect(status).toHaveBeenCalledWith(200); + expect(json).toHaveBeenCalledWith( + expect.objectContaining({ status: 'healthy' }), + ); + }); + + it('should return 503 when critical', () => { + const criticalReport = { + ...mockReport, + status: 'critical' as const, + throttling: { ...mockReport.throttling, active: true }, + }; + (aiMetricsService.getHealthReport as jest.Mock).mockReturnValue( + criticalReport, + ); + + const json = jest.fn(); + const status = jest.fn().mockReturnValue({ json }); + const res = { status } as any; + + controller.getAiHealth(res); + + expect(status).toHaveBeenCalledWith(503); + }); + }); +}); diff --git a/apps/backend/src/ai-metrics/ai-metrics.controller.ts b/apps/backend/src/ai-metrics/ai-metrics.controller.ts new file mode 100644 index 00000000..bb27a434 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.controller.ts @@ -0,0 +1,128 @@ +import { + Controller, + Get, + UseGuards, + Res, + Logger, + HttpStatus, +} from '@nestjs/common'; +import type { Response } from 'express'; +import { AiMetricsService } from './ai-metrics.service'; +import { IpAllowlistGuard } from '../metrics/ip-allowlist.guard'; +import { + ApiTags, + ApiOperation, + ApiResponse, + ApiProduces, +} from '@nestjs/swagger'; + +/** + * Controller that exposes the AI-layer health & performance metrics. 
+ * + * Endpoints: + * GET /ai/metrics — full JSON health report (resource usage, throttling, model stats) + * GET /ai/metrics/prometheus — Prometheus-format text for scraping + * GET /ai/metrics/health — lightweight liveness / readiness check + */ +@ApiTags('ai-metrics') +@Controller('ai/metrics') +@UseGuards(IpAllowlistGuard) +export class AiMetricsController { + private readonly logger = new Logger(AiMetricsController.name); + + constructor(private readonly aiMetricsService: AiMetricsService) {} + + /** + * GET /ai/metrics + * Returns a comprehensive JSON health report including: + * - System status (healthy / degraded / critical) + * - Resource usage (RAM, heap, VRAM) + * - Throttling state & reason + * - Model load times + * - Request & error counters + */ + @Get() + @ApiOperation({ + summary: 'Get AI-layer health & performance metrics', + description: + 'Returns a comprehensive JSON report of the AI subsystem health, ' + + 'including resource utilisation, throttling state, loaded models, and counters.', + }) + @ApiResponse({ + status: 200, + description: 'AI health report in JSON', + }) + @ApiResponse({ + status: 403, + description: 'Forbidden — IP not in allowlist and no valid JWT', + }) + getAiMetrics(@Res() response: Response): void { + try { + const report = this.aiMetricsService.getHealthReport(); + response.status(HttpStatus.OK).json(report); + } catch (error) { + this.logger.error('Error building AI health report:', error); + response + .status(HttpStatus.INTERNAL_SERVER_ERROR) + .json({ error: 'Failed to retrieve AI metrics' }); + } + } + + /** + * GET /ai/metrics/prometheus + * Returns AI-specific metrics in Prometheus text format. 
+ */ + @Get('prometheus') + @ApiOperation({ + summary: 'Get AI metrics in Prometheus format', + description: + 'Returns AI inference, model-load, and resource metrics in Prometheus text format for scraping.', + }) + @ApiProduces('text/plain') + @ApiResponse({ + status: 200, + description: 'Prometheus-format metrics', + }) + async getPrometheusMetrics(@Res() response: Response): Promise { + try { + const metrics = await this.aiMetricsService.getPrometheusMetrics(); + response.set('Content-Type', 'text/plain; version=0.0.4; charset=utf-8'); + response.send(metrics); + } catch (error) { + this.logger.error('Error getting Prometheus AI metrics:', error); + response + .status(HttpStatus.INTERNAL_SERVER_ERROR) + .json({ error: 'Failed to retrieve Prometheus metrics' }); + } + } + + /** + * GET /ai/metrics/health + * Lightweight liveness/readiness check for the AI subsystem. + * Returns 200 when healthy/degraded, 503 when the system should be throttled. + */ + @Get('health') + @ApiOperation({ + summary: 'AI subsystem health check', + description: + 'Returns 200 when the AI layer is operational, 503 when it is under resource pressure and throttling.', + }) + @ApiResponse({ status: 200, description: 'AI layer is healthy or degraded' }) + @ApiResponse({ + status: 503, + description: 'AI layer is in a critical state and throttling requests', + }) + getAiHealth(@Res() response: Response): void { + const report = this.aiMetricsService.getHealthReport(); + const statusCode = + report.status === 'critical' + ? 
HttpStatus.SERVICE_UNAVAILABLE + : HttpStatus.OK; + response.status(statusCode).json({ + status: report.status, + timestamp: report.timestamp, + uptime: report.uptime, + throttling: report.throttling, + }); + } +} diff --git a/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts b/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts new file mode 100644 index 00000000..84066f91 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.interceptor.ts @@ -0,0 +1,71 @@ +import { + Injectable, + NestInterceptor, + ExecutionContext, + CallHandler, + Logger, +} from '@nestjs/common'; +import { Observable } from 'rxjs'; +import { tap } from 'rxjs/operators'; +import type { Request } from 'express'; +import { AiMetricsService } from './ai-metrics.service'; + +/** + * Interceptor that automatically instruments AI-related routes with + * inference latency tracking. + * + * Apply it to controllers or individual routes: + * @UseInterceptors(AiMetricsInterceptor) + * + * The interceptor reads the `x-ai-model` header (or falls back to the + * route path) to identify the model being used, then records timing + * via the AiMetricsService. + */ +@Injectable() +export class AiMetricsInterceptor implements NestInterceptor { + private readonly logger = new Logger(AiMetricsInterceptor.name); + + constructor(private readonly aiMetrics: AiMetricsService) {} + + intercept(context: ExecutionContext, next: CallHandler): Observable { + const request = context.switchToHttp().getRequest(); + const modelName = + (request.headers['x-ai-model'] as string | undefined) || + this.extractModelFromRoute(request.path); + + const tracker = this.aiMetrics.startInference(modelName); + + return next.handle().pipe( + tap({ + next: () => { + tracker.end('success'); + }, + error: (error: unknown) => { + const errorType = + error instanceof Error ? error.constructor.name : 'UnknownError'; + tracker.end('error', errorType); + }, + }), + ); + } + + /** + * Derive a model identifier from the route path. 
+ * e.g. /analyze → "sentiment", /retrain → "retraining" + */ + private extractModelFromRoute(path: string): string { + const cleanPath = (path || '').replace(/^\/+|\/+$/g, '').toLowerCase(); + + if (cleanPath.includes('sentiment') || cleanPath.includes('analyze')) { + return 'sentiment'; + } + if (cleanPath.includes('retrain')) { + return 'retraining'; + } + if (cleanPath.includes('predict') || cleanPath.includes('forecast')) { + return 'forecasting'; + } + + return cleanPath || 'unknown'; + } +} diff --git a/apps/backend/src/ai-metrics/ai-metrics.module.ts b/apps/backend/src/ai-metrics/ai-metrics.module.ts new file mode 100644 index 00000000..4b893180 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.module.ts @@ -0,0 +1,36 @@ +import { Module, Global } from '@nestjs/common'; +import { ConfigModule } from '@nestjs/config'; +import { AiMetricsService } from './ai-metrics.service'; +import { AiMetricsController } from './ai-metrics.controller'; +import { AiThrottleGuard } from './ai-throttle.guard'; +import { AiMetricsInterceptor } from './ai-metrics.interceptor'; + +/** + * AI Metrics Module + * + * Global module that provides GPU/resource monitoring and health dashboarding + * for the AI inference layer. 
+ * + * Includes: + * - AI inference request metrics (count, latency, errors) + * - Model load time tracking + * - System RAM and GPU VRAM monitoring + * - Automatic throttling guard for resource pressure + * - GET /ai/metrics endpoint (JSON health report) + * - GET /ai/metrics/prometheus (Prometheus scraping) + * - GET /ai/metrics/health (liveness check) + * + * Environment Variables: + * - AI_MAX_CONCURRENT_INFERENCES: Max concurrent AI requests (default: 10) + * - AI_RAM_THROTTLE_THRESHOLD: RAM usage ratio to trigger throttle (default: 0.90) + * - AI_VRAM_THROTTLE_THRESHOLD: VRAM usage ratio to trigger throttle (default: 0.90) + * - AI_METRICS_SAMPLING_MS: Resource sampling interval in ms (default: 15000) + */ +@Global() +@Module({ + imports: [ConfigModule], + providers: [AiMetricsService, AiThrottleGuard, AiMetricsInterceptor], + controllers: [AiMetricsController], + exports: [AiMetricsService, AiThrottleGuard, AiMetricsInterceptor], +}) +export class AiMetricsModule {} diff --git a/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts b/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts new file mode 100644 index 00000000..04b3f2bf --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.service.spec.ts @@ -0,0 +1,310 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { ConfigService } from '@nestjs/config'; +import { register } from 'prom-client'; +import { AiMetricsService } from './ai-metrics.service'; + +/** + * Clear the Prometheus registry between tests to avoid + * "duplicate metric" errors when the service is re-instantiated. 
+ */ +function clearPrometheusRegistry() { + register.clear(); +} + +describe('AiMetricsService', () => { + let service: AiMetricsService; + let configService: ConfigService; + + beforeEach(async () => { + clearPrometheusRegistry(); + + const module: TestingModule = await Test.createTestingModule({ + providers: [ + AiMetricsService, + { + provide: ConfigService, + useValue: { + get: jest.fn((key: string, fallback?: string) => { + const env: Record = { + AI_MAX_CONCURRENT_INFERENCES: '3', + AI_RAM_THROTTLE_THRESHOLD: '0.99', + AI_VRAM_THROTTLE_THRESHOLD: '0.99', + AI_METRICS_SAMPLING_MS: '60000', + }; + return env[key] ?? fallback; + }), + }, + }, + ], + }).compile(); + + service = module.get(AiMetricsService); + configService = module.get(ConfigService); + }); + + afterEach(() => { + // Stop the periodic sampler + service.onModuleDestroy(); + clearPrometheusRegistry(); + }); + + // ── construction ────────────────────────────────────────────── + + it('should be defined', () => { + expect(service).toBeDefined(); + }); + + it('should read configuration values from ConfigService', () => { + expect(configService.get).toHaveBeenCalledWith( + 'AI_MAX_CONCURRENT_INFERENCES', + '10', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_RAM_THROTTLE_THRESHOLD', + '0.90', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_VRAM_THROTTLE_THRESHOLD', + '0.90', + ); + expect(configService.get).toHaveBeenCalledWith( + 'AI_METRICS_SAMPLING_MS', + '15000', + ); + }); + + // ── model load tracking ─────────────────────────────────────── + + describe('recordModelLoad / recordModelUnload', () => { + it('should track model load times', () => { + service.recordModelLoad('sentiment-v2', 1200); + service.recordModelLoad('forecast-v1', 3500); + + const report = service.getHealthReport(); + expect(report.models.totalLoaded).toBe(2); + expect(report.models.loadTimes['sentiment-v2']).toBe(1200); + expect(report.models.loadTimes['forecast-v1']).toBe(3500); + }); + + 
it('should decrement loaded model count on unload', () => { + service.recordModelLoad('sentiment-v2', 1200); + service.recordModelLoad('forecast-v1', 3500); + service.recordModelUnload('sentiment-v2'); + + const report = service.getHealthReport(); + expect(report.models.totalLoaded).toBe(1); + expect(report.models.loadTimes['sentiment-v2']).toBeUndefined(); + expect(report.models.loadTimes['forecast-v1']).toBe(3500); + }); + }); + + // ── inference tracking ──────────────────────────────────────── + + describe('startInference', () => { + it('should increment and decrement concurrent inferences', () => { + const tracker = service.startInference('sentiment'); + + let report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(1); + + tracker.end('success'); + + report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(0); + }); + + it('should count total inference requests', () => { + const t1 = service.startInference('sentiment'); + const t2 = service.startInference('forecast'); + t1.end('success'); + t2.end('success'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceRequests).toBe(2); + expect(report.counters.totalInferenceErrors).toBe(0); + }); + + it('should count errors and error types', () => { + const t1 = service.startInference('sentiment'); + t1.end('error', 'TimeoutError'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceErrors).toBe(1); + expect(report.counters.totalInferenceRequests).toBe(1); + }); + + it('should handle default error type', () => { + const t1 = service.startInference('sentiment'); + t1.end('error'); + + const report = service.getHealthReport(); + expect(report.counters.totalInferenceErrors).toBe(1); + }); + + it('should never go below zero concurrent inferences', () => { + const t1 = service.startInference('sentiment'); + t1.end('success'); + // Double-ending should not crash or go negative + t1.end('success'); + + const 
report = service.getHealthReport(); + expect(report.throttling.currentLoad).toBe(0); + }); + }); + + // ── throttling logic ────────────────────────────────────────── + + describe('shouldThrottle', () => { + it('should throttle when max concurrent inferences reached', () => { + // config has maxConcurrent = 3 + service.startInference('m1'); + service.startInference('m2'); + service.startInference('m3'); + + const result = service.shouldThrottle(); + expect(result.throttle).toBe(true); + expect(result.reason).toContain('Concurrency limit reached'); + }); + + it('should not throttle when under limits', () => { + const t = service.startInference('m1'); + const result = service.shouldThrottle(); + expect(result.throttle).toBe(false); + expect(result.reason).toBeNull(); + t.end('success'); + }); + }); + + describe('recordThrottledRequest', () => { + it('should increment throttled request counter', () => { + service.recordThrottledRequest(); + service.recordThrottledRequest(); + + const report = service.getHealthReport(); + expect(report.counters.throttledRequests).toBe(2); + }); + }); + + // ── resource snapshot ───────────────────────────────────────── + + describe('getResourceSnapshot', () => { + it('should return valid memory information', () => { + const snapshot = service.getResourceSnapshot(); + + expect(snapshot.totalMemoryBytes).toBeGreaterThan(0); + expect(snapshot.freeMemoryBytes).toBeGreaterThanOrEqual(0); + expect(snapshot.usedMemoryBytes).toBeGreaterThan(0); + expect(snapshot.memoryUsageRatio).toBeGreaterThanOrEqual(0); + expect(snapshot.memoryUsageRatio).toBeLessThanOrEqual(1); + + expect(snapshot.heapUsedBytes).toBeGreaterThan(0); + expect(snapshot.heapTotalBytes).toBeGreaterThan(0); + expect(snapshot.rssBytes).toBeGreaterThan(0); + expect(snapshot.externalBytes).toBeGreaterThanOrEqual(0); + + // GPU is unlikely to be available in CI, so just check the field exists + expect(typeof snapshot.gpuAvailable).toBe('boolean'); + }); + }); + + // ── health report 
───────────────────────────────────────────── + + describe('getHealthReport', () => { + it('should return a well-formed report', () => { + const report = service.getHealthReport(); + + expect(report.status).toMatch(/^(healthy|degraded|critical)$/); + expect(report.timestamp).toBeDefined(); + expect(report.uptime).toBeGreaterThanOrEqual(0); + + expect(report.throttling).toEqual( + expect.objectContaining({ + active: expect.any(Boolean), + currentLoad: expect.any(Number), + maxConcurrent: 3, + }), + ); + + expect(report.resources).toEqual( + expect.objectContaining({ + totalMemoryBytes: expect.any(Number), + freeMemoryBytes: expect.any(Number), + usedMemoryBytes: expect.any(Number), + memoryUsageRatio: expect.any(Number), + }), + ); + + expect(report.models).toEqual({ + totalLoaded: 0, + loadTimes: {}, + }); + + expect(report.counters).toEqual({ + totalInferenceRequests: 0, + totalInferenceErrors: 0, + throttledRequests: 0, + }); + }); + + it('should report critical status when throttling is active', () => { + // Fill concurrency to trigger throttle + service.startInference('m1'); + service.startInference('m2'); + service.startInference('m3'); + + const report = service.getHealthReport(); + expect(report.status).toBe('critical'); + expect(report.throttling.active).toBe(true); + }); + }); + + // ── Prometheus output ───────────────────────────────────────── + + describe('getPrometheusMetrics', () => { + it('should return a non-empty Prometheus text payload', async () => { + const output = await service.getPrometheusMetrics(); + expect(typeof output).toBe('string'); + expect(output.length).toBeGreaterThan(0); + + // Should contain our custom metrics + expect(output).toContain('ai_inference_requests_total'); + expect(output).toContain('ai_inference_duration_seconds'); + expect(output).toContain('ai_model_load_duration_seconds'); + expect(output).toContain('ai_system_memory_usage_ratio'); + expect(output).toContain('ai_concurrent_inferences'); + 
expect(output).toContain('ai_throttled_requests_total'); + }); + + it('should include recorded model load metrics', async () => { + service.recordModelLoad('test-model', 500); + + const output = await service.getPrometheusMetrics(); + expect(output).toContain('ai_models_loaded_count'); + }); + + it('should include inference latency after a request', async () => { + const tracker = service.startInference('test-model'); + tracker.end('success'); + + const output = await service.getPrometheusMetrics(); + expect(output).toContain('ai_inference_duration_seconds'); + expect(output).toContain('test-model'); + }); + }); + + // ── lifecycle ───────────────────────────────────────────────── + + describe('onModuleInit', () => { + it('should not throw', () => { + expect(() => service.onModuleInit()).not.toThrow(); + }); + }); + + describe('onModuleDestroy', () => { + it('should stop the sampler without errors', () => { + expect(() => service.onModuleDestroy()).not.toThrow(); + // calling it twice should still be safe + expect(() => service.onModuleDestroy()).not.toThrow(); + }); + }); +}); diff --git a/apps/backend/src/ai-metrics/ai-metrics.service.ts b/apps/backend/src/ai-metrics/ai-metrics.service.ts new file mode 100644 index 00000000..5fd18fb7 --- /dev/null +++ b/apps/backend/src/ai-metrics/ai-metrics.service.ts @@ -0,0 +1,557 @@ +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { Counter, Histogram, Gauge, Summary, register } from 'prom-client'; +import * as os from 'os'; +import { execSync } from 'child_process'; + +/** + * Snapshot of current system resource utilisation. 
+ */ +export interface ResourceSnapshot { + /** Total system RAM in bytes */ + totalMemoryBytes: number; + /** Free system RAM in bytes */ + freeMemoryBytes: number; + /** Used system RAM in bytes */ + usedMemoryBytes: number; + /** RAM utilisation ratio (0-1) */ + memoryUsageRatio: number; + /** Node.js heap used bytes */ + heapUsedBytes: number; + /** Node.js heap total bytes */ + heapTotalBytes: number; + /** Node.js RSS (resident set size) bytes */ + rssBytes: number; + /** Node.js external memory bytes */ + externalBytes: number; + /** Whether VRAM info is available (GPU detected) */ + gpuAvailable: boolean; + /** Total VRAM bytes (if GPU detected) */ + vramTotalBytes: number | null; + /** Used VRAM bytes (if GPU detected) */ + vramUsedBytes: number | null; + /** Free VRAM bytes (if GPU detected) */ + vramFreeBytes: number | null; + /** VRAM utilisation ratio 0-1 (if GPU detected) */ + vramUsageRatio: number | null; +} + +/** + * Full AI health report returned by GET /ai/metrics + */ +export interface AiHealthReport { + status: 'healthy' | 'degraded' | 'critical'; + timestamp: string; + uptime: number; + throttling: { + active: boolean; + reason: string | null; + currentLoad: number; + maxConcurrent: number; + }; + resources: ResourceSnapshot; + models: { + totalLoaded: number; + loadTimes: Record; + }; + counters: { + totalInferenceRequests: number; + totalInferenceErrors: number; + throttledRequests: number; + }; +} + +/** + * Service for collecting AI-layer performance metrics. 
+ * + * Responsibilities: + * - Track model load times & counts + * - Track inference latency per model / per request + * - Monitor system RAM and (when available) GPU VRAM + * - Expose Prometheus-compatible gauges, counters, histograms + * - Provide a health check that can be used to throttle requests + */ +@Injectable() +export class AiMetricsService implements OnModuleInit { + private readonly logger = new Logger(AiMetricsService.name); + + // ── Prometheus primitives ──────────────────────────────────────── + + /** Total AI inference requests */ + readonly aiRequestCounter: Counter; + + /** Total AI inference errors */ + readonly aiErrorCounter: Counter; + + /** Histogram of inference latency (seconds) per model */ + readonly aiInferenceLatency: Histogram; + + /** Histogram of model load / warm-up time (seconds) */ + readonly aiModelLoadTime: Histogram; + + /** Summary for quick percentile view of inference latency */ + readonly aiInferenceLatencySummary: Summary; + + /** Number of models currently loaded */ + readonly aiModelsLoaded: Gauge; + + /** System RAM usage ratio gauge */ + readonly systemMemoryUsageRatio: Gauge; + + /** System RAM used bytes gauge */ + readonly systemMemoryUsedBytes: Gauge; + + /** Node.js heap used bytes gauge */ + readonly nodeHeapUsedBytes: Gauge; + + /** Node.js RSS bytes gauge */ + readonly nodeRssBytes: Gauge; + + /** GPU VRAM usage ratio gauge (if available) */ + readonly gpuVramUsageRatio: Gauge; + + /** GPU VRAM used bytes gauge (if available) */ + readonly gpuVramUsedBytes: Gauge; + + /** Count of requests throttled due to resource pressure */ + readonly throttledRequestCounter: Counter; + + /** Current concurrent AI inference count */ + readonly aiConcurrentInferences: Gauge; + + // ── Internal state ─────────────────────────────────────────────── + + /** Map of model name → load duration in ms */ + private readonly modelLoadTimes = new Map(); + + /** Current concurrent AI inference count */ + private concurrentInferences = 
0; + + /** Maximum concurrent inferences before throttling */ + private readonly maxConcurrentInferences: number; + + /** RAM usage ratio threshold that triggers throttling */ + private readonly ramThrottleThreshold: number; + + /** VRAM usage ratio threshold that triggers throttling */ + private readonly vramThrottleThreshold: number; + + /** Interval handle for periodic resource sampling */ + private resourceSamplerInterval: ReturnType | null = null; + + /** Resource sampling period in ms */ + private readonly samplingIntervalMs: number; + + /** Most recent GPU probe result (cached to avoid shelling out too often) */ + private cachedGpuInfo: { + available: boolean; + totalBytes: number | null; + usedBytes: number | null; + freeBytes: number | null; + usageRatio: number | null; + } = { + available: false, + totalBytes: null, + usedBytes: null, + freeBytes: null, + usageRatio: null, + }; + + /** Total inference request count (fast in-memory mirror) */ + private totalInferenceRequests = 0; + /** Total inference error count */ + private totalInferenceErrors = 0; + /** Total throttled requests */ + private totalThrottledRequests = 0; + + // ──────────────────────────────────────────────────────────────── + + constructor(private readonly config: ConfigService) { + // Read tunables from env (with sensible defaults) + this.maxConcurrentInferences = Number( + this.config.get('AI_MAX_CONCURRENT_INFERENCES', '10'), + ); + this.ramThrottleThreshold = Number( + this.config.get('AI_RAM_THROTTLE_THRESHOLD', '0.90'), + ); + this.vramThrottleThreshold = Number( + this.config.get('AI_VRAM_THROTTLE_THRESHOLD', '0.90'), + ); + this.samplingIntervalMs = Number( + this.config.get('AI_METRICS_SAMPLING_MS', '15000'), + ); + + // ── Register Prometheus metrics ───────────────────────────── + + this.aiRequestCounter = new Counter({ + name: 'ai_inference_requests_total', + help: 'Total number of AI inference requests', + labelNames: ['model', 'status'], + }); + + this.aiErrorCounter = new 
Counter({ + name: 'ai_inference_errors_total', + help: 'Total number of AI inference errors', + labelNames: ['model', 'error_type'], + }); + + this.aiInferenceLatency = new Histogram({ + name: 'ai_inference_duration_seconds', + help: 'AI inference latency in seconds', + labelNames: ['model'], + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30], + }); + + this.aiModelLoadTime = new Histogram({ + name: 'ai_model_load_duration_seconds', + help: 'Time taken to load / warm-up an AI model (seconds)', + labelNames: ['model'], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120], + }); + + this.aiInferenceLatencySummary = new Summary({ + name: 'ai_inference_latency_summary', + help: 'Summary of AI inference latency with percentiles', + labelNames: ['model'], + percentiles: [0.5, 0.9, 0.95, 0.99], + maxAgeSeconds: 600, + ageBuckets: 5, + }); + + this.aiModelsLoaded = new Gauge({ + name: 'ai_models_loaded_count', + help: 'Number of AI models currently loaded in memory', + }); + + this.systemMemoryUsageRatio = new Gauge({ + name: 'ai_system_memory_usage_ratio', + help: 'System RAM usage ratio (0-1)', + }); + + this.systemMemoryUsedBytes = new Gauge({ + name: 'ai_system_memory_used_bytes', + help: 'System RAM used (bytes)', + }); + + this.nodeHeapUsedBytes = new Gauge({ + name: 'ai_node_heap_used_bytes', + help: 'Node.js V8 heap used (bytes)', + }); + + this.nodeRssBytes = new Gauge({ + name: 'ai_node_rss_bytes', + help: 'Node.js RSS resident set size (bytes)', + }); + + this.gpuVramUsageRatio = new Gauge({ + name: 'ai_gpu_vram_usage_ratio', + help: 'GPU VRAM usage ratio (0-1). -1 when not available.', + }); + + this.gpuVramUsedBytes = new Gauge({ + name: 'ai_gpu_vram_used_bytes', + help: 'GPU VRAM used (bytes). 
-1 when not available.', + }); + + this.throttledRequestCounter = new Counter({ + name: 'ai_throttled_requests_total', + help: 'Number of AI requests rejected/throttled due to resource pressure', + }); + + this.aiConcurrentInferences = new Gauge({ + name: 'ai_concurrent_inferences', + help: 'Number of AI inferences currently running', + }); + + this.logger.log( + `AI metrics service constructed — maxConcurrent=${this.maxConcurrentInferences}, ` + + `ramThreshold=${this.ramThrottleThreshold}, vramThreshold=${this.vramThrottleThreshold}`, + ); + } + + // ── Lifecycle ──────────────────────────────────────────────────── + + onModuleInit(): void { + // Take an initial resource reading + this.sampleResources(); + + // Start periodic sampling + this.resourceSamplerInterval = setInterval( + () => this.sampleResources(), + this.samplingIntervalMs, + ); + + this.logger.log( + `AI metrics resource sampler started (interval=${this.samplingIntervalMs}ms)`, + ); + } + + onModuleDestroy(): void { + if (this.resourceSamplerInterval) { + clearInterval(this.resourceSamplerInterval); + this.resourceSamplerInterval = null; + } + } + + // ── Public API ─────────────────────────────────────────────────── + + /** + * Record a model being loaded / warmed-up. + * @param modelName logical model identifier + * @param durationMs time to load in milliseconds + */ + recordModelLoad(modelName: string, durationMs: number): void { + this.modelLoadTimes.set(modelName, durationMs); + this.aiModelLoadTime.labels(modelName).observe(durationMs / 1000); + this.aiModelsLoaded.set(this.modelLoadTimes.size); + this.logger.log( + `Model "${modelName}" loaded in ${durationMs.toFixed(1)}ms`, + ); + } + + /** + * Record a model being unloaded from memory. + */ + recordModelUnload(modelName: string): void { + this.modelLoadTimes.delete(modelName); + this.aiModelsLoaded.set(this.modelLoadTimes.size); + this.logger.log(`Model "${modelName}" unloaded`); + } + + /** + * Start an inference timing context. 
+   * Returns an `end` callback and increments the concurrent counter.
+   */
+  startInference(modelName: string): {
+    end: (status: 'success' | 'error', errorType?: string) => void;
+  } {
+    this.concurrentInferences++;
+    this.aiConcurrentInferences.set(this.concurrentInferences);
+    this.totalInferenceRequests++;
+
+    const startMs = Date.now();
+
+    return {
+      end: (status: 'success' | 'error', errorType?: string) => {
+        const durationMs = Date.now() - startMs;
+        const durationSec = durationMs / 1000;
+
+        this.concurrentInferences = Math.max(0, this.concurrentInferences - 1);
+        this.aiConcurrentInferences.set(this.concurrentInferences);
+
+        this.aiRequestCounter.labels(modelName, status).inc();
+        this.aiInferenceLatency.labels(modelName).observe(durationSec);
+        this.aiInferenceLatencySummary.labels(modelName).observe(durationSec);
+
+        if (status === 'error') {
+          this.totalInferenceErrors++;
+          this.aiErrorCounter.labels(modelName, errorType ?? 'unknown').inc();
+        }
+
+        this.logger.debug(
+          `Inference [${modelName}] completed in ${durationMs}ms — status=${status}`,
+        );
+      },
+    };
+  }
+
+  /**
+   * Evaluate whether the system should throttle new AI requests.
+   * Returns `{ throttle: boolean; reason?: string }`.
+   */
+  shouldThrottle(): { throttle: boolean; reason: string | null } {
+    // 1. Concurrency limit
+    if (this.concurrentInferences >= this.maxConcurrentInferences) {
+      return {
+        throttle: true,
+        reason: `Concurrency limit reached (${this.concurrentInferences}/${this.maxConcurrentInferences})`,
+      };
+    }
+
+    // 2. System RAM (read fresh, not from the cached sampler)
+    const totalMem = os.totalmem();
+    const freeMem = os.freemem();
+    const usedRatio = (totalMem - freeMem) / totalMem;
+    if (usedRatio >= this.ramThrottleThreshold) {
+      return {
+        throttle: true,
+        reason: `System RAM usage at ${(usedRatio * 100).toFixed(1)}% (threshold ${(this.ramThrottleThreshold * 100).toFixed(0)}%)`,
+      };
+    }
+
+    // 3. VRAM (if available; uses the value cached by probeGpu())
+    if (
+      this.cachedGpuInfo.available &&
+      this.cachedGpuInfo.usageRatio !== null &&
+      this.cachedGpuInfo.usageRatio >= this.vramThrottleThreshold
+    ) {
+      return {
+        throttle: true,
+        reason: `GPU VRAM usage at ${(this.cachedGpuInfo.usageRatio * 100).toFixed(1)}% (threshold ${(this.vramThrottleThreshold * 100).toFixed(0)}%)`,
+      };
+    }
+
+    return { throttle: false, reason: null };
+  }
+
+  /**
+   * Increment the throttled-requests counter.
+   */
+  recordThrottledRequest(): void {
+    this.totalThrottledRequests++;
+    this.throttledRequestCounter.inc();
+  }
+
+  /**
+   * Build a ResourceSnapshot from current system state.
+   */
+  getResourceSnapshot(): ResourceSnapshot {
+    const totalMem = os.totalmem();
+    const freeMem = os.freemem();
+    const usedMem = totalMem - freeMem;
+    const memUsage = process.memoryUsage();
+
+    return {
+      totalMemoryBytes: totalMem,
+      freeMemoryBytes: freeMem,
+      usedMemoryBytes: usedMem,
+      memoryUsageRatio: usedMem / totalMem,
+      heapUsedBytes: memUsage.heapUsed,
+      heapTotalBytes: memUsage.heapTotal,
+      rssBytes: memUsage.rss,
+      externalBytes: memUsage.external,
+      gpuAvailable: this.cachedGpuInfo.available,
+      vramTotalBytes: this.cachedGpuInfo.totalBytes,
+      vramUsedBytes: this.cachedGpuInfo.usedBytes,
+      vramFreeBytes: this.cachedGpuInfo.freeBytes,
+      vramUsageRatio: this.cachedGpuInfo.usageRatio,
+    };
+  }
+
+  /**
+   * Build the full health report object.
+   */
+  getHealthReport(): AiHealthReport {
+    const resources = this.getResourceSnapshot();
+    const throttleCheck = this.shouldThrottle();
+
+    let status: AiHealthReport['status'] = 'healthy';
+    if (throttleCheck.throttle) {
+      status = 'critical';
+    } else if (resources.memoryUsageRatio > 0.75) {
+      status = 'degraded';
+    }
+
+    return {
+      status,
+      timestamp: new Date().toISOString(),
+      uptime: process.uptime(),
+      throttling: {
+        active: throttleCheck.throttle,
+        reason: throttleCheck.reason,
+        currentLoad: this.concurrentInferences,
+        maxConcurrent: this.maxConcurrentInferences,
+      },
+      resources,
+      models: {
+        totalLoaded: this.modelLoadTimes.size,
+        loadTimes: Object.fromEntries(this.modelLoadTimes),
+      },
+      counters: {
+        totalInferenceRequests: this.totalInferenceRequests,
+        totalInferenceErrors: this.totalInferenceErrors,
+        throttledRequests: this.totalThrottledRequests,
+      },
+    };
+  }
+
+  /**
+   * Return all registered AI metrics in Prometheus text format.
+   */
+  async getPrometheusMetrics(): Promise<string> {
+    return register.metrics();
+  }
+
+  // ── Private helpers ──────────────────────────────────────────────
+
+  /**
+   * Sample system resources and update Prometheus gauges.
+   * Called on a timer.
+   */
+  private sampleResources(): void {
+    try {
+      const totalMem = os.totalmem();
+      const freeMem = os.freemem();
+      const usedMem = totalMem - freeMem;
+      const usageRatio = usedMem / totalMem;
+
+      this.systemMemoryUsageRatio.set(usageRatio);
+      this.systemMemoryUsedBytes.set(usedMem);
+
+      const memUsage = process.memoryUsage();
+      this.nodeHeapUsedBytes.set(memUsage.heapUsed);
+      this.nodeRssBytes.set(memUsage.rss);
+
+      // Attempt to probe GPU (nvidia-smi). Result is cached.
+      this.probeGpu();
+    } catch (err) {
+      this.logger.warn(
+        `Resource sampling error: ${err instanceof Error ? err.message : String(err)}`,
+      );
+    }
+  }
+
+  /**
+   * Attempt to read GPU VRAM via nvidia-smi.
+   * If nvidia-smi is not available the GPU is marked as unavailable
+   * and we stop retrying until next sample cycle. NOTE(review): only the
+   * first GPU's line is effectively parsed — confirm single-GPU hosts.
+   */
+  private probeGpu(): void {
+    try {
+      const output = execSync(
+        'nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv,noheader,nounits',
+        { timeout: 5000, encoding: 'utf-8' },
+      );
+
+      const parts = output
+        .trim()
+        .split(',')
+        .map((s: string) => s.trim());
+      if (parts.length >= 3) {
+        const totalMiB = parseFloat(parts[0]);
+        const usedMiB = parseFloat(parts[1]);
+        const freeMiB = parseFloat(parts[2]);
+
+        const totalBytes = totalMiB * 1024 * 1024;
+        const usedBytes = usedMiB * 1024 * 1024;
+        const freeBytes = freeMiB * 1024 * 1024;
+        const usageRatio = totalBytes > 0 ? usedBytes / totalBytes : 0;
+
+        this.cachedGpuInfo = {
+          available: true,
+          totalBytes,
+          usedBytes,
+          freeBytes,
+          usageRatio,
+        };
+
+        this.gpuVramUsageRatio.set(usageRatio);
+        this.gpuVramUsedBytes.set(usedBytes);
+      }
+    } catch {
+      // nvidia-smi not available — mark GPU as absent
+      if (this.cachedGpuInfo.available) {
+        this.logger.debug('GPU not detected (nvidia-smi unavailable)');
+      }
+      this.cachedGpuInfo = {
+        available: false,
+        totalBytes: null,
+        usedBytes: null,
+        freeBytes: null,
+        usageRatio: null,
+      };
+      this.gpuVramUsageRatio.set(-1);
+      this.gpuVramUsedBytes.set(-1);
+    }
+  }
+}
diff --git a/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts b/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts
new file mode 100644
index 00000000..8bb92bad
--- /dev/null
+++ b/apps/backend/src/ai-metrics/ai-throttle.guard.spec.ts
@@ -0,0 +1,76 @@
+import { ExecutionContext, HttpException, HttpStatus } from '@nestjs/common';
+import { AiThrottleGuard } from './ai-throttle.guard';
+import { AiMetricsService } from './ai-metrics.service';
+
+describe('AiThrottleGuard', () => {
+  let guard: AiThrottleGuard;
+  let aiMetrics: Partial<AiMetricsService>;
+
+  const mockExecutionContext = (): ExecutionContext => {
+    const setHeader = jest.fn();
+    return {
+      switchToHttp: () => ({
+        getRequest: () => ({}),
+        getResponse: () => ({ setHeader }),
+      }),
+    } as unknown as ExecutionContext;
+  };
+
+  beforeEach(() => {
+    aiMetrics = {
+      shouldThrottle: jest.fn(),
+      recordThrottledRequest: jest.fn(),
+    };
+    guard = new AiThrottleGuard(aiMetrics as AiMetricsService);
+  });
+
+  it('should allow request when not throttling', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: false,
+      reason: null,
+    });
+
+    expect(guard.canActivate(mockExecutionContext())).toBe(true);
+    expect(aiMetrics.recordThrottledRequest).not.toHaveBeenCalled();
+  });
+
+  it('should throw 503 when throttling is active', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: true,
+      reason: 'Concurrency limit reached (10/10)',
+    });
+
+    expect(() => guard.canActivate(mockExecutionContext())).toThrow(
+      HttpException,
+    );
+
+    try {
+      guard.canActivate(mockExecutionContext());
+    } catch (e) {
+      expect(e).toBeInstanceOf(HttpException);
+      expect((e as HttpException).getStatus()).toBe(
+        HttpStatus.SERVICE_UNAVAILABLE,
+      );
+    }
+
+    expect(aiMetrics.recordThrottledRequest).toHaveBeenCalled();
+  });
+
+  it('should set Retry-After header when throttling', () => {
+    (aiMetrics.shouldThrottle as jest.Mock).mockReturnValue({
+      throttle: true,
+      reason: 'High RAM usage',
+    });
+
+    const ctx = mockExecutionContext();
+    const setHeader = ctx.switchToHttp().getResponse().setHeader;
+
+    try {
+      guard.canActivate(ctx);
+    } catch {
+      // expected
+    }
+
+    expect(setHeader).toHaveBeenCalledWith('Retry-After', '30');
+  });
+});
diff --git a/apps/backend/src/ai-metrics/ai-throttle.guard.ts b/apps/backend/src/ai-metrics/ai-throttle.guard.ts
new file mode 100644
index 00000000..6fca1eca
--- /dev/null
+++ b/apps/backend/src/ai-metrics/ai-throttle.guard.ts
@@ -0,0 +1,51 @@
+import {
+  Injectable,
+  CanActivate,
+  ExecutionContext,
+  HttpException,
+  HttpStatus,
+  Logger,
+} from '@nestjs/common';
+import type { Response } from 'express';
+import {
AiMetricsService } from './ai-metrics.service';
+
+/**
+ * Guard that checks system resource pressure before allowing
+ * AI inference requests to proceed.
+ *
+ * When memory (RAM or VRAM) exceeds the configured threshold or
+ * the concurrency limit is reached, the guard rejects the request
+ * with 503 Service Unavailable and a Retry-After header.
+ *
+ * Apply this guard to any controller or route that triggers AI inference:
+ *   @UseGuards(AiThrottleGuard)
+ */
+@Injectable()
+export class AiThrottleGuard implements CanActivate {
+  private readonly logger = new Logger(AiThrottleGuard.name);
+
+  constructor(private readonly aiMetrics: AiMetricsService) {}
+
+  canActivate(context: ExecutionContext): boolean {
+    const { throttle, reason } = this.aiMetrics.shouldThrottle();
+
+    if (throttle) {
+      this.aiMetrics.recordThrottledRequest();
+      this.logger.warn(`AI request throttled — ${reason}`);
+
+      // Typed response so setHeader is checked against the express Response
+      const response = context.switchToHttp().getResponse<Response>();
+      response.setHeader('Retry-After', '30');
+
+      throw new HttpException(
+        {
+          statusCode: HttpStatus.SERVICE_UNAVAILABLE,
+          message: 'AI service is under resource pressure. Please retry later.',
+          reason,
+        },
+        HttpStatus.SERVICE_UNAVAILABLE,
+      );
+    }
+
+    return true;
+  }
+}
diff --git a/apps/backend/src/ai-metrics/index.ts b/apps/backend/src/ai-metrics/index.ts
new file mode 100644
index 00000000..047f08b6
--- /dev/null
+++ b/apps/backend/src/ai-metrics/index.ts
@@ -0,0 +1,5 @@
+export { AiMetricsModule } from './ai-metrics.module';
+export { AiMetricsService } from './ai-metrics.service';
+export type { ResourceSnapshot, AiHealthReport } from './ai-metrics.service';
+export { AiThrottleGuard } from './ai-throttle.guard';
+export { AiMetricsInterceptor } from './ai-metrics.interceptor';
diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts
index aa992021..800e966a 100644
--- a/apps/backend/src/app.module.ts
+++ b/apps/backend/src/app.module.ts
@@ -19,6 +19,7 @@ import { LoggerMiddleware } from './common/middleware/logger.middleware';
 import { TestController } from './test/test.controller';
 import { SnapshotsModule } from './snapshot/snapshot.module';
 import { ModelRetrainingModule } from './model-retraining/model-retraining.module';
+import { AiMetricsModule } from './ai-metrics/ai-metrics.module';
 import { DataSource, DataSourceOptions } from 'typeorm';
 import stellarConfig from './stellar/config/stellar.config';
@@ -71,6 +72,7 @@ const appLogger = new Logger('TypeORM');
     PortfolioModule,
     SnapshotsModule,
     ModelRetrainingModule,
+    AiMetricsModule,
   ],
   controllers: [AppController, TestController, TestExceptionController],
   providers: [
diff --git a/apps/backend/src/model-retraining/model-retraining.controller.ts b/apps/backend/src/model-retraining/model-retraining.controller.ts
index 8d100881..381525a4 100644
--- a/apps/backend/src/model-retraining/model-retraining.controller.ts
+++ b/apps/backend/src/model-retraining/model-retraining.controller.ts
@@ -11,7 +11,11 @@ import { JwtAuthGuard } from '../auth/jwt-auth.guard';
 import { RolesGuard } from '../auth/roles.guard';
 import { Roles } from
'../auth/decorators/auth.decorators';
 import { UserRole } from '../users/entities/user.entity';
-import { ModelRetrainingService, RetrainResult, ModelStatusResult } from './model-retraining.service';
+import {
+  ModelRetrainingService,
+  RetrainResult,
+  ModelStatusResult,
+} from './model-retraining.service';
 
 class TriggerRetrainDto {
   force?: boolean;
diff --git a/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts b/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
index 50112c7b..8e3a3a75 100644
--- a/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
+++ b/apps/backend/src/portfolio/dto/portfolio-snapshot.dto.ts
@@ -53,7 +53,10 @@ export class PortfolioSummaryResponseDto {
   })
   totalValueUsd: string;
 
-  @ApiProperty({ description: 'Individual asset balances', type: [AssetBalanceDto] })
+  @ApiProperty({
+    description: 'Individual asset balances',
+    type: [AssetBalanceDto],
+  })
   assets: AssetBalanceDto[];
 
   @ApiProperty({
@@ -64,7 +67,8 @@ export class PortfolioSummaryResponseDto {
   lastUpdated: Date | null;
 
   @ApiProperty({
-    description: 'Indicates whether the user has a linked Stellar account with snapshots',
+    description:
+      'Indicates whether the user has a linked Stellar account with snapshots',
     example: true,
   })
   hasLinkedAccount: boolean;
diff --git a/prometheus-rules.yml b/prometheus-rules.yml
index 309f6355..d7a2a556 100644
--- a/prometheus-rules.yml
+++ b/prometheus-rules.yml
@@ -177,3 +177,160 @@ groups:
     - record: 'job:http:latency:p99'
       expr: |
         histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
+
+  # ==================================
+  # AI / GPU Resource Monitoring Rules
+  # ==================================
+  - name: lumenpulse_ai_alerts
+    interval: 30s
+    rules:
+      # ── Inference Latency ──────────────────────────────
+
+      - alert: AiHighInferenceLatencyP95
+        expr: |
+          histogram_quantile(0.95, rate(ai_inference_duration_seconds_bucket[5m])) > 5
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference P95 latency'
+          description: '{{ $labels.model }} P95 inference latency > 5s (current: {{ $value | humanizeDuration }})'
+
+      - alert: AiCriticalInferenceLatencyP99
+        expr: |
+          histogram_quantile(0.99, rate(ai_inference_duration_seconds_bucket[5m])) > 15
+        for: 3m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'Critical AI inference P99 latency'
+          description: '{{ $labels.model }} P99 inference latency > 15s (current: {{ $value | humanizeDuration }})'
+
+      # ── Error Rate ─────────────────────────────────────
+
+      - alert: AiHighErrorRate
+        expr: |
+          (rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference error rate'
+          description: 'AI error rate > 5% (current: {{ $value | humanizePercentage }})'
+
+      - alert: AiCriticalErrorRate
+        expr: |
+          (rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])) > 0.15
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'Critical AI inference error rate'
+          description: 'AI error rate > 15% (current: {{ $value | humanizePercentage }})'
+
+      # ── Memory / VRAM Pressure ─────────────────────────
+
+      - alert: AiHighRamUsage
+        expr: ai_system_memory_usage_ratio > 0.85
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI layer RAM usage high'
+          description: 'System RAM usage at {{ $value | humanizePercentage }}'
+
+      - alert: AiCriticalRamUsage
+        expr: ai_system_memory_usage_ratio > 0.95
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'AI layer RAM critically high'
+          description: 'System RAM usage at {{ $value | humanizePercentage }} — throttling likely active'
+
+      - alert: AiHighVramUsage
+        expr: ai_gpu_vram_usage_ratio > 0.85 and ai_gpu_vram_usage_ratio >= 0
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'GPU VRAM usage high'
+          description: 'GPU VRAM usage at {{ $value | humanizePercentage }}'
+
+      - alert: AiCriticalVramUsage
+        expr: ai_gpu_vram_usage_ratio > 0.95 and ai_gpu_vram_usage_ratio >= 0
+        for: 2m
+        labels:
+          severity: critical
+          service: ai
+        annotations:
+          summary: 'GPU VRAM critically high'
+          description: 'GPU VRAM at {{ $value | humanizePercentage }} — models may OOM'
+
+      # ── Throttling ─────────────────────────────────────
+
+      - alert: AiRequestsThrottled
+        expr: rate(ai_throttled_requests_total[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI requests are being throttled'
+          description: 'Throttle rate: {{ $value | humanize }}/sec — system under resource pressure'
+
+      # ── Concurrency ────────────────────────────────────
+
+      - alert: AiHighConcurrency
+        expr: ai_concurrent_inferences > 8
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'High AI inference concurrency'
+          description: '{{ $value }} concurrent AI inferences running'
+
+      # ── Model Load Time ────────────────────────────────
+
+      - alert: AiSlowModelLoad
+        expr: |
+          histogram_quantile(0.95, rate(ai_model_load_duration_seconds_bucket[1h])) > 60
+        for: 5m
+        labels:
+          severity: warning
+          service: ai
+        annotations:
+          summary: 'AI model load times are slow'
+          description: 'P95 model load time > 60s (current: {{ $value | humanizeDuration }})'
+
+      # ── Recording Rules (pre-computed AI metrics) ──────
+
+      - record: 'job:ai:inference:rate5m'
+        expr: 'rate(ai_inference_requests_total[5m])'
+
+      - record: 'job:ai:errors:rate5m'
+        expr: 'rate(ai_inference_errors_total[5m])'
+
+      - record: 'job:ai:error_rate:ratio'
+        expr: |
+          rate(ai_inference_errors_total[5m]) / rate(ai_inference_requests_total[5m])
+
+      - record: 'job:ai:latency:p95'
+        expr: |
+          histogram_quantile(0.95, rate(ai_inference_duration_seconds_bucket[5m]))
+
+      - record: 'job:ai:latency:p99'
+        expr: |
+          histogram_quantile(0.99, rate(ai_inference_duration_seconds_bucket[5m]))
+
+      - record: 'job:ai:throttle:rate5m'
+        expr: 'rate(ai_throttled_requests_total[5m])'
+
diff --git a/prometheus.yml b/prometheus.yml
index 7486783b..623b209b 100644
--- a/prometheus.yml
+++ b/prometheus.yml
@@ -45,3 +45,19 @@ scrape_configs:
   - job_name: 'prometheus'
     static_configs:
       - targets: ['localhost:9090']
+
+  # LumenPulse AI Metrics
+  - job_name: 'lumenpulse-ai'
+    scrape_interval: 15s
+    scrape_timeout: 10s
+    metrics_path: '/ai/metrics/prometheus'
+
+    static_configs:
+      - targets: ['backend:3000']
+        labels:
+          layer: 'ai'
+
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: instance
+