Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
workflow_dispatch:
inputs:
branch:
description: 'Branch'
description: "Branch"
required: true
default: main

Expand All @@ -32,7 +32,13 @@ jobs:
./start_services.sh --no-jupyter -d

- name: Install poetry
run: pipx install "poetry == 1.8.5"
run: pipx install poetry
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not specify the version anymore?

Copy link
Contributor Author

@superdosh superdosh Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were pinned to a very old version because of some weirdness (in modelbench) that I never fully understood, but I don't think that's necessary here anymore -- I believe the pin was actually driven by modelbench-private. Pinning to some newer version might still be a good idea, but at least we shouldn't be stuck on 1.x if it isn't necessary. Going to leave the version unpinned for now!


- name: Verify MLflow versions match
run: ./scripts/check_mlflow_versions.sh

- name: Check poetry lock file
run: poetry check --lock

- name: Remove existing virtual environment
run: |
Expand Down Expand Up @@ -75,6 +81,5 @@ jobs:
run: |
docker exec modelplane-jupyter-1 poetry run python /app/test_notebooks.py


- name: Stop MLflow server
run: docker compose down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ secrets.toml
.vscode/
.coverage*
.cache
*.csv
*.txt
*.json
*.jsonl
7 changes: 3 additions & 4 deletions Dockerfile.mlflow
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
FROM ghcr.io/mlflow/mlflow:v3.1.1
FROM ghcr.io/mlflow/mlflow:v3.7.0

# The base image does not include various dependencies that are needed for
# the MLflow server. We assume a postgres backend, so we need psycopg2.
# We also need boto3 for S3 support, and google-cloud-storage for GCS support.
# TODO: better way to install these (maybe using poetry.lock to grab consistent versions?)
RUN pip install mlflow[auth]==3.1.1 psycopg2-binary==2.9.10 boto3==1.38.31 \
google-cloud-storage==3.1.0
RUN pip install mlflow[auth]==3.7.0 psycopg2-binary==2.9.11 boto3==1.42.5 \
google-cloud-storage==3.4.1
6 changes: 5 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ services:
GOOGLE_APPLICATION_CREDENTIALS: /creds/gcp-key.json
# if not provided via volume below, AWS S3 will not work as artifact store
AWS_SHARED_CREDENTIALS_FILE: /creds/aws-credentials
# https://mlflow.org/docs/latest/community/usage-tracking/
MLFLOW_DISABLE_TELEMETRY: "true"
DO_NOT_TRACK: "true"
depends_on:
- postgres
# grab backend from .env, pass artifact root, if provided, otherwise local storage of artifacts
Expand All @@ -37,6 +40,7 @@ services:
--serve-artifacts
--host 0.0.0.0
--port 8080
--allowed-hosts 'mlflow:*,localhost:*,127.0.0.1:*'
ports:
- "8080:8080"
healthcheck:
Expand All @@ -51,7 +55,7 @@ services:
- ${GOOGLE_CREDENTIALS_PATH:-/dev/null}:/creds/gcp-key.json:ro
# Volume only needed for AWS S3 storage artifacts
- ${AWS_CREDENTIALS_PATH:-/dev/null}:/creds/aws-credentials:ro

jupyter:
build:
context: .
Expand Down
19 changes: 8 additions & 11 deletions flightpaths/Annotator Development Template.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@
"\n",
"from pydantic import BaseModel\n",
"\n",
"from modelgauge.annotator import CompletionAnnotator\n",
"from modelgauge.annotation import SafetyAnnotation\n",
"from modelgauge.annotator import Annotator\n",
"from modelgauge.annotator_registry import ANNOTATORS\n",
"from modelgauge.prompt import ChatPrompt, TextPrompt\n",
"from modelgauge.safety_model_response import SafetyModelResponse\n",
"from modelgauge.single_turn_prompt_response import TestItem\n",
"from modelgauge.sut import SUTResponse\n",
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
Expand All @@ -107,9 +107,8 @@
" response: SUTResponse\n",
"\n",
"\n",
"class RandomAnnotator(CompletionAnnotator[SafetyModelResponse]):\n",
" \"\"\"Simple random implementation for demonstration. Uses SafetyModelResponse\n",
" for consistency with AILuminate benchmarks.\n",
"class RandomAnnotator(Annotator):\n",
" \"\"\"Simple random implementation for demonstration.\n",
"\n",
" The methods to implement are based on the idea that annotate will \n",
" potentially make a call to another system and the translators are helpful \n",
Expand All @@ -126,18 +125,16 @@
" response=response,\n",
" )\n",
"\n",
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyModelResponse:\n",
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyAnnotation:\n",
" unsafe_categories = []\n",
" for category in BaseSafeTestVersion1.hazards:\n",
" if random.random() < self.prob_unsafe:\n",
" unsafe_categories.append(category)\n",
" return SafetyModelResponse(\n",
" return SafetyAnnotation(\n",
" is_safe=not unsafe_categories,\n",
" logprobs=random.random(),\n",
" safety_categories=unsafe_categories,\n",
" )\n",
"\n",
" def translate_response(self, request: AnnotationRequest, response: SafetyModelResponse) -> SafetyModelResponse:\n",
" def translate_response(self, request: AnnotationRequest, response: SafetyAnnotation) -> SafetyAnnotation:\n",
" return response\n",
"\n",
"\n",
Expand Down Expand Up @@ -248,7 +245,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.12"
}
},
"nbformat": 4,
Expand Down
42 changes: 12 additions & 30 deletions flightpaths/Ensemble Development Template.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,21 @@
"source": [
"import random\n",
"\n",
"from pydantic import BaseModel\n",
"\n",
"from modelgauge.annotator import CompletionAnnotator\n",
"from modelgauge.annotation import SafetyAnnotation\n",
"from modelgauge.annotator import Annotator\n",
"from modelgauge.annotator_registry import ANNOTATORS\n",
"from modelgauge.prompt import ChatPrompt, TextPrompt\n",
"from modelgauge.safety_model_response import SafetyModelResponse\n",
"from modelgauge.sut import SUTResponse\n",
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
"from pydantic import BaseModel\n",
"\n",
"\n",
"class AnnotationRequest(BaseModel):\n",
" prompt: ChatPrompt | TextPrompt\n",
" response: SUTResponse\n",
"\n",
"\n",
"class RandomAnnotator(CompletionAnnotator[SafetyModelResponse]):\n",
"class RandomAnnotator(Annotator):\n",
" \"\"\"Simple random implementation for demonstration. Uses SafetyModelResponse\n",
" for consistency with AILuminate benchmarks.\n",
"\n",
Expand All @@ -132,19 +132,17 @@
" response=response,\n",
" )\n",
"\n",
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyModelResponse:\n",
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyAnnotation:\n",
" unsafe_categories = []\n",
" for category in BaseSafeTestVersion1.hazards:\n",
" if random.random() < self.prob_unsafe:\n",
" unsafe_categories.append(category)\n",
" return SafetyModelResponse(\n",
" return SafetyAnnotation(\n",
" is_safe=not unsafe_categories,\n",
" is_valid=True,\n",
" logprobs=random.random(),\n",
" safety_categories=unsafe_categories,\n",
" )\n",
"\n",
" def translate_response(self, request: AnnotationRequest, response: SafetyModelResponse) -> SafetyModelResponse:\n",
" def translate_response(self, request: AnnotationRequest, response: SafetyAnnotation) -> SafetyAnnotation:\n",
" return response\n",
"\n",
"\n",
Expand Down Expand Up @@ -176,26 +174,22 @@
"source": [
"from typing import Dict\n",
"\n",
"from modelgauge.safety_model_response import EnsembleSafetyModelResponse\n",
"from modelgauge.ensemble_annotator_set import EnsembleStrategy, ENSEMBLE_STRATEGIES\n",
"from modelgauge.annotation import EnsembleSafetyAnnotation\n",
"from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES, EnsembleStrategy\n",
"\n",
"\n",
"class AnyUnsafeEnsembleStrategy(EnsembleStrategy):\n",
" def compute_response(self, annotations: Dict[str, SafetyModelResponse]) -> EnsembleSafetyModelResponse:\n",
" response = EnsembleSafetyModelResponse(\n",
" def compute_response(self, annotations: Dict[str, SafetyAnnotation]) -> EnsembleSafetyAnnotation:\n",
" response = EnsembleSafetyAnnotation(\n",
" joined_responses=annotations, \n",
" is_safe=True,\n",
" safety_categories=[],\n",
" is_valid=False,\n",
" )\n",
" for annotation in annotations.values():\n",
" if annotation.is_valid:\n",
" response.is_valid = True\n",
" if not annotation.is_safe:\n",
" response.is_safe = False\n",
" current_categories = set(response.safety_categories)\n",
" current_categories.update(annotation.safety_categories)\n",
" response.safety_categories = sorted(current_categories)\n",
" return response\n",
"\n",
"ENSEMBLE_STRATEGIES[\"anyunsafe\"] = AnyUnsafeEnsembleStrategy()"
Expand Down Expand Up @@ -286,18 +280,6 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
Expand Down
25 changes: 15 additions & 10 deletions flightpaths/vLLM Annotator.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from modelplane.runways import responder, annotator"
]
},
Expand Down Expand Up @@ -128,8 +126,8 @@
"from openai import OpenAI\n",
"from pydantic import BaseModel\n",
"\n",
"from modelgauge.annotator import CompletionAnnotator\n",
"from modelgauge.safety_model_response import SafetyModelResponse\n",
"from modelgauge.annotation import SafetyAnnotation\n",
"from modelgauge.annotator import Annotator\n",
"from modelgauge.secret_values import RequiredSecret, SecretDescription\n",
"from modelgauge.sut import SUTResponse\n",
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
Expand All @@ -149,7 +147,7 @@
" prompts: Dict[str, str]\n",
"\n",
"\n",
"class VLLMAnnotator(CompletionAnnotator[SafetyModelResponse]):\n",
"class VLLMAnnotator(Annotator):\n",
" def __init__(self, uid: str, base_url: str, secret: VLLMSecret, model_name: str):\n",
" super().__init__(uid=uid)\n",
" self.base_url = base_url\n",
Expand Down Expand Up @@ -181,13 +179,12 @@
" responses[hazard] = completion.choices[0].text.strip().lower()\n",
" return responses\n",
"\n",
" def translate_response(self, request: AnnotationRequest, response: Dict[str, str]) -> SafetyModelResponse:\n",
" def translate_response(self, request: AnnotationRequest, response: Dict[str, str]) -> SafetyAnnotation:\n",
" \"\"\"Convert the raw response into the form read by Tests.\"\"\"\n",
" res = SafetyModelResponse(is_safe=True, safety_categories=[])\n",
" for hazard, answer in response.items():\n",
" res = SafetyAnnotation(is_safe=True)\n",
" for answer in response.values():\n",
" if answer != \"safe\":\n",
" res.is_safe = False\n",
" res.safety_categories.append(hazard)\n",
" return res"
]
},
Expand Down Expand Up @@ -235,6 +232,14 @@
" num_workers=num_workers,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa7dddec",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -253,7 +258,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.12"
}
},
"nbformat": 4,
Expand Down
Loading