Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions packages/app/src/lib/eval-samples-live.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import { describe, it, expect } from 'vitest';

import type { GithubArtifact } from '@/lib/github-artifacts';
import { type EvalArtifactConfig, findEvalSampleArtifact } from '@/lib/eval-samples-live';

/**
 * Build a minimal artifact fixture for the matcher tests.
 *
 * Only `id`, `name` and `archive_download_url` are populated; the result is
 * cast to `GithubArtifact`, so any other field of that type is absent.
 *
 * @param name - Artifact name; also embedded in the fake download URL.
 * @param id - Artifact id, defaulting to 1.
 */
function makeArtifact(name: string, id = 1): GithubArtifact {
  const archive_download_url = `https://example.com/${name}.zip`;
  const fixture = { id, name, archive_download_url };
  return fixture as GithubArtifact;
}

// Baseline config shared by every test in this file. Individual tests override
// single fields (framework, disagg, conc) by spreading this object.
const baseConfig: EvalArtifactConfig = {
model: 'dsr1',
framework: 'sglang',
hardware: 'mi355x',
precision: 'fp4',
specMethod: 'mtp',
disagg: false,
conc: 128,
};

// Covers artifact-name matching in `findEvalSampleArtifact`: framework alias
// handling, x-separated concurrency lists, exclusion of non-sample `eval_*`
// artifacts, and the disagg-token preference among multiple matches.
describe('findEvalSampleArtifact', () => {
  it('matches a single-conc non-disagg artifact', () => {
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang_tp8-ep1-dpafalse_disagg-false_spec-mtp_conc128_mi355x-amds_01',
      ),
    ];
    const result = findEvalSampleArtifact(artifacts, baseConfig);
    expect(result?.id).toBe(1);
  });

  it('accepts the legacy `sglang-disagg` alias when the config framework is `mori-sglang`', () => {
    // Eval rows are normalized via FRAMEWORK_ALIASES (sglang-disagg → mori-sglang),
    // but artifact names keep the raw alias. The matcher must accept either.
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang-disagg_prefill-tp8-ep1-dpfalse-nw1_decode-tp8-ep1-dpfalse-nw2_disagg-true_spec-mtp_conc64x128x256_mi355x-amds_08',
      ),
    ];
    const result = findEvalSampleArtifact(artifacts, {
      ...baseConfig,
      framework: 'mori-sglang',
      disagg: true,
      conc: 128,
    });
    expect(result?.id).toBe(1);
  });

  it('matches a conc value embedded in an x-separated list (disagg artifacts)', () => {
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang-disagg_prefill-tp8-ep8-dptrue-nw2_decode-tp8-ep8-dptrue-nw1_disagg-true_spec-mtp_conc1024x2048x4096_mi355x-amds_06',
      ),
    ];
    const result = findEvalSampleArtifact(artifacts, {
      ...baseConfig,
      framework: 'mori-sglang',
      disagg: true,
      conc: 2048,
    });
    expect(result?.id).toBe(1);
  });

  it('rejects when the requested conc is not in the list', () => {
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang-disagg_prefill-tp8-ep8-dptrue-nw2_decode-tp8-ep8-dptrue-nw1_disagg-true_spec-mtp_conc1024x2048x4096_mi355x-amds_06',
      ),
    ];
    const result = findEvalSampleArtifact(artifacts, {
      ...baseConfig,
      framework: 'mori-sglang',
      disagg: true,
      conc: 64,
    });
    expect(result).toBeNull();
  });

  it('avoids substring conc collisions (conc=12 must not match conc128)', () => {
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang_tp8-ep1-dpafalse_disagg-false_spec-mtp_conc128_mi355x-amds_01',
      ),
    ];
    const result = findEvalSampleArtifact(artifacts, { ...baseConfig, conc: 12 });
    expect(result).toBeNull();
  });

  it('skips eval_results_ and eval_gpu_metrics_ artifacts', () => {
    const artifacts = [
      makeArtifact('eval_results_all'),
      makeArtifact('eval_gpu_metrics_dsr1_8k1k_fp4_sglang_spec-mtp_conc128_mi355x-amds'),
    ];
    expect(findEvalSampleArtifact(artifacts, baseConfig)).toBeNull();
  });

  it('prefers artifacts whose disagg token matches the config', () => {
    const artifacts = [
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang_tp8-ep1_disagg-false_spec-mtp_conc128_mi355x-amds_01',
        1,
      ),
      makeArtifact(
        'eval_dsr1_8k1k_dsr1_8k1k_fp4_sglang_tp8-ep1_disagg-true_spec-mtp_conc128_mi355x-amds_02',
        2,
      ),
    ];
    const disaggResult = findEvalSampleArtifact(artifacts, { ...baseConfig, disagg: true });
    expect(disaggResult?.id).toBe(2);

    // The disagg-true artifact is also the highest id, so the assertion above
    // would pass on a plain newest-id pick. Asking for disagg=false must select
    // the older artifact, which only holds if the disagg preference is applied.
    const nonDisaggResult = findEvalSampleArtifact(artifacts, { ...baseConfig, disagg: false });
    expect(nonDisaggResult?.id).toBe(1);
  });
});
31 changes: 28 additions & 3 deletions packages/app/src/lib/eval-samples-live.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
* ingest does. No caching — same policy as `/api/unofficial-run`, since GHA
* artifacts can change while a workflow is still running.
*/
import { resolveFrameworkAliasesInString } from '@semianalysisai/inferencex-constants/framework-aliases';

import {
type GithubArtifact,
downloadGithubArtifact,
Expand All @@ -30,6 +32,17 @@ export interface EvalArtifactConfig {
conc: number | null;
}

/**
 * Report whether `targetConc` is one of the concurrencies encoded in an
 * artifact name.
 *
 * The name is searched for the first `_conc<N>_` or `_conc<N>x<N>x...>_`
 * segment (both surrounding underscores are required). The digits are split
 * on `x` and compared to the decimal string form of `targetConc`, so `12`
 * does not match `conc128`.
 *
 * @param artifactName - Artifact name to inspect.
 * @param targetConc - Concurrency value to look for.
 * @returns `true` if the segment exists and lists `targetConc`, else `false`.
 */
function artifactConcMatches(artifactName: string, targetConc: number): boolean {
  const concSegment = /_conc(\d+(?:x\d+)*)_/.exec(artifactName);
  if (concSegment === null) {
    return false;
  }
  const wanted = String(targetConc);
  // Whole-entry string equality — never a substring test.
  return concSegment[1].split('x').some((entry) => entry === wanted);
}

/**
* Pick the per-config eval artifact matching `config` from a run's artifact list.
*
Expand All @@ -40,6 +53,15 @@ export interface EvalArtifactConfig {
* `EvalRow`, so when multiple artifacts differ only in sequence length we pick
* the highest-id (most recent) match. Excludes the aggregate (`eval_results_all`)
* and gpu-metrics artifacts which share the `eval_` prefix but don't carry samples.
*
* Two normalization quirks the matcher has to undo:
* - The eval row's `framework` is canonicalized via `FRAMEWORK_ALIASES`
* (e.g. `sglang-disagg` → `mori-sglang`), but the artifact name keeps the
* raw alias. We canonicalize the artifact name via `resolveFrameworkAliasesInString`
* before comparing.
* - Disagg artifacts pack multiple concurrencies into one zip and encode them
* as `conc<N>x<N>x<N>`, so we parse the conc segment as an x-separated list
* and check membership instead of requiring an exact `_conc<N>_` token.
*/
export function findEvalSampleArtifact(
artifacts: GithubArtifact[],
Expand All @@ -57,15 +79,18 @@ export function findEvalSampleArtifact(
`_${config.hardware}-`,
`_spec-${config.specMethod}_`,
];
if (config.conc !== null) required.push(`_conc${config.conc}_`);
// Preferred token — used as a tiebreaker when more than one artifact matches.
const preferredDisagg = `_disagg-${config.disagg ? 'true' : 'false'}_`;

const matches = artifacts.filter((a) => {
const n = a.name.toLowerCase();
// Canonicalize legacy framework substrings (e.g. `sglang-disagg` → `mori-sglang`)
// so the framework token matches what the eval row was normalized to.
const n = resolveFrameworkAliasesInString(a.name.toLowerCase());
if (!n.startsWith('eval_')) return false;
if (n.startsWith('eval_results_') || n.startsWith('eval_gpu_metrics_')) return false;
return required.every((t) => n.includes(t.toLowerCase()));
if (!required.every((t) => n.includes(t.toLowerCase()))) return false;
if (config.conc !== null && !artifactConcMatches(n, config.conc)) return false;
return true;
});
if (matches.length === 0) return null;
// Prefer artifacts whose disagg flag matches the row, then fall back to newest.
Expand Down
Loading