Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
337 changes: 337 additions & 0 deletions src/lib/signal-scorer-v2.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
/**
* Signal quality auto-scorer v2.
*
* Aligned with editor 7-gate framework (Zen Rocket v3.1).
* Adds tier-1 source domains, structure detection, novelty scoring,
* specificity checks, and beat-specific keyword density.
*
* Pure function — no I/O, no DB. Runs synchronously at submission time.
*/

export interface SignalScoreBreakdown {
/** 0–25: tier-1 source count + URL specificity */
sourceQuality: number;
/** 0–20: headline structure + body length + readability */
thesisClarity: number;
/** 0–20: beat-specific keyword density (editor Gate 3 + Gate 5) */
beatRelevance: number;
/** 0–15: body structure (CLAIM/EVIDENCE/IMPLICATION) */
structure: number;
/** 0–10: novelty indicators in body */
novelty: number;
/** 0–10: named entities, PR/issue numbers, on-chain specifics */
specificity: number;
}

export interface SignalScore {
/** Composite quality score, 0–100 */
total: number;
breakdown: SignalScoreBreakdown;
}

export interface SignalScorerInput {
headline: string;
body?: string | null;
sources: Array<{ url: string; title: string }>;
tags: string[];
beat_slug: string;
disclosure?: string | null;
}

// ── Tier-1 primary source domains (editor Gate 1) ──
const TIER1_DOMAINS = [
"github.com",
"arxiv.org",
"nist.gov",
"ietf.org",
"mempool.space",
"blockstream.info",
"hiro.so",
"stacks.co",
"sec.gov",
"bis.org",
"bitcoinops.org",
"nict.go.jp",
"eprint.iacr.org",
];

const TIER1_TLDS = [".gov", ".edu", ".ac.uk", ".ac.jp"];

// ── Beat-specific keyword sets (editor Gate 3 + Gate 5) ──
// Precompiled at module level for performance
const BEAT_KEYWORDS: Record<string, string[]> = {
quantum: [
"quantum", "post-quantum", "pqc", "bip-360", "bip-361", "ecdsa",
"lattice", "nist", "migration", "shor", "grover", "p2qrh", "p2mr",
"dilithium", "sphincs", "falcon", "kyber", "ml-kem", "ml-dsa",
"slh-dsa", "secp256k1", "harvest", "qubit", "post-quantum-cryptography",
"quantum-resistant", "quantum-safe",
],
"bitcoin-macro": [
"bitcoin", "btc", "mempool", "difficulty", "hashrate", "fee",
"mining", "block", "transaction", "inflation", "halving", "etf",
"institutional", "price", "liquidity", "sats", "lightning",
],
"aibtc-network": [
"aibtc", "agent", "relay", "x402", "mcp", "signal", "brief",
"correspondent", "publisher", "sats", "stacks", "sbtc", "payout",
"nonce", "bip-322", "stx",
],
};

// Precompiled beat keyword regexes (word-boundary matching)
const BEAT_KEYWORD_REGEXES: Record<string, RegExp[]> = Object.fromEntries(
Object.entries(BEAT_KEYWORDS).map(([beat, kws]) => [
beat,
kws.map((kw) => new RegExp("\\b" + kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i")),
])
);

// ── Novelty indicator words ──
const NOVELTY_WORDS = [
"first", "newly", "reveals", "exposes", "uncovered", "demonstrates",
"proves", "confirms", "shows", "finds", "discloses", "identifies",
"uncovers", "breaks", "launches", "ships", "merges",
];

// ── Specificity patterns ──
const SPECIFICITY_PATTERNS = [
/PR\s*#\s*\d+/gi, // PR #123
/issue\s*#\s*\d+/gi, // Issue #456
/BIP-?\s*\d+/gi, // BIP-360
/arxiv[:\s]*\d{4}\.\d+/gi, // arxiv:2603.01091
/block\s+\d{6,}/gi, // Block 946018
/FIPS\s+\d+/gi, // FIPS 204
/0x[0-9a-f]{8,}/gi, // tx hashes
/\$[\d,.]+[BMK]?/g, // dollar amounts
/[\d,.]+\s*(BTC|sats|STX)/gi, // crypto amounts
/[\d.]+%/g, // percentages
];


// ── Dimension weights (sum = 100) ──
const MAX_SOURCE_QUALITY = 25;
const MAX_THESIS_CLARITY = 20;
const MAX_BEAT_RELEVANCE = 20;
const MAX_STRUCTURE = 15;
const MAX_NOVELTY = 10;
const MAX_SPECIFICITY = 10;

/**
* Check if a URL is from a tier-1 primary source domain.
*/
function isTier1Source(url: string): boolean {
try {
const hostname = new URL(url).hostname.toLowerCase();
// Direct domain match
if (TIER1_DOMAINS.some((d) => hostname === d || hostname.endsWith("." + d))) {
return true;
}
// TLD match
if (TIER1_TLDS.some((tld) => hostname.endsWith(tld))) {
return true;
}
return false;
} catch {
return false;
}
}

/**
* Check if a URL contains a specific path (not just homepage).
* Homepage-level URLs fail Gate 0 source verification.
*/
function isSpecificUrl(url: string): boolean {
try {
const parsed = new URL(url);
const path = parsed.pathname;
// Specific paths: /abs/1234, /pull/123, /api/..., /txid/0x..., /issues/123
if (/\/(abs|pull|issues|api|txid|blob|commit|releases?|pubs)\/./i.test(path)) {
return true;
}
// Contains year in path (arxiv papers)
if (/\/\d{4}\.\d+/.test(path)) {
return true;
}
return false;
} catch {
return false;
}
}

/**
* Score source quality (0–25).
* Tier-1 sources + URL specificity.
*/
function scoreSourceQuality(sources: Array<{ url: string; title: string }>): number {
if (sources.length === 0) return 0;

const tier1Count = sources.filter((s) => isTier1Source(s.url)).length;
const specificCount = sources.filter((s) => isSpecificUrl(s.url)).length;

let pts = 0;

// Base: tier-1 source count
if (tier1Count >= 3) pts = 20;
else if (tier1Count === 2) pts = 15;
else if (tier1Count === 1) pts = 10;
else pts = 3; // non-tier-1 sources get minimal credit

// Bonus: specific URLs (not homepage-level)
if (specificCount >= 2) pts += 5;
else if (specificCount >= 1) pts += 3;

return Math.min(pts, MAX_SOURCE_QUALITY);
}

/**
* Score thesis clarity (0–20).
* Headline structure + body length + complete sentence.
*/
function scoreThesisClarity(headline: string, body?: string | null): number {
const words = headline.trim().split(/\s+/).filter((w) => w.length > 0).length;
let pts = 0;

// Headline word count: 8–15 = sweet spot
if (words >= 8 && words <= 15) pts = 10;
else if (words >= 5 && words <= 20) pts = 7;
else if (words >= 3) pts = 4;

// Body length: 500–940 chars = optimal
if (body) {
const bodyLen = body.trim().length;
if (bodyLen >= 500 && bodyLen <= 940) pts += 7;
else if (bodyLen >= 300) pts += 4;
else if (bodyLen >= 100) pts += 2;

// Complete sentence (doesn't end with truncation)
const lastChar = body.trim().slice(-1);
if (".!?".includes(lastChar)) pts += 3;
}

return Math.min(pts, MAX_THESIS_CLARITY);
}

/**
* Score beat relevance (0–20).
* Matches body + tags against beat-specific keyword set.
*/
function scoreBeatRelevance(
tags: string[],
beat_slug: string,
body?: string | null,
headline?: string
): number {
const keywords = BEAT_KEYWORDS[beat_slug];
if (!keywords || keywords.length === 0) return 0;
const text = [headline, body, ...tags].filter(Boolean).join(" ").toLowerCase();

let keywordHits = 0;

for (const kw of keywords) {
// Word-boundary match (editor uses \b regex)
const regex = new RegExp("\\b" + kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i");
if (regex.test(text)) {
keywordHits++;
}
}

if (keywordHits >= 10) return MAX_BEAT_RELEVANCE;
if (keywordHits >= 5) return 15;
if (keywordHits >= 3) return 10;
if (keywordHits >= 1) return 5;
return 0;
}

/**
* Score structure (0–15).
* Detects CLAIM/EVIDENCE/IMPLICATION pattern in body.
*/
function scoreStructure(body?: string | null): number {
if (!body) return 0;
const upper = body.toUpperCase();

const hasClaim = /\bCLAIM\b/.test(upper);
const hasEvidence = /\bEVIDENCE\b/.test(upper);
const hasImplication = /\bIMPLICATION\b/.test(upper) || /\bACTION\b/.test(upper);

const sections = [hasClaim, hasEvidence, hasImplication].filter(Boolean).length;

if (sections === 3) return MAX_STRUCTURE; // Full C/E/I
if (sections === 2) return 10; // Partial structure
if (sections === 1) return 5; // Minimal structure
return 0;
}

/**
* Score novelty (0–10).
* Counts novelty indicator words in body.
*/
// Precompiled novelty regexes
const NOVELTY_REGEXES = NOVELTY_WORDS.map(
(w) => new RegExp("\\b" + w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i")
);

function scoreNovelty(body?: string | null): number {
if (!body) return 0;

let hits = 0;
for (const regex of NOVELTY_REGEXES) {
if (regex.test(body)) hits++;
}

if (hits >= 3) return MAX_NOVELTY;
if (hits >= 2) return 7;
if (hits >= 1) return 4;
return 0;
}

/**
* Score specificity (0–10).
* Counts named entities, PR numbers, on-chain specifics.
*/
function scoreSpecificity(body?: string | null, headline?: string): number {
const text = [headline, body].filter(Boolean).join(" ");
let matches = 0;

for (const pattern of SPECIFICITY_PATTERNS) {
const found = text.match(pattern);
if (found) matches += found.length;
}

if (matches >= 5) return MAX_SPECIFICITY;
if (matches >= 3) return 7;
if (matches >= 1) return 4;
return 0;
}

/**
* Score a signal and return a composite quality score with per-dimension breakdown.
*/
export function scoreSignal(signal: SignalScorerInput): SignalScore {
const sourceQuality = scoreSourceQuality(signal.sources);
const thesisClarity = scoreThesisClarity(signal.headline, signal.body);
const beatRelevance = scoreBeatRelevance(
signal.tags,
signal.beat_slug,
signal.body,
signal.headline
);
const structure = scoreStructure(signal.body);
const novelty = scoreNovelty(signal.body);
const specificity = scoreSpecificity(signal.body, signal.headline);

const total = sourceQuality + thesisClarity + beatRelevance + structure + novelty + specificity;

return {
total,
breakdown: {
sourceQuality,
thesisClarity,
beatRelevance,
structure,
novelty,
specificity,
},
};
}