diff --git a/src/lib/signal-scorer-v2.ts b/src/lib/signal-scorer-v2.ts new file mode 100644 index 00000000..e98de851 --- /dev/null +++ b/src/lib/signal-scorer-v2.ts @@ -0,0 +1,337 @@ +/** + * Signal quality auto-scorer v2. + * + * Aligned with editor 7-gate framework (Zen Rocket v3.1). + * Adds tier-1 source domains, structure detection, novelty scoring, + * specificity checks, and beat-specific keyword density. + * + * Pure function — no I/O, no DB. Runs synchronously at submission time. + */ + +export interface SignalScoreBreakdown { + /** 0–25: tier-1 source count + URL specificity */ + sourceQuality: number; + /** 0–20: headline structure + body length + readability */ + thesisClarity: number; + /** 0–20: beat-specific keyword density (editor Gate 3 + Gate 5) */ + beatRelevance: number; + /** 0–15: body structure (CLAIM/EVIDENCE/IMPLICATION) */ + structure: number; + /** 0–10: novelty indicators in body */ + novelty: number; + /** 0–10: named entities, PR/issue numbers, on-chain specifics */ + specificity: number; +} + +export interface SignalScore { + /** Composite quality score, 0–100 */ + total: number; + breakdown: SignalScoreBreakdown; +} + +export interface SignalScorerInput { + headline: string; + body?: string | null; + sources: Array<{ url: string; title: string }>; + tags: string[]; + beat_slug: string; + disclosure?: string | null; +} + +// ── Tier-1 primary source domains (editor Gate 1) ── +const TIER1_DOMAINS = [ + "github.com", + "arxiv.org", + "nist.gov", + "ietf.org", + "mempool.space", + "blockstream.info", + "hiro.so", + "stacks.co", + "sec.gov", + "bis.org", + "bitcoinops.org", + "nict.go.jp", + "eprint.iacr.org", +]; + +const TIER1_TLDS = [".gov", ".edu", ".ac.uk", ".ac.jp"]; + +// ── Beat-specific keyword sets (editor Gate 3 + Gate 5) ── +// Precompiled at module level for performance +const BEAT_KEYWORDS: Record = { + quantum: [ + "quantum", "post-quantum", "pqc", "bip-360", "bip-361", "ecdsa", + "lattice", "nist", "migration", "shor", "grover", "p2qrh", "p2mr", + "dilithium", "sphincs", "falcon", "kyber", "ml-kem", "ml-dsa", + "slh-dsa", "secp256k1", "harvest", "qubit", "post-quantum-cryptography", + "quantum-resistant", "quantum-safe", + ], + "bitcoin-macro": [ + "bitcoin", "btc", "mempool", "difficulty", "hashrate", "fee", + "mining", "block", "transaction", "inflation", "halving", "etf", + "institutional", "price", "liquidity", "sats", "lightning", + ], + "aibtc-network": [ + "aibtc", "agent", "relay", "x402", "mcp", "signal", "brief", + "correspondent", "publisher", "sats", "stacks", "sbtc", "payout", + "nonce", "bip-322", "stx", + ], +}; + +// Precompiled beat keyword regexes (word-boundary matching) +const BEAT_KEYWORD_REGEXES: Record = Object.fromEntries( + Object.entries(BEAT_KEYWORDS).map(([beat, kws]) => [ + beat, + kws.map((kw) => new RegExp("\\b" + kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i")), + ]) +); + +// ── Novelty indicator words ── +const NOVELTY_WORDS = [ + "first", "newly", "reveals", "exposes", "uncovered", "demonstrates", + "proves", "confirms", "shows", "finds", "discloses", "identifies", + "uncovers", "breaks", "launches", "ships", "merges", +]; + +// ── Specificity patterns ── +const SPECIFICITY_PATTERNS = [ + /PR\s*#\s*\d+/gi, // PR #123 + /issue\s*#\s*\d+/gi, // Issue #456 + /BIP-?\s*\d+/gi, // BIP-360 + /arxiv[:\s]*\d{4}\.\d+/gi, // arxiv:2603.01091 + /block\s+\d{6,}/gi, // Block 946018 + /FIPS\s+\d+/gi, // FIPS 204 + /0x[0-9a-f]{8,}/gi, // tx hashes + /\$[\d,.]+[BMK]?/g, // dollar amounts + /[\d,.]+\s*(BTC|sats|STX)/gi, // crypto amounts + /[\d.]+%/g, // percentages +]; + + +// ── Dimension weights (sum = 100) ── +const MAX_SOURCE_QUALITY = 25; +const MAX_THESIS_CLARITY = 20; +const MAX_BEAT_RELEVANCE = 20; +const MAX_STRUCTURE = 15; +const MAX_NOVELTY = 10; +const MAX_SPECIFICITY = 10; + +/** + * Check if a URL is from a tier-1 primary source domain. + */ +function isTier1Source(url: string): boolean { + try { + const hostname = new URL(url).hostname.toLowerCase(); + // Direct domain match + if (TIER1_DOMAINS.some((d) => hostname === d || hostname.endsWith("." + d))) { + return true; + } + // TLD match + if (TIER1_TLDS.some((tld) => hostname.endsWith(tld))) { + return true; + } + return false; + } catch { + return false; + } +} + +/** + * Check if a URL contains a specific path (not just homepage). + * Homepage-level URLs fail Gate 0 source verification. + */ +function isSpecificUrl(url: string): boolean { + try { + const parsed = new URL(url); + const path = parsed.pathname; + // Specific paths: /abs/1234, /pull/123, /api/..., /txid/0x..., /issues/123 + if (/\/(abs|pull|issues|api|txid|blob|commit|releases?|pubs)\/./i.test(path)) { + return true; + } + // Contains year in path (arxiv papers) + if (/\/\d{4}\.\d+/.test(path)) { + return true; + } + return false; + } catch { + return false; + } +} + +/** + * Score source quality (0–25). + * Tier-1 sources + URL specificity. + */ +function scoreSourceQuality(sources: Array<{ url: string; title: string }>): number { + if (sources.length === 0) return 0; + + const tier1Count = sources.filter((s) => isTier1Source(s.url)).length; + const specificCount = sources.filter((s) => isSpecificUrl(s.url)).length; + + let pts = 0; + + // Base: tier-1 source count + if (tier1Count >= 3) pts = 20; + else if (tier1Count === 2) pts = 15; + else if (tier1Count === 1) pts = 10; + else pts = 3; // non-tier-1 sources get minimal credit + + // Bonus: specific URLs (not homepage-level) + if (specificCount >= 2) pts += 5; + else if (specificCount >= 1) pts += 3; + + return Math.min(pts, MAX_SOURCE_QUALITY); +} + +/** + * Score thesis clarity (0–20). + * Headline structure + body length + complete sentence. + */ +function scoreThesisClarity(headline: string, body?: string | null): number { + const words = headline.trim().split(/\s+/).filter((w) => w.length > 0).length; + let pts = 0; + + // Headline word count: 8–15 = sweet spot + if (words >= 8 && words <= 15) pts = 10; + else if (words >= 5 && words <= 20) pts = 7; + else if (words >= 3) pts = 4; + + // Body length: 500–940 chars = optimal + if (body) { + const bodyLen = body.trim().length; + if (bodyLen >= 500 && bodyLen <= 940) pts += 7; + else if (bodyLen >= 300) pts += 4; + else if (bodyLen >= 100) pts += 2; + + // Complete sentence (doesn't end with truncation) + const lastChar = body.trim().slice(-1); + if (".!?".includes(lastChar)) pts += 3; + } + + return Math.min(pts, MAX_THESIS_CLARITY); +} + +/** + * Score beat relevance (0–20). + * Matches body + tags against beat-specific keyword set. + */ +function scoreBeatRelevance( + tags: string[], + beat_slug: string, + body?: string | null, + headline?: string +): number { + const keywords = BEAT_KEYWORDS[beat_slug]; + if (!keywords || keywords.length === 0) return 0; + const text = [headline, body, ...tags].filter(Boolean).join(" ").toLowerCase(); + + let keywordHits = 0; + + for (const kw of keywords) { + // Word-boundary match (editor uses \b regex) + const regex = new RegExp("\\b" + kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i"); + if (regex.test(text)) { + keywordHits++; + } + } + + if (keywordHits >= 10) return MAX_BEAT_RELEVANCE; + if (keywordHits >= 5) return 15; + if (keywordHits >= 3) return 10; + if (keywordHits >= 1) return 5; + return 0; +} + +/** + * Score structure (0–15). + * Detects CLAIM/EVIDENCE/IMPLICATION pattern in body. + */ +function scoreStructure(body?: string | null): number { + if (!body) return 0; + const upper = body.toUpperCase(); + + const hasClaim = /\bCLAIM\b/.test(upper); + const hasEvidence = /\bEVIDENCE\b/.test(upper); + const hasImplication = /\bIMPLICATION\b/.test(upper) || /\bACTION\b/.test(upper); + + const sections = [hasClaim, hasEvidence, hasImplication].filter(Boolean).length; + + if (sections === 3) return MAX_STRUCTURE; // Full C/E/I + if (sections === 2) return 10; // Partial structure + if (sections === 1) return 5; // Minimal structure + return 0; +} + +/** + * Score novelty (0–10). + * Counts novelty indicator words in body. + */ +// Precompiled novelty regexes +const NOVELTY_REGEXES = NOVELTY_WORDS.map( + (w) => new RegExp("\\b" + w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i") +); + +function scoreNovelty(body?: string | null): number { + if (!body) return 0; + + let hits = 0; + for (const regex of NOVELTY_REGEXES) { + if (regex.test(body)) hits++; + } + + if (hits >= 3) return MAX_NOVELTY; + if (hits >= 2) return 7; + if (hits >= 1) return 4; + return 0; +} + +/** + * Score specificity (0–10). + * Counts named entities, PR numbers, on-chain specifics. + */ +function scoreSpecificity(body?: string | null, headline?: string): number { + const text = [headline, body].filter(Boolean).join(" "); + let matches = 0; + + for (const pattern of SPECIFICITY_PATTERNS) { + const found = text.match(pattern); + if (found) matches += found.length; + } + + if (matches >= 5) return MAX_SPECIFICITY; + if (matches >= 3) return 7; + if (matches >= 1) return 4; + return 0; +} + +/** + * Score a signal and return a composite quality score with per-dimension breakdown. + */ +export function scoreSignal(signal: SignalScorerInput): SignalScore { + const sourceQuality = scoreSourceQuality(signal.sources); + const thesisClarity = scoreThesisClarity(signal.headline, signal.body); + const beatRelevance = scoreBeatRelevance( + signal.tags, + signal.beat_slug, + signal.body, + signal.headline + ); + const structure = scoreStructure(signal.body); + const novelty = scoreNovelty(signal.body); + const specificity = scoreSpecificity(signal.body, signal.headline); + + const total = sourceQuality + thesisClarity + beatRelevance + structure + novelty + specificity; + + return { + total, + breakdown: { + sourceQuality, + thesisClarity, + beatRelevance, + structure, + novelty, + specificity, + }, + }; +}