Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 3 additions & 20 deletions src/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -953,26 +953,9 @@ export class RemoteLLM implements LLM {
const model = options.model || oa.model || "gpt-4o-mini";

const docsText = documents.map((doc, i) => `[${i}] ${doc.text}`).join("\n---\n");
const prompt = [
"你是记忆检索助手。根据查询从候选文档中筛选并提取相关信息。",
"",
`查询:${query}`,
"",
"候选文档:",
docsText,
"",
"规则:",
"1. 只提取与查询直接相关的文档内容,忽略不相关的",
"2. 每篇用 [编号] 开头,后面跟提取的核心内容",
"3. 用纯文本输出,不要JSON,不要markdown格式符",
"4. 没有相关文档则输出 NONE",
"5. 多篇文档内容相同或高度重复时,只提取第一篇,跳过后续重复",
"6. 优先选择原始数据源(如日记、笔记、配置记录),跳过「对话/搜索会话记录」类文档",
"",
"示例格式:",
"[0] 提取的核心内容",
"[3] 另一篇的核心内容",
].join("\n");
// Load prompt template from ~/.config/qmd/rerank-prompt.txt if present.
// This allows iterating on extract behavior without changing code.
const prompt = buildRerankPrompt(query, docsText);

const resp = await fetchWithRetry(`${baseUrl}/chat/completions`, {
method: "POST",
Expand Down
116 changes: 112 additions & 4 deletions src/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,86 @@ async function rerank(query: string, documents: { file: string; text: string }[]
return result.map((r) => ({ file: r.file, score: r.score, extract: r.extract }));
}

// =============================================================================
// Optional final-stage LLM filter/extractor ("文献器")
// =============================================================================

// Memoized filter instance; built lazily from environment configuration.
let _llmFilter: { llm: RemoteLLM; model: string } | null = null;

/**
 * Resolve the optional final-stage LLM filter from environment variables.
 *
 * Provider selection: `QMD_LLM_FILTER_PROVIDER` ('openai' | 'gemini',
 * case/whitespace-insensitive) wins when set; any other non-empty value keeps
 * the filter disabled. When unset, the provider is inferred from whichever
 * shared API key is present (OpenAI takes precedence). Filter-specific env
 * vars override the shared QMD_* ones, and `QMD_LLM_FILTER_MODEL` overrides
 * both per-provider model variables.
 *
 * Returns null when no provider or API key is configured. The result is
 * memoized, so repeated calls are cheap.
 */
function getLlmFilter(): { llm: RemoteLLM; model: string } | null {
  if (_llmFilter) return _llmFilter;

  const e = process.env;

  // Normalize the explicit choice so values like "OpenAI" or "gemini " are
  // honored; previously the raw string was cast unchecked and any non-exact
  // value silently disabled the filter even with valid keys configured.
  const raw = e.QMD_LLM_FILTER_PROVIDER?.trim().toLowerCase();
  const provider: 'openai' | 'gemini' | undefined = raw
    ? (raw === 'openai' || raw === 'gemini' ? raw : undefined)
    : e.QMD_OPENAI_API_KEY ? 'openai' : e.QMD_GEMINI_API_KEY ? 'gemini' : undefined;

  if (!provider) return null;

  // Per-provider env lookups, deduplicated into one table instead of two
  // copy-pasted branches.
  const table = {
    openai: {
      apiKey: e.QMD_LLM_FILTER_OPENAI_API_KEY || e.QMD_OPENAI_API_KEY,
      baseUrl: e.QMD_LLM_FILTER_OPENAI_BASE_URL || e.QMD_OPENAI_BASE_URL,
      model: e.QMD_LLM_FILTER_MODEL || e.QMD_LLM_FILTER_OPENAI_MODEL || e.QMD_OPENAI_MODEL || 'gpt-4o-mini',
    },
    gemini: {
      apiKey: e.QMD_LLM_FILTER_GEMINI_API_KEY || e.QMD_GEMINI_API_KEY,
      baseUrl: e.QMD_LLM_FILTER_GEMINI_BASE_URL || e.QMD_GEMINI_BASE_URL,
      model: e.QMD_LLM_FILTER_MODEL || e.QMD_LLM_FILTER_GEMINI_MODEL || e.QMD_GEMINI_MODEL || 'gemini-2.5-flash',
    },
  } as const;

  const { apiKey, baseUrl, model } = table[provider];
  if (!apiKey) return null;

  const cfg: RemoteLLMConfig = provider === 'openai'
    ? { rerankProvider: 'openai', rerankMode: 'llm', openai: { apiKey, baseUrl, model } }
    : { rerankProvider: 'gemini', rerankMode: 'llm', gemini: { apiKey, baseUrl, model } };

  _llmFilter = { llm: new RemoteLLM(cfg), model };
  return _llmFilter;
}

/**
 * Final-stage LLM filter ("文献器"): ask an LLM to select the query-relevant
 * documents and extract their core content.
 *
 * Best-effort by design: returns an empty array when there are no input
 * documents, when no filter is configured via environment variables, or when
 * the LLM call fails (the failure is reported on stderr, never thrown).
 */
async function llmFilterExtract(
  query: string,
  documents: { file: string; text: string }[],
  _opts: OutputOptions,
): Promise<{ file: string; score: number; extract?: string }[]> {
  if (!documents.length) return [];

  const filter = getLlmFilter();
  if (!filter) return [];

  if (!_quietMode) {
    process.stderr.write(
      `${c.dim}LLM 文献器:提取相关内容(${documents.length} items, model=${filter.model})...${c.reset}\n`,
    );
  }

  try {
    // Cap each document at 4000 chars to bound prompt size.
    const capped: RerankDocument[] = documents.map(({ file, text }) => ({
      file,
      text: text.slice(0, 4000),
    }));
    const { results } = await filter.llm.rerank(query, capped, { model: filter.model });
    return results.map(({ file, score, extract }) => ({ file, score, extract }));
  } catch (err) {
    process.stderr.write(`${c.yellow}LLM 文献器失败:${String(err)}${c.reset}\n`);
    return [];
  }
}

function formatTimeAgo(date: Date): string {
const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
if (seconds < 60) return `${seconds}s ago`;
Expand Down Expand Up @@ -2530,8 +2610,10 @@ async function _querySearchImpl(query: string, opts: OutputOptions, embedModel:
// Give 2x weight to original query results (first 2 lists: FTS + vector)
const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
const fused = reciprocalRankFusion(rankedLists, weights);
// Hard cap reranking for latency/cost. We rerank per-document (best chunk only).
const RERANK_DOC_LIMIT = parseInt(process.env.QMD_RERANK_DOC_LIMIT || "40", 10);
// Hard cap reranking for latency/cost.
// In cross-encoder rerank mode, we can safely rerank more docs than LLM mode.
const defaultRerankDocLimit = (process.env.QMD_RERANK_MODE || 'llm') === 'rerank' ? "120" : "40";
const RERANK_DOC_LIMIT = parseInt(process.env.QMD_RERANK_DOC_LIMIT || defaultRerankDocLimit, 10);
const candidates = fused.slice(0, RERANK_DOC_LIMIT);

if (candidates.length === 0) {
Expand Down Expand Up @@ -2675,7 +2757,7 @@ async function _querySearchImpl(query: string, opts: OutputOptions, embedModel:
const candidateMap = new Map(candidates.map(cand => [cand.file, { displayPath: cand.displayPath, title: cand.title, body: cand.body }]));
const rrfRankMap = new Map(candidates.map((cand, i) => [cand.file, i + 1])); // 1-indexed rank

const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx, extract }]) => {
let finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx, extract }]) => {
const candidate = candidateMap.get(file);
const chunkInfo = docChunkMap.get(file);

Expand Down Expand Up @@ -2725,6 +2807,33 @@ async function _querySearchImpl(query: string, opts: OutputOptions, embedModel:

finalResults.sort((a, b) => b.score - a.score);

if (_profile) { _timings.push({ step: "排序", ms: Date.now() - _tStep, detail: `${finalResults.length}条结果` }); _tStep = Date.now(); }

// Optional final-stage LLM filter/extractor ("文献器"):
// After cross-encoder rerank, let an LLM select + extract the truly relevant items.
// IMPORTANT: do NOT reorder results here — keep the reranker-based ranking stable.
const llmFilterDocLimit = parseInt(process.env.QMD_LLM_FILTER_DOC_LIMIT || "0", 10);
if (!hasExtracts && llmFilterDocLimit > 0) {
const _tFilter0 = Date.now();
const input = finalResults.slice(0, Math.min(finalResults.length, llmFilterDocLimit));
const extracted = await llmFilterExtract(query, input.map(r => ({ file: r.file, text: r.body })), opts);

if (extracted.length > 0) {
const extractByFile = new Map(extracted.map(r => [r.file, r.extract] as const));
// Keep ordering; only replace the snippet/body with extracted content when available.
finalResults = finalResults.map(r => {
const ex = extractByFile.get(r.file);
return ex ? { ...r, body: ex } : r;
});
}

if (_profile) {
_timings.push({ step: "文献器", ms: Date.now() - _tFilter0, detail: `${Math.min(finalResults.length, llmFilterDocLimit)}→${extracted.length} 提取(不改排序)` });
_tStep = Date.now();
}
}


// DEBUG: after sort
if (_verbose) {
console.log("\n=== AFTER SORT ===");
Expand All @@ -2743,7 +2852,6 @@ async function _querySearchImpl(query: string, opts: OutputOptions, embedModel:
// });

if (_profile) {
_timings.push({ step: "排序", ms: Date.now() - _tStep, detail: `${finalResults.length}条结果` });
const totalMs = Date.now() - _t0;
process.stderr.write(`\n${c.dim}步骤\tms\t占比\t详情${c.reset}\n`);
for (const t of _timings) {
Expand Down
4 changes: 3 additions & 1 deletion src/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB

// Chunking: configurable via QMD_CHUNK_SIZE_TOKENS and QMD_CHUNK_OVERLAP_TOKENS env vars

/** Parse an env var as a base-10 integer, falling back when unset or not a number. */
function _envInt(name: string, fallback: number): number {
  const parsed = parseInt(process.env[name] ?? "", 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

// Chunk size is floored at 1 token: a zero/negative (or NaN) size would make
// the chunking step non-positive — the same hang the overlap clamp below
// guards against — and NaN would otherwise propagate through Math.min/max.
export const CHUNK_SIZE_TOKENS = Math.max(1, _envInt("QMD_CHUNK_SIZE_TOKENS", 200));
// Clamp overlap into [0, CHUNK_SIZE_TOKENS - 1] to guarantee a positive step
// (overlap >= size would never advance the chunking loop).
export const CHUNK_OVERLAP_TOKENS = Math.max(0, Math.min(_envInt("QMD_CHUNK_OVERLAP_TOKENS", 40), CHUNK_SIZE_TOKENS - 1));
// Fallback char-based approximation for sync chunking (~4 chars per token)
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4;
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4;
Expand Down