Skip to content

Commit f91de72

Browse files
committed
2 parents a452efa + 04cedd1 commit f91de72

5 files changed

Lines changed: 635 additions & 91 deletions

File tree

.claude/settings.local.json

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,19 @@
2222
"WebFetch(domain:www.npmjs.com)",
2323
"WebSearch",
2424
"Bash(cargo check:*)",
25-
"Bash(tsc:*)"
25+
"Bash(tsc:*)",
26+
"WebFetch(domain:reeden.app)",
27+
"WebFetch(domain:github.com)",
28+
"WebFetch(domain:article.juejin.cn)",
29+
"WebFetch(domain:blog.51cto.com)",
30+
"WebFetch(domain:fast.v2ex.com)",
31+
"WebFetch(domain:www.oomol.com)",
32+
"WebFetch(domain:developer.aliyun.com)",
33+
"WebFetch(domain:raw.githubusercontent.com)",
34+
"WebFetch(domain:stackoverflow.com)",
35+
"WebFetch(domain:mozilla.github.io)",
36+
"WebFetch(domain:jsdev.space)",
37+
"WebFetch(domain:pdf-lib.js.org)"
2638
]
2739
}
2840
}

packages/core/src/rag/index.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,24 @@ export {
2121
clearChunkCache,
2222
} from "./search";
2323

24+
// Tokenizer exports
25+
export { tokenize, tokenizeQuery, getTokenFrequencies } from "./tokenizer";
26+
27+
// Inverted index exports
28+
export {
29+
buildInvertedIndex,
30+
searchInvertedIndex,
31+
getMatchingDocIds,
32+
getIntersectingDocIds,
33+
getIndexStats,
34+
} from "./inverted-index";
35+
export type {
36+
Posting,
37+
IndexEntry,
38+
DocMeta,
39+
InvertedIndex,
40+
} from "./inverted-index";
41+
2442
export { vectorizeBook } from "./vectorize";
2543
export type { VectorizeCallback } from "./vectorize";
2644

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
/**
2+
* Inverted Index for BM25 search
3+
*
4+
* Optimizes search by pre-computing:
5+
* - Term → Document IDs mapping (inverted index)
6+
* - Document term frequencies
7+
* - IDF scores
8+
*
9+
* Time complexity:
10+
* - Build: O(n * m) where n = docs, m = avg tokens per doc
11+
* - Search: O(k * d + r log r) where k = query terms, d = avg docs per term, r = matched docs (sorted before the top-K cut)
12+
*
13+
* vs. naive approach O(k * n * m) for every query
14+
*/
15+
16+
import { tokenize, getTokenFrequencies } from "./tokenizer";
17+
18+
/**
 * Posting entry: one document's record inside a term's posting list
 * (document ID + term frequency).
 */
export interface Posting {
  /** ID of the document that contains the term. */
  docId: string;
  /** Term frequency in this document. */
  tf: number;
}
23+
24+
/**
 * Inverted index entry for a single term.
 *
 * `buildInvertedIndex` appends one posting and increments `df` once per
 * document containing the term, so `df` tracks `postings.length`.
 */
export interface IndexEntry {
  /** List of documents containing this term. */
  postings: Posting[];
  /** Document frequency (number of docs containing this term). */
  df: number;
  /**
   * Pre-computed IDF score. `buildInvertedIndex` fills it with
   * log((N - df + 0.5) / (df + 0.5) + 1), where N is the total document count.
   */
  idf: number;
}
33+
34+
/** Per-document metadata kept alongside the posting lists. */
export interface DocMeta {
  /** ID of the document this record describes. */
  docId: string;
  /** Token count (document length), used for BM25 length normalization. */
  length: number;
}
40+
41+
/** Inverted index structure: posting lists plus the corpus statistics BM25 needs. */
export interface InvertedIndex {
  /** Term → IndexEntry mapping. */
  termIndex: Map<string, IndexEntry>;
  /** Document metadata, keyed by document ID. */
  docMeta: Map<string, DocMeta>;
  /** Total number of documents (N in the IDF formula). */
  totalDocs: number;
  /** Average document length in tokens; 0 for an empty index. */
  avgDocLength: number;
}
52+
53+
/**
54+
* Build inverted index from documents
55+
*
56+
* @param documents - Array of { id, content } objects
57+
* @param tokenizeFn - Tokenization function (default: built-in tokenizer)
58+
* @returns Inverted index
59+
*
60+
* @example
61+
* const index = buildInvertedIndex([
62+
* { id: "1", content: "Hello world" },
63+
* { id: "2", content: "World of AI" }
64+
* ]);
65+
*/
66+
export function buildInvertedIndex(
67+
documents: Array<{ id: string; content: string }>,
68+
tokenizeFn: (text: string) => string[] = tokenize,
69+
): InvertedIndex {
70+
const termIndex = new Map<string, IndexEntry>();
71+
const docMeta = new Map<string, DocMeta>();
72+
let totalLength = 0;
73+
74+
for (const doc of documents) {
75+
const tokens = tokenizeFn(doc.content);
76+
const tokenFreqs = getTokenFrequencies(tokens);
77+
78+
// Store document metadata
79+
docMeta.set(doc.id, { docId: doc.id, length: tokens.length });
80+
totalLength += tokens.length;
81+
82+
// Update inverted index
83+
for (const [term, tf] of tokenFreqs) {
84+
let entry = termIndex.get(term);
85+
if (!entry) {
86+
entry = { postings: [], df: 0, idf: 0 };
87+
termIndex.set(term, entry);
88+
}
89+
entry.postings.push({ docId: doc.id, tf });
90+
entry.df++;
91+
}
92+
}
93+
94+
const totalDocs = documents.length;
95+
const avgDocLength = totalDocs > 0 ? totalLength / totalDocs : 0;
96+
97+
// Pre-compute IDF for all terms
98+
for (const entry of termIndex.values()) {
99+
// BM25 IDF formula: log((N - df + 0.5) / (df + 0.5) + 1)
100+
entry.idf = Math.log((totalDocs - entry.df + 0.5) / (entry.df + 0.5) + 1);
101+
}
102+
103+
return { termIndex, docMeta, totalDocs, avgDocLength };
104+
}
105+
106+
/**
 * Search using inverted index with BM25 scoring.
 *
 * Only the posting lists of the query terms are visited. Each posting adds
 * `idf * tf-normalization` to its document's running score. A term that
 * appears several times in `queryTerms` contributes once per occurrence.
 *
 * @param index - Inverted index
 * @param queryTerms - Tokenized query terms; terms absent from the index are ignored
 * @param topK - Maximum number of results; a value that is not greater than
 *   zero (including NaN) yields an empty array
 * @param k1 - BM25 term frequency saturation parameter (default: 1.5)
 * @param b - BM25 document length normalization parameter (default: 0.75)
 * @returns Array of { docId, score } sorted by score descending
 */
export function searchInvertedIndex(
  index: InvertedIndex,
  queryTerms: string[],
  topK: number,
  k1 = 1.5,
  b = 0.75,
): Array<{ docId: string; score: number }> {
  // A negative topK would make slice(0, topK) trim results from the END
  // instead of returning none; `!(topK > 0)` also rejects NaN.
  if (queryTerms.length === 0 || index.totalDocs === 0 || !(topK > 0)) {
    return [];
  }

  const { termIndex, docMeta, avgDocLength } = index;

  // Accumulate scores for each document
  const scores = new Map<string, number>();

  for (const term of queryTerms) {
    const entry = termIndex.get(term);
    if (entry === undefined) continue; // Term not in index

    for (const { docId, tf } of entry.postings) {
      const docLength = docMeta.get(docId)?.length ?? 0;

      // With a zero average length the ratio would be 0/0 = NaN and poison
      // both the score and the sort; fall back to neutral normalization.
      const lengthRatio = avgDocLength > 0 ? docLength / avgDocLength : 1;

      // BM25 score for this term in this document
      const normalizedTf = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * lengthRatio));
      const termScore = entry.idf * normalizedTf;

      scores.set(docId, (scores.get(docId) ?? 0) + termScore);
    }
  }

  // Sort by score and return top K (comparator names avoid shadowing `b`)
  return Array.from(scores, ([docId, score]) => ({ docId, score }))
    .sort((left, right) => right.score - left.score)
    .slice(0, topK);
}
154+
155+
/**
 * Get document IDs that contain ANY of the query terms (OR query, for filtering).
 *
 * @param index - Inverted index
 * @param queryTerms - Tokenized query terms; terms absent from the index are ignored
 * @returns Union of the document IDs found in the posting lists of the query terms
 */
export function getMatchingDocIds(
  index: InvertedIndex,
  queryTerms: string[],
): Set<string> {
  const matchingDocs = new Set<string>();

  for (const term of queryTerms) {
    const entry = index.termIndex.get(term);
    if (entry === undefined) continue;

    for (const { docId } of entry.postings) {
      matchingDocs.add(docId);
    }
  }

  return matchingDocs;
}
175+
176+
/**
 * Get document IDs that contain ALL of the query terms (AND query).
 *
 * The candidate set is seeded from the first term's posting list and then
 * narrowed in place by each remaining term, so the result keeps the order of
 * the first term's postings.
 *
 * @param index - Inverted index
 * @param queryTerms - Tokenized query terms
 * @returns IDs present in every term's posting list; empty when `queryTerms`
 *   is empty or any term is absent from the index
 */
export function getIntersectingDocIds(
  index: InvertedIndex,
  queryTerms: string[],
): Set<string> {
  // Destructuring (rather than queryTerms[0]) keeps the undefined case explicit,
  // so this also type-checks under `noUncheckedIndexedAccess`.
  const [firstTerm, ...remainingTerms] = queryTerms;
  if (firstTerm === undefined) return new Set();

  // Get docs for first term
  const firstEntry = index.termIndex.get(firstTerm);
  if (firstEntry === undefined) return new Set();

  const result = new Set<string>();
  for (const { docId } of firstEntry.postings) {
    result.add(docId);
  }

  // Intersect with docs for remaining terms
  for (const term of remainingTerms) {
    const entry = index.termIndex.get(term);
    if (entry === undefined) return new Set(); // No docs contain this term

    const termDocs = new Set<string>();
    for (const { docId } of entry.postings) {
      termDocs.add(docId);
    }

    // Deleting the current element while iterating a Set is well-defined.
    for (const id of result) {
      if (!termDocs.has(id)) {
        result.delete(id);
      }
    }

    // Nothing left to narrow: stop early
    if (result.size === 0) return result;
  }

  return result;
}
204+
205+
/**
 * Get index statistics.
 *
 * @param index - Inverted index
 * @returns
 *   - `totalDocs`: the index's document count
 *   - `totalTerms`: number of distinct terms in the term index
 *   - `avgDocLength`: the average length stored on the index
 *   - `avgTermsPerDoc`: sum of the recorded document lengths divided by
 *     `totalDocs` (0 when the index has no documents). Lengths are token
 *     counts, so this averages tokens, not distinct terms.
 */
export function getIndexStats(index: InvertedIndex): {
  totalDocs: number;
  totalTerms: number;
  avgDocLength: number;
  avgTermsPerDoc: number;
} {
  const { totalDocs, avgDocLength, termIndex, docMeta } = index;

  let totalTermsInDocs = 0;
  for (const { length } of docMeta.values()) {
    totalTermsInDocs += length;
  }

  return {
    totalDocs,
    totalTerms: termIndex.size,
    avgDocLength,
    avgTermsPerDoc: totalDocs > 0 ? totalTermsInDocs / totalDocs : 0,
  };
}

0 commit comments

Comments
 (0)