Skip to content

research embeddings #17

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,429 changes: 1,762 additions & 667 deletions package-lock.json

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@
"title": "FireCoder: Login",
"icon": "$(key)",
"description": "Login to your account"
},
{
"command": "firecoder.testEmbed",
"title": "FireCoder: Start Test embed"
},
{
"command": "firecoder.clearLocalEmbeddings",
"title": "FireCoder: Clear Local Embeddings"
}
],
"keybindings": [
Expand Down Expand Up @@ -211,9 +219,11 @@
"dependencies": {
"@grafana/faro-core": "^1.3.5",
"@grafana/faro-web-sdk": "^1.3.5",
"@lancedb/lancedb": "^0.5.0",
"@langchain/community": "^0.0.27",
"@supabase/supabase-js": "^2.42.7",
"@xenova/transformers": "^2.17.1",
"ignore": "^5.3.1",
"langchain": "^0.1.17"
}
}
}
5 changes: 5 additions & 0 deletions src/common/download/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ const getModelInfo = async (
checksum:
"ec11bacb9e0b8c8e0f483f209c487939202b04bbf4f815f0a0945c5b256da895",
},
"embed-small": {
url: "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf",
checksum:
"f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb",
},
};

return models[typeModel];
Expand Down
205 changes: 205 additions & 0 deletions src/common/embedding/embedding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import * as vscode from "vscode";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { readFileSync } from "node:fs";
import { FileVectorStore } from "./fileVectorStore";
import { VSCodeDirectoryLoader } from "./vscodeDirectoryLoader";
import { index } from "langchain/indexes";
import { InMemoryRecordManager } from "@langchain/community/indexes/memory";
import { state } from "../utils/state";
import { Document } from "langchain/document";

/** Base URL of the endpoint on localhost (port 39729) that receives embedding requests. */
const EMBEDDING_SERVER_BASE_URL = "http://localhost:39729/v1";

/**
 * Shared embeddings client handed to every `FileVectorStore` created in this
 * module.
 *
 * It uses the OpenAI client but points it at `EMBEDDING_SERVER_BASE_URL`
 * instead of the hosted API. Requests are sent in batches of 64 texts with at
 * most 10 requests in flight.
 *
 * NOTE(review): "YOUR-API-KEY" looks like a placeholder that the local server
 * presumably ignores — confirm before pointing this client anywhere else.
 */
const embedding = new OpenAIEmbeddings({
  configuration: {
    baseURL: EMBEDDING_SERVER_BASE_URL,
  },
  apiKey: "YOUR-API-KEY",
  maxConcurrency: 10,
  batchSize: 64,
});

/**
 * Builds, persists and queries an embedding index of the workspace files.
 *
 * The vector store and the record manager are created lazily, cached on the
 * instance, and restored from / saved to `state.workspace` under the keys
 * "embedding" and "recordManager".
 *
 * `init()` must be called before `refreshIndex()`: `workspacePath` is only
 * assigned there.
 */
class Embeddings {
  private vectorStore?: FileVectorStore;
  private recordManager?: InMemoryRecordManager;
  private workspacePath!: vscode.Uri;

  /**
   * Remembers which workspace folder should be indexed.
   *
   * @param workspacePath Root URI handed to the directory loader.
   */
  public async init(workspacePath: vscode.Uri) {
    this.workspacePath = workspacePath;
  }

  /**
   * Loads and splits the workspace documents, syncs them into the vector
   * store through LangChain's `index()` helper (cleanup mode "full"), then
   * persists both the vectors and the record manager state.
   *
   * The state writes are awaited so that the returned promise only resolves
   * once persistence finished, and so that a failed write rejects here
   * instead of being silently dropped.
   */
  public async refreshIndex() {
    const vectorStore = this.initVectorStore();
    const recordManager = this.initRecordManager();
    const document = await this.loadDocuments();

    const processedDocument = await this.splitDocuments(document);

    const resultIndex = await index({
      docsSource: processedDocument,
      recordManager,
      vectorStore: vectorStore,
      options: {
        cleanup: "full",
        sourceIdKey: (doc) => doc.metadata.source,
      },
    });

    console.log(resultIndex);

    await state.workspace.update("embedding", vectorStore.serialize());

    // The records Map is not JSON-serializable as-is, so flatten it to a
    // plain object first.
    const recordManagerState = JSON.stringify(
      Object.fromEntries(recordManager.records)
    );
    await state.workspace.update("recordManager", recordManagerState);
  }

  /**
   * Returns the cached record manager, creating it on first use and
   * restoring its records from the "recordManager" workspace state entry
   * when one exists.
   */
  initRecordManager() {
    if (this.recordManager) {
      return this.recordManager;
    }

    const recordManager = new InMemoryRecordManager();
    const recordManagerStateExist = state.workspace.get("recordManager");

    if (recordManagerStateExist) {
      recordManager.records = new Map(
        Object.entries(JSON.parse(recordManagerStateExist))
      );
    }

    this.recordManager = recordManager;

    return recordManager;
  }

  /**
   * Returns the cached vector store, creating it on first use: deserialized
   * from the "embedding" workspace state entry when present, empty otherwise.
   */
  initVectorStore() {
    if (this.vectorStore) {
      return this.vectorStore;
    }

    const savedVectorStore = state.workspace.get("embedding");
    const vectorStore = savedVectorStore
      ? FileVectorStore.deserialize(savedVectorStore, embedding)
      : new FileVectorStore(embedding);
    this.vectorStore = vectorStore;
    return vectorStore;
  }

  /** Loads every document the directory loader yields for `workspacePath`. */
  async loadDocuments() {
    const loader = new VSCodeDirectoryLoader(this.workspacePath);
    const docs = await loader.load();

    return docs;
  }

  /**
   * Prefixes each document with "search_document: " and splits it into
   * chunks of at most 7000 characters with no overlap, using the "js"
   * separators.
   *
   * NOTE(review): the prefix is added before splitting, so only the first
   * chunk of a multi-chunk file carries it — confirm this is intended.
   *
   * @param docs Whole-file documents to split.
   * @returns The resulting chunk documents.
   */
  async splitDocuments(docs: Document[]) {
    const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
      chunkSize: 7000,
      chunkOverlap: 0,
    });

    const splittedDocuments = await splitter.splitDocuments(
      docs.map((doc) => ({
        ...doc,
        pageContent: "search_document: " + doc.pageContent,
      }))
    );
    return splittedDocuments;
  }

  /**
   * Finds the indexed files most similar to `activeDocument` and opens them.
   *
   * Up to 20 chunks are retrieved, excluding chunks whose `metadata.source`
   * equals the active document's `fsPath`. Several chunks may belong to the
   * same file, so sources are de-duplicated (keeping the order of the search
   * results) before the documents are opened.
   *
   * @param activeDocument Document whose full text is used as the query.
   * @returns The matching documents, or an empty array when the vector store
   *   has not been initialized yet.
   */
  public async getRelativeDocuments(
    activeDocument: vscode.TextDocument
  ): Promise<vscode.TextDocument[]> {
    if (!this.vectorStore) {
      return [];
    }

    const activePath = activeDocument.uri.fsPath;
    const results = await this.vectorStore.similaritySearchWithScore(
      `search_document: ${activeDocument.getText()}`,
      20,
      (doc) => doc.metadata.source !== activePath
    );

    const uniqueSources = Array.from(
      new Set<string>(results.map(([doc]) => doc.metadata.source))
    );

    return Promise.all(
      uniqueSources.map((source) =>
        vscode.workspace.openTextDocument(Embeddings.sourceToUri(source))
      )
    );
  }

  /**
   * Converts a `metadata.source` value into a URI.
   *
   * The similarity filter compares sources against `fsPath`, so they are
   * treated as file system paths and go through `Uri.file` — `Uri.parse`
   * would read a Windows drive letter as a URI scheme. Values that already
   * look like `scheme://...` are still parsed as URIs.
   */
  private static sourceToUri(source: string): vscode.Uri {
    const looksLikeUri = /^[a-z][a-z0-9+.-]*:\/\//i.test(source);
    return looksLikeUri ? vscode.Uri.parse(source) : vscode.Uri.file(source);
  }
}

/**
 * Module-wide `Embeddings` instance, created when this module is loaded.
 * `init()` has to be called on it before `refreshIndex()` can load documents.
 */
export const embeddings = new Embeddings();

/**
 * Manual end-to-end check of the embedding pipeline.
 *
 * Indexes the first workspace folder (load, prefix, split, `index()` with
 * cleanup mode "full"), persists the vector store and record manager to
 * `state.workspace`, then times a 20-result similarity search and logs each
 * hit as `[score, source]`.
 *
 * The query text is the content of the active editor. When no workspace
 * folder is open nothing is done; when no editor is active the index is still
 * refreshed but the timed search is skipped.
 */
export const startTest = async () => {
  const workspaceFolder = vscode.workspace.workspaceFolders?.[0];
  if (!workspaceFolder) {
    console.warn("Test embed skipped: no workspace folder is open");
    return;
  }

  const loader = new VSCodeDirectoryLoader(workspaceFolder.uri);

  const docs = await loader.load();

  const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
    chunkSize: 7000,
    chunkOverlap: 0,
  });

  const jsOutput = await splitter.splitDocuments(
    docs.map((doc) => ({
      ...doc,
      pageContent: "search_document: " + doc.pageContent,
    }))
  );

  // Restore the previously persisted vectors, if any, so index() only has to
  // embed what changed.
  const savedVectorStore = state.workspace.get("embedding");
  const vectorStore = savedVectorStore
    ? FileVectorStore.deserialize(savedVectorStore, embedding)
    : new FileVectorStore(embedding);

  const recordManager = new InMemoryRecordManager();
  const recordManagerStateExist = state.workspace.get("recordManager");

  if (recordManagerStateExist) {
    recordManager.records = new Map(
      Object.entries(JSON.parse(recordManagerStateExist))
    );
  }

  console.log(
    await index({
      docsSource: jsOutput,
      recordManager,
      vectorStore,
      options: {
        cleanup: "full",
        sourceIdKey: (doc) => doc.metadata.source,
      },
    })
  );

  // Await the writes so a failed persistence rejects here instead of being
  // dropped.
  await state.workspace.update("embedding", vectorStore.serialize());
  const recordManagerState = JSON.stringify(
    Object.fromEntries(recordManager.records)
  );

  await state.workspace.update("recordManager", recordManagerState);

  // Query with whatever file the user is looking at rather than a path that
  // only exists on one machine.
  const activeDocument = vscode.window.activeTextEditor?.document;
  if (!activeDocument) {
    console.warn("Test embed: no active editor, similarity search skipped");
    return;
  }
  const file = activeDocument.getText();

  const start = performance.now();
  const resultOne = await vectorStore.similaritySearchWithScore(
    `search_document: ${file}`,
    20
  );
  const end = performance.now();

  console.log(`Full Time: ${(end - start).toFixed(2)}ms`);
  console.log(
    resultOne.map((document) => [document[1], document[0].metadata.source])
  );
};
67 changes: 67 additions & 0 deletions src/common/embedding/fileVectorStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { randomUUID } from "crypto";
import { EmbeddingsInterface } from "langchain/embeddings/base";
import {
MemoryVectorStore,
MemoryVectorStoreArgs,
} from "langchain/vectorstores/memory";

/**
 * In-memory vector store that can be dumped to and restored from a JSON
 * string, and that supports deletion by id.
 *
 * Every stored document gets an `id` entry in its metadata (the caller's id
 * when provided, a random UUID otherwise); `delete` matches on that entry.
 */
export class FileVectorStore extends MemoryVectorStore {
  /**
   * @param embeddings Embeddings client used for documents and queries.
   * @param args Base store options plus optional pre-computed
   *   `memoryVectors` that replace the initially empty store content. The
   *   vectors are used as given; they are not validated here.
   */
  constructor(
    embeddings: EmbeddingsInterface,
    {
      similarity,
      memoryVectors,
      ...rest
    }: MemoryVectorStoreArgs & { memoryVectors?: any } = {}
  ) {
    super(embeddings, { similarity, ...rest });
    if (memoryVectors) {
      this.memoryVectors = memoryVectors;
    }
  }

  /**
   * Embeds the documents and stores them, tagging each one with
   * `metadata.id`.
   *
   * @param documents Documents to embed; only `pageContent` is embedded.
   * @param options `ids[i]` becomes the id of `documents[i]`; missing entries
   *   fall back to a random UUID.
   */
  override async addDocuments(
    documents: any[],
    options?: { ids?: string[] }
  ): Promise<void> {
    // Nothing to embed: skip the call to the embeddings backend entirely.
    if (documents.length === 0) {
      return;
    }

    const texts = documents.map(({ pageContent }) => pageContent);
    const vectors = await this.embeddings.embedDocuments(texts);
    const taggedDocuments = documents.map((doc, position) => ({
      ...doc,
      metadata: {
        ...doc.metadata,
        id: options?.ids?.[position] ?? randomUUID(),
      },
    }));

    return this.addVectors(vectors, taggedDocuments);
  }

  /** Serializes the stored vectors to a JSON string. */
  public serialize(): string {
    return JSON.stringify(this.memoryVectors);
  }

  /**
   * Rebuilds a store from a string produced by `serialize()`.
   *
   * @param index JSON string holding the stored vectors.
   * @param embeddings Embeddings client for the restored store.
   * @param dbConfig Optional base store options (e.g. `similarity`).
   * @throws SyntaxError when `index` is not valid JSON.
   * @throws TypeError when the JSON holds a truthy value that is not an
   *   array. Falsy values still yield an empty store.
   */
  static deserialize(
    index: string,
    embeddings: EmbeddingsInterface,
    dbConfig?: MemoryVectorStoreArgs
  ): FileVectorStore {
    const parsed: unknown = JSON.parse(index);
    if (parsed && !Array.isArray(parsed)) {
      // Fail here rather than on the first search against a corrupt store.
      throw new TypeError(
        "FileVectorStore.deserialize: expected a JSON array of vectors"
      );
    }

    return new FileVectorStore(embeddings, {
      ...dbConfig,
      memoryVectors: parsed,
    });
  }

  /**
   * Removes every stored vector whose `metadata.id` is listed in `ids`.
   * Unknown ids are ignored.
   *
   * @param params.ids Ids of the vectors to drop.
   */
  public async delete(params: { ids: string[] }): Promise<void> {
    if (params.ids.length === 0) {
      return;
    }

    // One pass over the store with O(1) id lookups instead of a linear scan
    // plus splice per id.
    const idsToRemove = new Set(params.ids);
    this.memoryVectors = this.memoryVectors.filter(
      (vector) => !idsToRemove.has(vector.metadata.id)
    );
  }
}
2 changes: 2 additions & 0 deletions src/common/embedding/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
export { VSCodeDirectoryLoader } from "./vscodeDirectoryLoader";
export { FileVectorStore } from "./fileVectorStore";
Loading