-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1_embeddings.py
More file actions
executable file
·31 lines (26 loc) · 1006 Bytes
/
1_embeddings.py
File metadata and controls
executable file
·31 lines (26 loc) · 1006 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/env python3
"""Generate document embeddings. Full and reduced to 2d."""
import numpy as np
from sentence_transformers import SentenceTransformer
from umap import UMAP
from util import get_docs, EMBEDDINGS_PATH, config
# Read the flag that selects whether get_docs() is asked for translations.
embed_translation = config.get("EMBED_TRANSLATION", False)
if isinstance(embed_translation, str):
    # Compare case-insensitively and ignore surrounding whitespace so that
    # spellings such as "TRUE" or "On" are not silently treated as disabled.
    use_translation = embed_translation.strip().lower() in {"true", "1", "on"}
else:
    # Non-string values: only True (or a value equal to it, such as 1)
    # enables the flag; None, False and anything else leave it disabled.
    use_translation = bool(embed_translation == 1)

docs = get_docs(translation=use_translation)

# Encode every document with the configured sentence-transformers model.
sentence_model = SentenceTransformer(config.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2"))
embeddings = sentence_model.encode(docs, show_progress_bar=True)

# Save the full embeddings. Writing through an open file handle keeps the
# file name exactly as given by EMBEDDINGS_PATH.
with open(EMBEDDINGS_PATH, "wb") as f:
    np.save(f, embeddings, allow_pickle=False)

# Reduce the embeddings to two components with UMAP.
umap_model = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    low_memory=True,
    random_state=42,  # fixed seed; presumably for reproducible layouts
)
doc_embeddings_reduced = umap_model.fit_transform(embeddings)

# Save the reduced embeddings next to the full ones, under the same name with
# a ".2d.npy" suffix (assumes EMBEDDINGS_PATH is a pathlib-style path).
with open(EMBEDDINGS_PATH.with_suffix(".2d.npy"), "wb") as f:
    np.save(f, doc_embeddings_reduced, allow_pickle=False)