# phobert_embeding.py — 25 lines (22 loc), 965 Bytes
# (header reconstructed from a scraped GitHub file view; the gutter
#  line-number dump and page navigation text were removed)
import torch
from transformers import AutoModel, AutoTokenizer
import tensorflow as tf  # NOTE(review): unused in the visible code — candidate for removal
# Load the PhoBERT encoder from a local checkpoint directory and move it to GPU 0.
phobert = AutoModel.from_pretrained("./vinai/phobert-base", local_files_only=True).to(torch.device("cuda:0"))
# phobert.save_pretrained("vinai/phobert-base")  # save the model to disk after downloading
# For transformers v4.x+:
tokenizer = AutoTokenizer.from_pretrained("./vinai/phobert-base", use_fast=False, local_files_only=True)
# tokenizer.save_pretrained("vinai/phobert-base")  # save the tokenizer to disk after downloading
# Maximum sequence length in tokens.
# NOTE(review): MAX_LEN is defined but never used in the visible code — presumably
# enforced by a caller; verify before removing.
MAX_LEN = 256
print("LOAD phoBERT DONE")
def get_emb_vector(input_ids):
    """Return PhoBERT's last-hidden-state embeddings for one tokenized sentence.

    Parameters
    ----------
    input_ids : sequence of int
        Token ids for a single sentence (e.g. the output of ``text2ids``).

    Returns
    -------
    numpy.ndarray
        Array of shape (seq_len, hidden_size): one embedding vector per token,
        including the special BOS/EOS positions (stripping them with ``[1:-1]``
        is left commented out, as in the original).
    """
    # Reuse the model's own device instead of hard-coding "cuda:0" a second
    # time — keeps this function correct if the model is ever moved.
    device = next(phobert.parameters()).device
    # Build the 1-sentence batch with the right dtype and device in one step,
    # instead of chaining .to(torch.long) and .to(device).
    batch = torch.tensor([input_ids], dtype=torch.long, device=device)
    with torch.no_grad():  # inference only — skip gradient bookkeeping
        features = phobert(batch)
    # features[0] is the last hidden state, shape (1, seq_len, hidden);
    # drop the batch dimension before returning.
    emb_vecs = features[0].cpu().numpy()[0]  # [1:-1] would strip BOS/EOS
    return emb_vecs
def text2ids(text):
    """Encode *text* with the PhoBERT tokenizer and return its token ids."""
    # tokenizer.encode adds the special BOS/EOS ids itself,
    # e.g. tokenizer.encode("<pad> nhà <pad>").
    return tokenizer.encode(text)