dataset.py
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer


class MarkdownDataset(Dataset):
    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # max sequence length allowed by the model config
        # Use the tokenizer of a pretrained model
        # (AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True) could be used instead)
        self.tokenizer = AutoTokenizer.from_pretrained("tals/roberta_python")
        # If the tokenizer has no pad token yet, add one
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.fts = fts

    def __getitem__(self, index):
        row = self.df.iloc[index]
        # Tokenize the markdown cell itself
        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        # Tokenize the notebook's code cells as context
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True,
        )
        n_md = self.fts[row.id]["total_md"]
        # Number of code cells (assumes the feature dict stores it under "total_code")
        n_code = self.fts[row.id]["total_code"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            # Fraction of markdown cells in the notebook, used as an extra feature
            fts = torch.FloatTensor([n_md / (n_md + n_code)])
        # Concatenate the markdown token ids with each code cell's token ids
        # (dropping the trailing special token of each code sequence), then pad/truncate
        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)
        # Build the attention mask the same way, padding with 0 so the padded
        # positions are not attended to
        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [0] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)
        assert len(ids) == self.total_max_len
        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]
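

# Minimal usage sketch: shows how this dataset might be wired into a DataLoader.
# The DataFrame columns ("id", "source", "pct_rank"), the layout of the `fts`
# dict, and the hyperparameter values are assumptions inferred from how
# __getitem__ reads them, not values confirmed by the rest of the repository.
if __name__ == "__main__":
    import pandas as pd
    from torch.utils.data import DataLoader

    df = pd.DataFrame({
        "id": ["nb1", "nb2"],
        "source": ["# load the data", "## plot the results"],
        "pct_rank": [0.25, 0.75],
    })
    fts = {
        "nb1": {"codes": ["import pandas as pd", "df = pd.read_csv('train.csv')"],
                "total_code": 2, "total_md": 1},
        "nb2": {"codes": ["df.plot()"], "total_code": 1, "total_md": 1},
    }

    dataset = MarkdownDataset(
        df,
        model_name_or_path="tals/roberta_python",  # not used by __init__ as written
        total_max_len=512,
        md_max_len=64,
        fts=fts,
    )
    loader = DataLoader(dataset, batch_size=2, shuffle=False)
    ids, mask, fts_feat, target = next(iter(loader))
    print(ids.shape, mask.shape, fts_feat.shape, target.shape)
    # expected: torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1]) torch.Size([2, 1])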