from abc import ABC, abstractmethod
from datasets import Dataset, Value
import os
import copy
import json
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
from torchtext.data.utils import get_tokenizer
from gensim.models import KeyedVectors
from utils import NpEncoder
class Preprocessor(ABC):
    '''
    Description
    -----------
    Abstract base class from which the built-in and custom preprocessors
    are derived.

    Attributes
    ----------
    checkpoint_dir : str
    max_seq_len : int
    batch_size : int
    prepend_labels : bool
    mode : str

    Methods
    -------
    process_data():
        Given a dataset loaded from the HF hub or a local path, return the
        preprocessed dataset.
    '''
def __init__(self, checkpoint_dir, max_seq_len, batch_size,
prepend_labels, mode):
self.checkpoint_dir = checkpoint_dir
self.max_seq_len = max_seq_len
self.batch_size = batch_size
self.prepend_labels = prepend_labels
self.mode = mode
@abstractmethod
def process_data(self, data):
'''
Description
-----------
        Given a dataset loaded from the HF hub or a local path, return the
        preprocessed dataset.

        Parameters
        ----------
        data : ``Dataset``
            A HF dataset with at least 'label' and 'text' columns.
'''
pass
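

# Illustrative sketch, not used anywhere in this module: a concrete
# preprocessor only has to implement `process_data`. The class name and the
# lower-casing map below are hypothetical examples; the 'text' column name
# follows the convention assumed by the built-in preprocessors.
class Example_Lowercase_Preprocessor(Preprocessor):
    def process_data(self, data):
        # Keep the HF ``Dataset`` interface and just normalise the text.
        return data.map(lambda ex: {'text': ex['text'].lower()})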
class Preprocessor_for_RNN(Preprocessor):
def __init__(self, embed_dir_processed, embed_dir_unprocessed,
vocab_size=20000, embed_type='glove', embed_size=300,
**kwargs):
super().__init__(**kwargs)
self.embed_dir_processed = embed_dir_processed
self.embed_dir_unprocessed = embed_dir_unprocessed
self.vocab_size = vocab_size
self.embed_type = embed_type
self.embed_size = embed_size
self.tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
self.PAD = '<PAD>'
self.SOS = '<SOS>'
self.EOS = '<EOS>'
self.UNK = '<UNK>'
if self.embed_type not in ['none']:
self.vocab, self.embeds, self.word2idx, self.idx2word =\
self._make_vocab_and_embeds_files()
else:
self.vocab, self.embeds, self.word2idx, self.idx2word = (None,)*4
def _make_vocab_and_embeds_files(self):
'''
        Returns:
            vocab (np.ndarray): 1D array of strings, the untrimmed vocabulary.
            embeds (np.ndarray): 2D array of shape
                (untrimmed vocabulary size, self.embed_size).
            word2idx (dict): token -> index in the untrimmed vocabulary
                (plus the <UNK> token).
            idx2word (dict): index -> token, the inverse of word2idx.
'''
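        # Cache file names follow the pattern built just below; for example,
        # with embed_type='glove' and embed_size=300 they would be
        # 'vocab_typeglove_d300_np.npy' and 'embeds_typeglove_d300_np.npy'
        # (shown for illustration only).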
vocab_file = os.path.join(
self.embed_dir_processed,
f'vocab_type{self.embed_type}_d{self.embed_size}_np.npy')
embeds_file = os.path.join(
self.embed_dir_processed,
f'embeds_type{self.embed_type}_d{self.embed_size}_np.npy')
try:
with open(vocab_file, 'rb') as v_f:
vocab = np.load(v_f)
with open(embeds_file, 'rb') as e_f:
embeds = np.load(e_f)
print("Loaded vocabulary and embedding files...")
except FileNotFoundError:
print("Preparing vocabulary and embedding files...")
if self.embed_type.lower() == 'glove':
vocab, embeds = self.make_vocab_and_embeds_glove(
vocab_file, embeds_file)
elif self.embed_type.lower() in ['word2vec', 'w2v']:
vocab, embeds = self.make_vocab_and_embeds_w2v(
vocab_file, embeds_file)
else:
raise Exception(
"'embed_type' can only be 'glove', 'word2vec', or 'none'.")
indexes = np.arange(vocab.size)
word2idx = {}
idx2word = {}
for idx, word in zip(indexes, vocab):
word2idx[word] = idx
idx2word[idx] = word
# Additionally adding <UNK> (will be reindexed later)
word2idx[self.UNK] = len(indexes) + 1
idx2word[len(indexes) + 1] = self.UNK
return vocab, embeds, word2idx, idx2word
def make_vocab_and_embeds_w2v(self, vocab_file, embeds_file):
model = KeyedVectors.load_word2vec_format(self.embed_dir_unprocessed,
binary=True)
vocab_np = np.array(model.index_to_key)
embeds_np = model.vectors
del model
np.save(vocab_file, vocab_np)
np.save(embeds_file, embeds_np)
return vocab_np, embeds_np
def make_vocab_and_embeds_glove(self, vocab_file, embeds_file):
vocab, embeds = [], []
        with open(self.embed_dir_unprocessed, 'rt', encoding='utf-8') as f:
            everything = f.read().strip().split('\n')
        for line in everything:
            parts = line.split(' ')
            vocab.append(parts[0])
            embeds.append([float(val) for val in parts[1:]])
vocab_np = np.array(vocab) # vocab_size
embeds_np = np.array(embeds) # vocab_size X embed_dim
with open(vocab_file, 'wb') as f:
np.save(f, vocab_np)
with open(embeds_file, 'wb') as f:
np.save(f, embeds_np)
return vocab_np, embeds_np
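    # The parser above assumes the plain-text GloVe format: one token per
    # line followed by its vector components, e.g. (values shortened):
    #
    #     the 0.418 0.24968 -0.41242 ...
    #     cat 0.45281 -0.50108 -0.53714 ...
    #
    # so `embed_dir_unprocessed` should point at a file such as
    # glove.6B.300d.txt (file name given only as an example).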
def process_data(self, data, train_split=True, rewriting=False,
last_checkpoint_path=None, first_shard=True):
if self.embed_type not in ['none']:
data = self.process_data_embeds(
data, train_split=train_split, rewriting=rewriting,
last_checkpoint_path=last_checkpoint_path,
first_shard=first_shard)
else:
data = self.process_data_no_embeds(
data, train_split=train_split, rewriting=rewriting,
last_checkpoint_path=last_checkpoint_path,
first_shard=first_shard)
return data
def process_data_embeds(self, data, train_split=True, rewriting=False,
last_checkpoint_path=None, first_shard=True):
'''
Procedure:
1. Tokenize and vectorize raw data
2. Get the most frequent tokens
3. Recreate the vocabulary, embeddings and word2idx/idx2word
dictionaries based on only the most frequent tokens (or based
on existing saved files when rewriting.)
4. Reindex the data and pad
Additionally, if prepending labels:
Get a set of all labels in the training set as strings.
Prepend 'LABEL_' to each string in case it is a duplicate
vocabulary item.
Create corresponding indexes for each string label, starting from
the last vocabulary index (self.vocab_size + 4).
Add both strings and indexes to self.word2idx and self.idx2word
...
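
        Example (symbolic, for illustration only): with max_seq_len=8 and
        prepend_labels=True, a two-token document ('the cat', label 3) ends
        up roughly as
            (tensor([SOS, LABEL_3, the, cat, EOS, PAD, PAD, PAD]), 5,
             <index of LABEL_3>)
        where each name stands for its index in the trimmed vocabulary.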
'''
        if self.prepend_labels and train_split and first_shard:
            labels = [doc['label'] for doc in data]
            str_labels = ['LABEL_' + str(str_lab)
                          for str_lab in sorted(set(labels))]
            # Label tokens are indexed after the trimmed vocabulary plus the
            # four special tokens (PAD, UNK, SOS, EOS).
            idx_labels = np.arange(len(str_labels)) + self.vocab_size + 4
        else:
            idx_labels = None
data = self._tokenize_and_vectorize(data)
if first_shard and train_split:
if rewriting:
try:
old_idx2word = copy.deepcopy(self.idx2word)
self.vocab, self.embeds, self.word2idx, self.idx2word =\
self._load_existing_compact_embeds(last_checkpoint_path)
                except FileNotFoundError:
print("Could not load existing word2idx and idx2word "
"dictionaries, rebuilding based on specified dataset. "
"If pre-training and rewriting on two different "
"datasets, MAKE SURE the vocabularies are the same "
"for both.")
top_idxs = self._get_frequency(data)
old_idx2word = copy.deepcopy(self.idx2word)
self.vocab, self.embeds, self.word2idx, self.idx2word =\
self._prepare_compact_embeds(top_idxs,
idx_labels=idx_labels)
else:
top_idxs = self._get_frequency(data)
old_idx2word = copy.deepcopy(self.idx2word)
self.vocab, self.embeds, self.word2idx, self.idx2word =\
self._prepare_compact_embeds(top_idxs,
idx_labels=idx_labels)
else:
old_idx2word = None
if self.prepend_labels and train_split and first_shard:
# Add label strings and indexes to the existing word2idx and
# idx2word dictionaries
for idx, lab in enumerate(str_labels):
self.word2idx[lab] = idx_labels[idx]
self.idx2word[idx_labels[idx]] = lab
len_labels = len(str_labels)
else:
len_labels = 0
data = self._reindex_data_and_pad(data, self.word2idx,
old_idx2word=old_idx2word)
if first_shard and train_split:
self.vocab_size = self.vocab_size + 4 + len_labels
return data
def _tokenize_and_vectorize(self, dataset):
data = []
for doc_dict in tqdm(dataset):
tokenized = self.tokenizer(doc_dict['text'].strip().lower())
tensor = torch.tensor(
[self.word2idx[token]
if token in self.word2idx.keys()
else self.word2idx[self.UNK]
for token in tokenized][:(self.max_seq_len-2)],
dtype=torch.long) # -2 for SOS+EOS tokens
length = tensor.size()[0]
data.append((tensor, length, doc_dict['label']))
return data
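    # At this stage each document is a (tensor_of_token_indexes, length,
    # label) triple, e.g. a 3-token document with label 1 might look like
    # (tensor([17, 4, 523]), 3, 1); the numbers are illustrative and still
    # index the untrimmed vocabulary.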
def _get_frequency(self, train_data):
relevant = torch.cat([val[0] for val in train_data])
# Ignore UNK token in counts
relevant = relevant[relevant != self.word2idx[self.UNK]]
counts = torch.bincount(relevant)
top_counts, top_indexes = torch.topk(counts, self.vocab_size)
return top_indexes
def _prepare_compact_embeds(self, top_indexes, idx_labels=None):
reindexes = np.arange(self.vocab_size+4)
new_vocab = self.vocab[top_indexes]
new_embeds = self.embeds[top_indexes, :]
new_vocab = np.insert(new_vocab, 0, self.PAD)
new_vocab = np.insert(new_vocab, 1, self.UNK)
new_vocab = np.insert(new_vocab, 2, self.SOS)
new_vocab = np.insert(new_vocab, 3, self.EOS)
# Pad token is all 0s
pad_emb_np = np.zeros((1, new_embeds.shape[1]))
# UNK token is mean of all other embeds
unk_emb_np = np.mean(new_embeds, axis=0, keepdims=True)
# SOS token is a random vector (standard normal)
sos_emb_np = np.random.normal(size=pad_emb_np.shape)
# EOS token is a random vector (standard normal)
eos_emb_np = np.random.normal(size=pad_emb_np.shape)
new_embeds = np.vstack((pad_emb_np, unk_emb_np, sos_emb_np,
eos_emb_np, new_embeds))
if self.prepend_labels:
idx_emb_nps = np.random.normal(size=(
len(idx_labels), pad_emb_np.shape[1]))
new_embeds = np.vstack((new_embeds, idx_emb_nps))
new_word2idx = {}
new_idx2word = {}
for idx, word in zip(reindexes, new_vocab):
new_word2idx[word] = idx
new_idx2word[idx] = word
# Save vocabulary, embeddings, word2idx and idx2word in checkpoint
# directory
np.save(os.path.join(self.checkpoint_dir, 'vocab.npy'), new_vocab)
np.save(os.path.join(self.checkpoint_dir, 'embeds.npy'), new_embeds)
with open(os.path.join(self.checkpoint_dir, 'word2idx.json'), 'w',
encoding='utf-8') as f:
json.dump(new_word2idx, f, ensure_ascii=False, indent=4,
cls=NpEncoder)
return new_vocab, new_embeds, new_word2idx, new_idx2word
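    # Resulting index layout (sketch): 0 = <PAD>, 1 = <UNK>, 2 = <SOS>,
    # 3 = <EOS>, 4 .. vocab_size + 3 = the most frequent tokens and, when
    # labels are prepended, vocab_size + 4 onwards = the 'LABEL_*' tokens
    # (whose embeddings are the random vectors stacked above).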
def _load_existing_compact_embeds(self, last_checkpoint_path):
'''
For rewriting mode, load the pre-trained vocabulary, embeddings,
word2idx and idx2word dictionaries.
'''
checkpoint_dir = os.path.abspath(
os.path.join(last_checkpoint_path, os.pardir))
new_vocab = np.load(os.path.join(checkpoint_dir, 'vocab.npy'))
new_embeds = np.load(os.path.join(checkpoint_dir, 'embeds.npy'))
with open(os.path.join(checkpoint_dir, 'word2idx.json'), 'r',
encoding='utf-8') as f:
new_word2idx = json.load(f)
new_idx2word = {idx: word for word, idx in new_word2idx.items()}
return new_vocab, new_embeds, new_word2idx, new_idx2word
def _reindex_data_and_pad(self, data, compact_word2idx, old_idx2word=None):
'''
Carries out three tasks:
1. Converts indexes of untrimmed vocabulary to indexes of trimmed
vocabulary.
2. Adds special tokens to existing tensor (SOS, EOS and PAD).
3. Prepends the label token to the front of the sequence (after SOS
token), if 'self.prepend_labels' is True.
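
        Example (symbolic, for illustration only): with max_seq_len=7 and
        prepend_labels=False, an input triple
            (tensor([old_the, old_cat]), 2, 0)
        becomes
            (tensor([SOS, new_the, new_cat, EOS, PAD, PAD, PAD]), 4, 0),
        where old_*/new_* are indexes in the untrimmed/trimmed vocabularies.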
'''
reindexed = []
for data_point in tqdm(data):
if old_idx2word is not None:
old_indexes = data_point[0]
words = [old_idx2word[idx.item()] for idx in old_indexes]
new_indexes = torch.tensor(
[compact_word2idx[word]
if word in compact_word2idx
else compact_word2idx[self.UNK] for word in words],
dtype=torch.long)
else:
# If subsequent shard
new_indexes = data_point[0]
if self.prepend_labels:
# In case of any new labels (generally shouldn't be the case)
if "LABEL_" + str(data_point[2]) in self.word2idx:
lab = self.word2idx["LABEL_" + str(data_point[2])]
else:
lab = self.word2idx[self.UNK]
new_indexes = torch.cat(
(torch.tensor([lab]), new_indexes[:(self.max_seq_len-3)]),
dim=0)
# restricting to 'self.max_seq_len-3' to account for the
# SOS, EOS and label tokens
else:
lab = data_point[2]
new_indexes = torch.cat(
(torch.tensor([compact_word2idx[self.SOS]]),
new_indexes,
torch.tensor([compact_word2idx[self.EOS]])), dim=0)
            new_length = new_indexes.shape[0]  # length incl. SOS/EOS (and
            # the label token, if one was prepended), before padding
while new_indexes.shape[0] < self.max_seq_len:
new_indexes = torch.cat(
(new_indexes,
torch.tensor([compact_word2idx[self.PAD]])))
reindexed.append((new_indexes, new_length, lab))
return reindexed
def process_data_no_embeds(self, data, train_split=True, rewriting=False,
last_checkpoint_path=None, first_shard=True):
# Only prepare the vocabulary based on the first shard
# (dataset should be large enough, e.g. Wikipedia)
if first_shard and train_split:
data = self._process_data_no_embeds_first_shard(
data, rewriting=rewriting,
last_checkpoint_path=last_checkpoint_path)
else:
data = self._process_data_no_embeds_subsequent_shard(
data, rewriting=rewriting,
last_checkpoint_path=last_checkpoint_path)
return data
def _process_data_no_embeds_first_shard(
self, data, last_checkpoint_path=None, rewriting=False):
def encode(examples):
examples['text'] = [self.tokenizer(doc.strip().lower())
for doc in examples['text']]
return examples
# Multiprocessing for larger datasets
threshold = 50000
if len(data) > threshold:
proc_num = os.cpu_count()
else:
proc_num = None
data = data.map(encode, batched=True, num_proc=proc_num)
labels = data['label']
if rewriting:
# Loading the large idx2word for the full vocab
try:
checkpoint_dir = os.path.abspath(
os.path.join(last_checkpoint_path, os.pardir))
with open(os.path.join(checkpoint_dir,
'large_idx2word.json'), 'r',
encoding='utf-8') as f:
self.idx2word = json.load(f)
self.idx2word = {int(idx): token
for idx, token in self.idx2word.items()}
except FileNotFoundError:
print("Could not load existing FULL word2idx and idx2word "
"dictionaries, rebuilding based on specified dataset. "
"If pre-training and rewriting on two different "
"datasets, MAKE SURE the vocabularies are the same "
"for both.")
idx2word = []
for idx, doc in tqdm(enumerate(data)):
idx2word += doc['text']
if idx % 5000 == 0:
idx2word = list(set(idx2word))
idx2word = list(set(idx2word))
self.idx2word = {idx: token
for idx, token in enumerate(idx2word)}
else:
idx2word = []
for idx, doc in tqdm(enumerate(data)):
idx2word += doc['text']
if idx % 5000 == 0:
idx2word = list(set(idx2word))
idx2word = list(set(idx2word))
self.idx2word = {idx: token for idx, token in enumerate(idx2word)}
with open(os.path.join(self.checkpoint_dir,
'large_idx2word.json'), 'w',
encoding='utf-8') as f:
json.dump(self.idx2word, f, ensure_ascii=False, indent=4,
cls=NpEncoder)
# Additionally adding <UNK> (will be reindexed later)
self.idx2word[len(self.idx2word) + 1] = self.UNK
self.word2idx = {token: idx for idx, token in self.idx2word.items()}
data = data.map(self._encode, batched=True, num_proc=proc_num)
data = [(torch.tensor(indexes), length, label)
for indexes, length, label in zip(
data['encoded'], data['length'], data['label'])]
if len(self.idx2word) < self.vocab_size:
print(f"Specified vocabulary size as {self.vocab_size}, but number of unique words in dataset {len(self.idx2word)}. Setting vocabulary size to {len(self.idx2word)-1}.")
self.vocab_size = len(self.idx2word) - 1
if rewriting:
# Loading the small idx2word
try:
old_idx2word = copy.deepcopy(self.idx2word)
checkpoint_dir = os.path.abspath(
os.path.join(last_checkpoint_path, os.pardir))
with open(os.path.join(checkpoint_dir, 'idx2word.json'), 'r',
encoding='utf-8') as f:
self.idx2word = json.load(f)
self.idx2word = {int(idx): token for idx, token in self.idx2word.items()}
self.word2idx = {token: idx for idx, token in self.idx2word.items()}
except FileNotFoundError:
print("Could not load existing TRIMMED word2idx and idx2word "
"dictionaries, rebuilding based on specified dataset. "
"If pre-training and rewriting on two different "
"datasets, MAKE SURE the vocabularies are the same "
"for both.")
top_indexes = self._get_frequency(data)
old_idx2word = copy.deepcopy(self.idx2word)
reindexes = np.arange(self.vocab_size+4)
vocab = np.vectorize(self.idx2word.get)(top_indexes)
vocab = np.insert(vocab, 0, self.PAD)
vocab = np.insert(vocab, 1, self.UNK)
vocab = np.insert(vocab, 2, self.SOS)
vocab = np.insert(vocab, 3, self.EOS)
new_word2idx = {}
new_idx2word = {}
for idx, word in zip(reindexes, vocab):
                    new_word2idx[word] = int(idx)
                    new_idx2word[int(idx)] = word
self.idx2word = new_idx2word
self.word2idx = new_word2idx
with open(os.path.join(self.checkpoint_dir, 'idx2word.json'), 'w',
encoding='utf-8') as f:
json.dump(self.idx2word, f, ensure_ascii=False, indent=4,
cls=NpEncoder)
else:
top_indexes = self._get_frequency(data)
old_idx2word = copy.deepcopy(self.idx2word)
reindexes = np.arange(self.vocab_size+4)
vocab = np.vectorize(self.idx2word.get)(top_indexes)
vocab = np.insert(vocab, 0, self.PAD)
vocab = np.insert(vocab, 1, self.UNK)
vocab = np.insert(vocab, 2, self.SOS)
vocab = np.insert(vocab, 3, self.EOS)
new_word2idx = {}
new_idx2word = {}
for idx, word in zip(reindexes, vocab):
new_word2idx[word] = int(idx)
new_idx2word[int(idx)] = word
self.idx2word = new_idx2word
self.word2idx = new_word2idx
with open(os.path.join(self.checkpoint_dir, 'idx2word.json'), 'w',
encoding='utf-8') as f:
json.dump(self.idx2word, f, ensure_ascii=False, indent=4,
cls=NpEncoder)
if self.prepend_labels:
str_labels = ['LABEL_' + str(str_lab)
for str_lab in sorted(set(labels))]
idx_labels = np.arange(len(str_labels)) + len(self.idx2word)
# Add label strings and indexes to the existing word2idx and
# idx2word dictionaries
for idx, lab in enumerate(str_labels):
self.word2idx[lab] = idx_labels[idx]
self.idx2word[idx_labels[idx]] = lab
data = self._reindex_data_and_pad(data, self.word2idx,
old_idx2word=old_idx2word)
self.vocab_size = len(self.idx2word)
return data
def _process_data_no_embeds_subsequent_shard(
self, data, last_checkpoint_path=None, rewriting=False):
def encode(examples):
examples['text'] = [self.tokenizer(doc.strip().lower())
for doc in examples['text']]
return examples
# Multiprocessing for larger datasets
threshold = 50000
if len(data) > threshold:
proc_num = os.cpu_count()
else:
proc_num = None
data = data.map(encode, batched=True, num_proc=proc_num)
data = data.map(self._encode, batched=True)
data = [(torch.tensor(indexes), length, label)
for indexes, length, label in zip(
data['encoded'], data['length'], data['label'])]
# Only padding and adding sos/eos tokens in this case
data = self._reindex_data_and_pad(data, self.word2idx)
return data
def _encode(self, examples):
        encoded = [[self.word2idx[tok] if tok in self.word2idx
                    else self.word2idx[self.UNK] for tok in doc]
                   for doc in examples['text']]
        examples['encoded'] = [torch.tensor(enc_doc[:(self.max_seq_len - 2)])
                               for enc_doc in encoded]
        examples['length'] = [doc.size()[0] for doc in examples['encoded']]
return examples
class Preprocessor_for_Transformer(Preprocessor):
def __init__(self, transformer_type='bert-base-uncased', **kwargs):
super().__init__(**kwargs)
self.transformer_type = transformer_type
self.tokenizer = AutoTokenizer.from_pretrained(self.transformer_type)
self.PAD = self.tokenizer.pad_token
self.UNK = self.tokenizer.unk_token
self.vocab, self.embeds, self.word2idx, self.idx2word = (None,)*4
self.lab_str2int = None
self.lab_int2str = None
self.num_labels = None
if self.prepend_labels:
raise Exception(
"Prepending labels not yet available for transformer-based "
"models.")
def process_data(self, data, train_split=True, first_shard=True):
        # If every label is zero/falsy (e.g. an unlabelled pre-training
        # corpus), fall back to a single dummy label.
        if np.count_nonzero(data['label']) == 0:
sorted_labels = [0.0]
else:
sorted_labels = sorted(set(
data[idx]['label'] for idx in range(len(data))))
if not train_split:
sorted_val_labels = [lab for lab in sorted_labels
if lab not in self.lab_str2int]
for val in sorted_val_labels:
self.lab_str2int[val] = len(self.lab_str2int)
else:
lab_str2int = {string: idx
for idx, string in enumerate(sorted_labels)}
self.lab_str2int = lab_str2int
lab_int2str = {val: key for key, val in self.lab_str2int.items()}
self.lab_int2str = lab_int2str
self.num_labels = len(self.lab_str2int)
def encode(examples):
if None in examples['text']:
print("***** None found *****")
examples['text'] = [text if text is not None else ''
for text in examples['text']]
tokenized_batch = self.tokenizer(
examples['text'], truncation=True,
max_length=self.max_seq_len, padding='max_length')
tokenized_batch['label'] =\
[self.lab_str2int[lab] for lab in examples['label']]
return tokenized_batch
data = self.map_data_split(data, encode)
return data
def map_data_split(self, data_split, encode):
data_split = data_split.map(encode, batched=True)
data_split = data_split.map(
lambda examples: {'labels': examples['label']}, batched=True)
data_split = data_split.remove_columns('label')
data_split.set_format(
type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        if data_split.features['labels'] != Value('int64'):
new_features = data_split.features.copy()
new_features['labels'] = Value('int64')
data_split = data_split.cast(new_features)
return data_split
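
# Illustrative usage sketch for the transformer preprocessor (argument
# values are examples only, not defaults used elsewhere in this repo):
#
#     from datasets import load_dataset
#     prep = Preprocessor_for_Transformer(
#         transformer_type='bert-base-uncased', checkpoint_dir='checkpoints',
#         max_seq_len=128, batch_size=32, prepend_labels=False, mode='train')
#     train_data = prep.process_data(load_dataset('ag_news', split='train'))
#
# The returned dataset is torch-formatted with 'input_ids', 'attention_mask'
# and int64 'labels' columns, ready for a DataLoader or a HF Trainer.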
class Custom_Preprocessor(Preprocessor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
raise NotImplementedError
    def process_data(self, data):
pass