-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_val_sets.py
29 lines (24 loc) · 1.33 KB
/
create_val_sets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
from datasets import Dataset
# Create smaller val sets for fine-tuning the BERT classifiers.
#
# Each corpus gets a label-stratified sample of VAL_SET_SIZE examples written
# to data/<corpus>/validation.jsonl. The split seed is fixed so that reruns
# draw the same sample from the same input.
VAL_SET_SIZE = 2000
SPLIT_SEED = 1337


def _write_val_set(source_path: str, target_path: str) -> None:
    """Write a label-stratified sample of a JSONL dataset to ``target_path``.

    The dataset at ``source_path`` is loaded, its ``'label'`` column is
    class-encoded, and the ``'test'`` part of a stratified train/test split
    (``VAL_SET_SIZE`` examples, seed ``SPLIT_SEED``) is saved as JSON lines.

    Args:
        source_path: JSONL file to sample from.
        target_path: JSONL file that receives the sample.
    """
    data_set = Dataset.from_json(source_path)
    # The split is stratified on 'label', so that column is class-encoded first.
    encoded = data_set.class_encode_column('label')
    split = encoded.train_test_split(
        stratify_by_column='label', test_size=VAL_SET_SIZE, seed=SPLIT_SEED)
    split['test'].to_json(target_path)


# phishing
_write_val_set(os.path.join('data', 'phishing', 'train.jsonl'),
               os.path.join('data', 'phishing', 'validation.jsonl'))

# webmd
_write_val_set(os.path.join('data', 'webmd', 'train.jsonl'),
               os.path.join('data', 'webmd', 'validation.jsonl'))

# thumbs-up
# This corpus ships with a validation file of its own; the full file is kept
# as orig_validation.jsonl and validation.jsonl is replaced by the sample.
thumbs_val_path = os.path.join('data', 'thumbs-up', 'validation.jsonl')
thumbs_orig_path = os.path.join('data', 'thumbs-up', 'orig_validation.jsonl')
# Move the full set aside on the first run only. On a rerun validation.jsonl
# already holds the reduced sample, and renaming it again would overwrite the
# preserved full set with that sample.
if not os.path.exists(thumbs_orig_path):
    os.rename(thumbs_val_path, thumbs_orig_path)
# Always sample from the preserved full set so every run gives the same file.
_write_val_set(thumbs_orig_path, thumbs_val_path)

# swmh
_write_val_set(os.path.join('data', 'swmh', 'val.jsonl'),
               os.path.join('data', 'swmh', 'validation.jsonl'))