# dataset.py
import csv
import os

import torch
from cv2 import imread, IMREAD_GRAYSCALE


class Dataset:
    '''
    Generic Dataset class.
    Includes getter, split/shuffle, and generator.
    '''
    def __init__(self, data) -> None:
        self.data = data
        self.len = len(data)

    def __getitem__(self, index) -> tuple:
        return self.data[index]
    def split_shuffle(self, split) -> tuple:
        '''
        Randomly shuffles and splits the dataset into two separate sets.
        The split fraction must be <= 1: set_1 receives roughly `split` of the
        data and set_2 receives the remaining `1 - split`.
        Returns a tuple of dicts.
        '''
        assert split <= 1
        set_1, set_2 = {}, {}
        idx_1, idx_2 = 0, 0
        for i in torch.randperm(self.len):
            if idx_1 < (self.len * split):
                set_1[idx_1] = self.data[i.item()]
                idx_1 += 1
            else:
                set_2[idx_2] = self.data[i.item()]
                idx_2 += 1
        return set_1, set_2
    def data_generator(self):
        for i in range(self.len):
            yield self.data[i]
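
# Usage sketch for the generic Dataset class (illustrative only; the toy dict
# of (tensor, label) pairs below is hypothetical and not part of the original
# file):
#
#     ds = Dataset({0: (torch.tensor([1.0]), 0), 1: (torch.tensor([2.0]), 1)})
#     train, val = ds.split_shuffle(0.5)   # roughly 50/50 random split
#     for features, label in ds.data_generator():
#         print(features, label)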


class TitanicDataset(Dataset):
    '''
    Dataset class for the Titanic data (Kaggle train.csv / test.csv); only
    train.csv is read. It contains 10 features plus whether each passenger
    survived; 7 of those features are loaded here.
    '''
    def __init__(self, data_pth) -> None:
        self.data = {}
        # newline='' is the form recommended by the csv module docs
        with open(f"{data_pth}/train.csv", newline='') as f:
            reader = csv.DictReader(f)
            k = 0
            for row in reader:
                row_info = [
                    float(row['PassengerId']),
                    float(row['Pclass']),
                    float(row['Sex'] == 'male'),
                    float(row['Age']) if row['Age'] != '' else 0,  # missing ages default to 0
                    float(row['SibSp']),
                    float(row['Parch']),
                    float(row['Fare'])
                ]
                self.data[k] = (torch.tensor(row_info, dtype=torch.float), int(row['Survived']))
                k += 1
        self.len = len(self.data)
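
# Usage sketch for TitanicDataset (illustrative only; "data/titanic" is an
# assumed directory holding the Kaggle train.csv and is not part of the
# original file):
#
#     titanic = TitanicDataset("data/titanic")
#     features, survived = titanic[0]   # 7-element float tensor, 0/1 label
#     train, val = titanic.split_shuffle(0.8)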


class MNISTDataset(Dataset):
    '''
    Dataset class for MNIST data; takes either a data_pth whose sub-folders
    0-9 contain the images, or a ready-made dictionary of data.
    MNIST images are 28x28 grayscale.
    When loaded from data_pth, each image is stored as a (tensor, label) tuple.
    '''
    def __init__(self, data=None, data_pth=None) -> None:
        if data is not None:
            assert isinstance(data, dict)
            self.data = data
            self.len = len(data)
        else:
            self.data = {}
            k = 0
            for i in range(0, 10):
                for filename in os.listdir(f"{data_pth}/{i}"):
                    img = imread(f"{data_pth}/{i}/{filename}", IMREAD_GRAYSCALE)
                    if img is not None:
                        # Build the float tensor straight from the numpy array;
                        # wrapping torch.from_numpy(img) in torch.tensor() only
                        # triggers a copy-construct warning.
                        img_data = torch.tensor(img, dtype=torch.float)
                        self.data[k] = (img_data, i)
                        self.data[k][0].requires_grad = True
                        k += 1
            self.len = k
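

# A runnable sketch of the intended workflow (assumptions: "mnist" is a
# directory with sub-folders 0-9 of digit images; the path and the 80/20 split
# are illustrative, not part of the original file).
if __name__ == "__main__":
    mnist = MNISTDataset(data_pth="mnist")
    # split_shuffle returns plain dicts, which can be wrapped back into
    # MNISTDataset objects via the `data` argument.
    train_dict, val_dict = mnist.split_shuffle(0.8)
    train_set = MNISTDataset(data=train_dict)
    val_set = MNISTDataset(data=val_dict)
    print(f"train: {train_set.len}, val: {val_set.len}")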