-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
196 lines (152 loc) · 6.6 KB
/
data.py
File metadata and controls
196 lines (152 loc) · 6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import torch
from torch.utils.data import Dataset
import numpy as np
def generate_points(num=40000,dim=18,type='normal',normal_var=1,radius=1):
'''
If type=='normal', then generate points from N(0,normal_var)
If type=='spherical', then simply divide the points by their norm.'''
X = torch.randn([num,dim]) #coordinates sampled from N(0,1)
if type=='spherical':
norm = torch.norm(X, p=2, dim=1, keepdim=True)
X_spherical = X / norm
return X_spherical
elif type=='normal':
return X
else:
raise ValueError('type should be either normal or spherical')
class TreeNode:
'''
This class represents a node in the decision tree.
Each node has a depth, a maximum depth(of the tree), a feature index, and left and right child nodes.
Leaf nodes have a value, which is the predicted class.
'''
def __init__(self, depth, max_depth, feature_index):
self.depth = depth
self.max_depth = max_depth
self.feature = feature_index
self.left = None
self.right = None
self.value = None # This will store the predicted class for leaf nodes
def build_tree(self):
if self.depth == self.max_depth:
self.value = float(self.feature % 2)
return
# Create left and right child nodes
self.left = TreeNode(self.depth + 1, self.max_depth, 2*self.feature+1)
self.right = TreeNode(self.depth + 1, self.max_depth, 2*self.feature+2)
# Recursively build left and right subtrees
self.left.build_tree()
self.right.build_tree()
def predict(self, x):
if self.value is not None:
return self.value
if x[self.feature] > 0:
return self.left.predict(x)
else:
return self.right.predict(x)
def gen_std_basis_DT(depth, dim_in, type_data, num_points, feat_index_start=0,radius=1):
'''
Generate points uniformly random from a hypersphere. And the label is the prediction of the tree with depth = max_depth.
The node hyperplanes are simply characterised by standard basis vectors(for example, the root node hyperplane is x[0] = 0)
'''
Tree = TreeNode(depth = 0,max_depth=depth,feature_index = feat_index_start)
Tree.build_tree()
X = generate_points(num=num_points,dim=dim_in,type=type_data,radius=radius)
Y=[]
for item in X:
Y.append(Tree.predict(item))
Y = torch.tensor(Y)
return X,Y
def set_npseed(seed):
np.random.seed(seed)
def set_torchseed(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
#Four mode classification data
def gen_random_DT(num_data=1000, dim=2, seed=0, w_list=None, b_list=None, vals=None, num_levels=2, margin=0):
'''
Generate random data and a decision tree with random hyperplanes as nodes.
The data is generated uniformly at random from the hypercube [-1,1]^dim.
The decision tree is generated by first generating random hyperplanes and then
assigning the median of the data in each node as the bias of the hyperplane.
The labels are assigned by the decision tree.
We also prune the data by removing points that are within a margin of the hyperplane.
The output is the pruned data, the labels, the hyperplanes and the number of points in each node.
'''
# np.random.seed(6790)
set_npseed(seed=seed)
# Construct a complete decision tree with 2**num_levels-1 internal nodes,
# e.g. num_levels=2 means there are 3 internal nodes.
# w_list, b_list is a list of size equal to num_internal_nodes
# vals is a list of size equal to num_leaf_nodes, with values +1 or -1
num_internal_nodes = 2**num_levels - 1
num_leaf_nodes = 2**num_levels
stats = np.zeros(num_internal_nodes+num_leaf_nodes)
if vals is None:
vals = np.arange(0,num_internal_nodes+num_leaf_nodes,1,dtype=np.int32)%2
vals[:num_internal_nodes] = -99
if w_list is None:
w_list = np.random.standard_normal((num_internal_nodes, dim))
w_list = w_list/np.linalg.norm(w_list, axis=1)[:, None]
b_list = np.zeros((num_internal_nodes))
data_x = np.random.random_sample((num_data, dim))*2 - 1.
relevant_stats = data_x @ w_list.T + b_list
curr_index = np.zeros(shape=(num_data), dtype=int)
for level in range(num_levels):
nodes_curr_level=list(range(2**level - 1,2**(level+1)-1 ))
for el in nodes_curr_level:
b_list[el]=-1*np.median(relevant_stats[curr_index==el,el])
relevant_stats[:,el] += b_list[el]
decision_variable = np.choose(curr_index, relevant_stats.T)
# Go down and right if wx+b>0 down and left otherwise.
# i.e. 0 -> 1 if w[0]x+b[0]<0 and 0->2 otherwise
curr_index = (curr_index+1)*2 - (1-(decision_variable > 0))
bound_dist = np.min(np.abs(relevant_stats), axis=1)
thres = margin #margin
labels = vals[curr_index]
data_x_pruned = data_x[bound_dist>thres]
labels_pruned = labels[bound_dist>thres]
relevant_stats = np.sign(data_x_pruned @ w_list.T + b_list)
nodes_active = np.zeros((len(data_x_pruned), num_internal_nodes+num_leaf_nodes), dtype=np.int32)
for node in range(num_internal_nodes+num_leaf_nodes):
if node==0:
stats[node]=len(relevant_stats)
nodes_active[:,0]=1
continue
parent = (node-1)//2
nodes_active[:,node]=nodes_active[:,parent]
right_child = node-(parent*2)-1 # 0 means left, 1 means right 1 has children 3,4
if right_child==1:
nodes_active[:,node] *= relevant_stats[:,parent]>0
if right_child==0:
nodes_active[:,node] *= relevant_stats[:,parent]<0
stats = nodes_active.sum(axis=0)
return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)
# depth = 4
# dim_in = 18
# type_data = 'normal'
# feat_index_start = 0 #the index of the first feature in the tree
# num_points = 40000
# Tree = TreeNode(depth = 0,max_depth=depth,feature_index = feat_index_start)
# Tree.build_tree()
# X = generate_points(num=num_points,dim=dim_in,type=type_data)
# Y=[]
# for item in X:
# Y.append(Tree.predict(item))
# Y = torch.tensor(Y)
class CustomDataset(Dataset):
def __init__(self, x, y, transform=None):
self.x = x
self.y = y
self.transform = transform
def __len__(self):
return len(self.x)
def __getitem__(self, idx):
sample_x = self.x[idx]
sample_y = self.y[idx]
if self.transform:
sample_x, sample_y = self.transform(sample_x, sample_y)
return sample_x, sample_y