HCL_TPP/data_io.py at master · SDS-Lab/HCL_TPP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import numpy as np
import pickle
import torch
import torch.utils.data
import os
from typing import List, Dict, Tuple


DEFAULT_PAD = 0


class EventData(torch.utils.data.Dataset):
    """ Event stream dataset. """
    def __init__(self, data: List[List[Dict]]):
        """
        Data should be a list of event streams; each event stream is a list of dictionaries;
        each dictionary contains: time_since_start, time_since_last_event, type_event

        In other words, each sequence is saved as [t0, dt1, dt2, ..., dtN] + [c0, c1, c2, ..., cN],
        where each dti = ti - ti-1
        """
        # step one: convert the data to the structure List[List[List]]]
        self.time = [[elem['time_since_start'] for elem in inst] for inst in data]
        self.time_gap = [[elem['time_since_last_event'] for elem in inst] for inst in data]
        # plus 1 since there could be event type 0, but we use 0 as padding
        self.event_type = [[elem['type_event'] + 1 for elem in inst] for inst in data]
        self.length = len(data)

    def __len__(self):
        return self.length

    def __getitem__(self, idx: int) -> Tuple[List, List, List]:
        """ Each returned element is a list, which represents an event stream """
        return self.time[idx], self.time_gap[idx], self.event_type[idx]


def pad_time(insts: List[List], pad_value: int = DEFAULT_PAD) -> torch.Tensor:
    """
    Pad the instance to the max seq length in batch.
    :param: A list of event streams, each list contains a list of instances (timestamps of events)
    :param: pad_value: the value used to pad sequence
    """
    max_len = max(len(inst) for inst in insts)
    batch_seq = np.array([
        inst + [pad_value] * (max_len - len(inst))
        for inst in insts])
    print(max_len)
    return torch.tensor(batch_seq, dtype=torch.float32)


def pad_type(insts: List[List], pad_value: int = DEFAULT_PAD) -> torch.Tensor:
    """
    Pad the instance to the max seq length in batch.
    :param: A list of event streams, each list contains a list of instances (types of events)
    :param: pad_value: the value used to pad sequence
    """
    max_len = max(len(inst) for inst in insts)
    batch_seq = np.array([
        inst + [pad_value] * (max_len - len(inst))
        for inst in insts])
    print(max_len)
    return torch.tensor(batch_seq, dtype=torch.long)


def collate_fn(insts: List[List]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Collate function, as required by PyTorch,
    which used to make the output of each sample with the same structure and size
    """
    time, time_gap, event_type = list(zip(*insts))
    time = pad_time(time)
    time_gap = pad_time(time_gap)
    event_type = pad_type(event_type)
    return time, time_gap, event_type


def get_dataloader(data: List[List[Dict]], batch_size: int, shuffle: bool = True) -> torch.utils.data.DataLoader:
    """
    Prepare dataloader.
    :param: the event streaming data with structure List[List[Dict]]
    :param: batch_size: the batch size
    :param: shuffle: shuffle the data or not.
    """
    ds = EventData(data)
    dl = torch.utils.data.DataLoader(
        ds,
        num_workers=2,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return dl


def load_data(name: str, dict_name: str) -> Tuple[List[List[Dict]], int]:
    """
    Load raw streaming data from a pickle file
    :param name: the name of the file
    :param dict_name: the keyword of the raw data in the file
    :return:
        1) data: the streaming data with structure List[List[Dict]]],
        each dictionary contains time_since_start, time_since_last_event, type_event
        2) The number of event types
    """
    with open(name, 'rb') as f:
        data = pickle.load(f, encoding='latin-1')
        num_types = int(data['dim_process'])
        data = data[dict_name]
    # return data, int(num_types)
    return data, num_types


def prepare_dataloader(data_folder_name: str, batch_size: int) -> Tuple[Dict[str, torch.utils.data.DataLoader], int]:
    """
    Load data and prepare dataloader.

    :param data_folder_name: a structure contains at least two items (data_folder_name, batch_size)
    :param batch_size: the batch size
    """
    print('[Info] Loading train data...')
    train_data, num_types = load_data(os.path.join(data_folder_name, 'train.pkl'), 'train')
    print('[Info] Loading dev data...')
    dev_data, _ = load_data(os.path.join(data_folder_name, 'dev.pkl'), 'dev')
    print('[Info] Loading test data...')
    test_data, _ = load_data(os.path.join(data_folder_name, 'test.pkl'), 'test')
    dataloaders = {'train': get_dataloader(train_data, batch_size, shuffle=True),
                   'val': get_dataloader(dev_data, batch_size, shuffle=False),
                   'test': get_dataloader(test_data, batch_size, shuffle=False)}
    return dataloaders, num_types


# Data augmentation operation
def sorting(event_type: torch.Tensor, event_time: torch.Tensor):
    """
    Sort each sequence in the ascending order of time
    """
    batch = event_time.shape[0]
    idx = torch.argsort(event_time, dim=1, descending=False)
    for b in range(batch):
        event_time[b, :] = event_time[b, idx[b, :]]
        event_type[b, :] = event_type[b, idx[b, :]]
    return event_type, event_time


def reorganize(event_type: torch.Tensor, event_time: torch.Tensor):
    """
    Move zeros to the end of each sequence
    """
    batch = event_time.shape[0]
    for b in range(batch):
        tmp_time = event_time[b, :]
        tmp_type = event_type[b, :]
        event_time[b, :] = torch.cat((tmp_time[tmp_type != 0], tmp_time[tmp_type == 0]), dim=0)
        event_type[b, :] = torch.cat((tmp_type[tmp_type != 0], tmp_type[tmp_type == 0]), dim=0)
    len_max = torch.max(torch.sum(event_type != 0))
    return event_type[:, :len_max], event_time[:, :len_max]


def one_step_superpose(event_type: torch.Tensor, event_time: torch.Tensor):
    batch = event_time.shape[0]
    shift = int(batch / 2)
    shift_type = torch.cat((event_type[shift:, :], event_type[:shift, :]), dim=0)
    shift_time = torch.cat((event_time[shift:, :], event_time[:shift, :]), dim=0)
    new_type = torch.cat((event_type, shift_type), dim=1)  # batch * (2seq_len)
    new_time = torch.cat((event_time, shift_time), dim=1)  # batch * (2seq_len)
    new_type, new_time = sorting(new_type, new_time)
    new_type, new_time = reorganize(new_type, new_time)
    return new_type, new_time


def shift_and_superpose(event_type: torch.Tensor, event_time: torch.Tensor, num_iter: int = 2):
    """
    Superpose event sequences with their shifted versions

    Input: event_type: batch*seq_len;
           event_time: batch*seq_len.
    Output: new event_type: batch * (2 * seq_len)
            new event_time: batch * (2 * seq_len)
    """
    for i in range(num_iter):
        event_type, event_time = one_step_superpose(event_type, event_time)
    return event_type, event_time


def thinning_process_deterministic(event_type: torch.Tensor, event_time: torch.Tensor,
                                   significance: torch.Tensor, ratio_remove: float = 0.2):
    """
    Deterministically thinning event sequences guided by a probability/significance
    Input: event_type: batch * seq_len;
           event_time: batch * seq_len;
           significance: batch * seq_len, the significance of the events per sequence
           num_neg: the number of negative sequences per observed sequence
           ratio_remove: the ratio of removed events per sequence, in the range (0, 1)
    Output: thinned event_type: batch * [seq_len * (1 - ratio_remove)]
            thinned event_time: batch * [seq_len * (1 - ratio_remove)]
    """
    batch, seq_len = event_type.shape
    new_len = int(seq_len * (1 - ratio_remove))
    idx = torch.argsort(significance, dim=1, descending=True)
    thinned_type = torch.zeros_like(event_type)
    thinned_time = torch.zeros_like(event_time)
    for b in range(batch):
        num_events = torch.sum(event_type[b, :] > 0)
        new_len_b = int(num_events * (1 - ratio_remove))
        thinned_type[b, :new_len_b] = event_type[b, idx[b, :new_len_b]]
        thinned_time[b, :new_len_b] = event_time[b, idx[b, :new_len_b]]
    new_type, new_time = sorting(thinned_type[:, :new_len], thinned_time[:, :new_len])
    return reorganize(new_type, new_time)


def thinning_process_random(event_type: torch.Tensor, event_time: torch.Tensor, ratio_remove: float = 0.5):
    # import pdb;
    # pdb.set_trace()
    """
    Randomly thinning event sequences with a relatively-high removal ratio
        Input: event_type: batch * seq_len;
               event_time: batch * seq_len;
               num_neg: the number of negative sequences per observed sequence
               ratio_remove: the ratio of removed events per sequence, in the range (0, 1)
        Output: thinned event_type: batch * [seq_len * (1 - ratio_remove)]
                thinned event_time: batch * [seq_len * (1 - ratio_remove)]
    """
    rv = torch.rand_like(event_time) * (event_type > 0)
    return thinning_process_deterministic(event_type, event_time, rv, ratio_remove)


def sampling_positive_seqs(event_type: torch.Tensor, event_time: torch.Tensor,
                           significance: torch.Tensor, ratio_remove: float = 0.2):
    """
    Sampling positive sequences based on the significance of the events guided by the model
    """
    return thinning_process_deterministic(event_type, event_time, significance, ratio_remove)


def sampling_negative_seqs(event_type: torch.Tensor, event_time: torch.Tensor,
                           num_neg: int = 5, ratio_remove: float = 0.5):
    """
    Sampling negative sequences by randomly removing some events
    """
    neg_type, neg_time = thinning_process_random(event_type, event_time, ratio_remove)
    for k in range(num_neg - 1):
        tmp_type, tmp_time = thinning_process_random(event_type, event_time, ratio_remove)
        neg_type = torch.cat((neg_type, tmp_type), dim=0)
        neg_time = torch.cat((neg_time, tmp_time), dim=0)
    return neg_type, neg_time