-
Hi @Unreal9er, you can use `balance_idx`:

```python
import numpy as np
from fastcore.test import test_eq
from tsai.data.validation import balance_idx

# create a dummy, imbalanced 5-class target and an 80% train split
y = np.concatenate([[i] * np.random.randint(10, 100) for i in range(5)])
train_split = np.random.choice(len(y), int(len(y) * .8), False)
c, v = np.unique(y[train_split], return_counts=True)
print(f"{'imbalanced:':25} {c} {v}")

# oversample: every class is repeated up to the size of the largest class
oversampled_train_split = train_split[balance_idx(y[train_split], strategy="oversample")]
osc, osv = np.unique(y[oversampled_train_split], return_counts=True)
print(f"{'balanced (oversample):':25} {osc} {osv}")
test_eq(osv, [max(v)] * len(v))

# undersample: every class is reduced to the size of the smallest class
undersampled_train_split = train_split[balance_idx(y[train_split], strategy="undersample")]
usc, usv = np.unique(y[undersampled_train_split], return_counts=True)
print(f"{'balanced (undersample):':25} {usc} {usv}")
test_eq(usv, [min(v)] * len(v))
```

You can also have this applied under the hood by setting:

```python
splits = get_splits(y, valid_size=0.2, balance=True, strategy="undersample", stratify=True, shuffle=False)
```

Please let me know if this is what you were looking for.
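In case it's useful, here is a minimal sketch of how such a balanced split could then be fed to the dataloaders (assuming the usual `get_ts_dls` / `TSClassification` helpers from `tsai.all`; the transforms and batch size are only illustrative, adjust them to your setup):

```python
from tsai.all import *

# X: array of shape (samples, variables, steps), y: class labels, e.g. produced by SlidingWindow
splits = get_splits(y, valid_size=0.2, balance=True, strategy="oversample",
                    stratify=True, shuffle=False)
dls = get_ts_dls(X, y, splits=splits, tfms=[None, TSClassification()], bs=64)
```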
-
Thank you, @Unreal9er, for raising this issue!
-
Hello,
I have been using tsai for a year now and haven't found a sampling or undersampling method, but maybe I didn't search enough, or it's included as a parameter of another function. Here is some custom code which currently uses 3 classes and downsamples the whole time series array.
Code:
```python
from tsai.all import *   # SlidingWindow, get_splits, check_data, ...
import numpy as np
import random

# Data_df: dataframe with the feature columns first and the target column last
window_length = 80   # usually selected based on prior domain knowledge or by trial and error
stride = 1           # None for non-overlapping windows (stride = window_length); default = 1.
                     # This depends on how often you want to predict once the model is trained.
start = 0            # use all data since the first time stamp (default = 0)
get_x = Data_df.columns[:-1]  # columns that contain the x data
get_y = "CloseDifferenceClassificationFuture"  # in multivariate time series, you must indicate which is/are the y column(s)
horizon = 0          # 0 means y is taken from the last time stamp of the time sequence (default = 0)
seq_first = True

X, y = SlidingWindow(window_length, stride=stride, start=start, get_x=get_x,
                     get_y=get_y, horizon=horizon, seq_first=seq_first)(Data_df)
splits = get_splits(y, valid_size=0.2, balance=True, stratify=True, shuffle=False)
check_data(X, y, splits)

# class distribution in the original dataframe
Data_df.groupby(["CloseDifferenceClassificationFuture"])["CloseDifferenceClassificationFuture"].count()

# fraction of the selected class (label 0) windows to keep - here it's 0
percentage = 0

# indices of the time series windows to keep
keep = []
for i, value in enumerate(y):
    if value != 0:
        # always keep windows of the other classes
        keep.append(i)
    else:
        # keep class-0 windows only with probability `percentage`
        if random.uniform(0, 1) < percentage:
            keep.append(i)

# build new arrays containing only the selected windows and labels
X_new = X[keep, :, :]
y_new = np.array([y[i] for i in keep])
print(y_new)

# release the original arrays
X = 0
y = 0

unique_values, counts = np.unique(y_new, return_counts=True)
print("Unique Values:", unique_values)
print("Counts:", counts)

splits_new = get_splits(y_new, valid_size=0.2, balance=True, stratify=True, shuffle=False)
check_data(X_new, y_new, splits_new)
```
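For reference, the same random downsampling of the class-0 windows can also be written without the Python loop. This is just a sketch applied to the `X`/`y` arrays from SlidingWindow before they are released above; names like `keep_fraction` and `rng` are illustrative, not tsai API:

```python
import numpy as np

keep_fraction = 0.3              # fraction of class-0 windows to keep
rng = np.random.default_rng(42)  # seeded generator for reproducibility

is_majority = (y == 0)
# keep every non-zero-class window, and each class-0 window with probability keep_fraction
keep_mask = ~is_majority | (rng.random(len(y)) < keep_fraction)

X_new = X[keep_mask]
y_new = y[keep_mask]
print(np.unique(y_new, return_counts=True))
```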