Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple Group IDs for rolling_time_series #885

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions tests/units/utilities/test_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,74 @@ def test_assert_single_row(self):
n_jobs=0,
)

def test_multiple_index(self):
    """Roll a container whose time series are identified by two id columns."""
    # Two time steps for each value of ``first_id``; the row order is mixed
    # on purpose so that the rolling has to sort within every group.
    base = pd.concat(
        [
            pd.DataFrame(
                {"a": [1, 2], "b": [5, 6], "time": range(2), "first_id": ["x", "y"]}
            ),
            pd.DataFrame(
                {"a": [3, 4], "b": [7, 8], "time": range(2), "first_id": ["y", "x"]}
            ),
        ]
    )
    # Repeat the same observations once per value of ``second_id``.
    df_full = pd.concat(
        [base.assign(second_id=label) for label in ("a", "b")],
        ignore_index=True,
    )

    # df_full is
    #    a  b  time first_id second_id
    # 0  1  5     0        x         a
    # 1  2  6     1        y         a
    # 2  3  7     0        y         a
    # 3  4  8     1        x         a
    # 4  1  5     0        x         b
    # 5  2  6     1        y         b
    # 6  3  7     0        y         b
    # 7  4  8     1        x         b

    # Every (first_id, second_id) pair yields one row for the window ending
    # at time 0 and two rows for the window ending at time 1.
    correct_indices = [
        (first, second, shift)
        for first in ("x", "y")
        for second in ("a", "b")
        for shift in (0, 1, 1)
    ]
    # The rolled values only depend on ``first_id``, hence the repetition.
    correct_values_a = [1.0, 1.0, 4.0] * 2 + [3.0, 3.0, 2.0] * 2
    correct_values_b = [5.0, 5.0, 8.0] * 2 + [7.0, 7.0, 6.0] * 2

    df = dataframe_functions.roll_time_series(
        df_full,
        column_id=["first_id", "second_id"],
        column_sort="time",
        column_kind=None,
        rolling_direction=1,
        n_jobs=0,
    )

    self.assertListEqual(list(df["id"]), correct_indices)
    for column, expected in (("a", correct_values_a), ("b", correct_values_b)):
        self.assertListEqual(list(df[column].values), expected)

def test_positive_rolling(self):
first_class = pd.DataFrame(
{"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}
Expand Down
48 changes: 34 additions & 14 deletions tsfresh/utilities/dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,14 @@ def _f(x):
else:
timeshift_value = timeshift - 1
# and now create new ids out of the old ones
df_temp["id"] = df_temp[column_id].apply(lambda row: (row, timeshift_value))
if isinstance(column_id, list):
df_temp["id"] = (
df_temp[column_id]
.apply(tuple, axis=1)
.apply(lambda row: row + (timeshift_value,))
)
else:
df_temp["id"] = df_temp[column_id].apply(lambda row: (row, timeshift_value))

return df_temp

Expand Down Expand Up @@ -394,8 +401,8 @@ def roll_time_series(
:type df_or_dict: pandas.DataFrame or dict

:param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
It is not allowed to have NaN values in this column.
:type column_id: basestring
It is not allowed to have NaN values in this column. A list of column names that together identify one time series can be passed as well.
:type column_id: basestring or list

:param column_sort: if not None, sort the rows by this column. It is not allowed to
have NaN values in this column. If not given, will be filled by an increasing number,
Expand Down Expand Up @@ -482,7 +489,14 @@ def roll_time_series(
"Your time series container has zero or one rows!. Can not perform rolling."
)

if column_id is not None:
if isinstance(column_id, list):
if not all(item in df.columns for item in column_id):
raise AttributeError(
"The given columns for the id are not present in the data."
)
if len(column_id) == 1:
column_id = column_id[0]
elif isinstance(column_id, str):
if column_id not in df:
raise AttributeError(
"The given column for the id is not present in the data."
Expand All @@ -491,13 +505,15 @@ def roll_time_series(
raise ValueError(
"You have to set the column_id which contains the ids of the different time series"
)

if column_kind is not None:
grouper = [column_kind, column_id]
else:
grouper = [
column_id,
]
# Collect the columns that together identify one time series: the kind
# column first (if any), followed by the id column(s).
# NOTE(review): assumes column_id has already been validated to be either a
# single column name (str) or a list of column names - confirm against the
# checks earlier in this function.
grouper = []
if column_kind is not None:
    # list.append returns None, so it must not be chained with .extend().
    grouper.append(column_kind)
if isinstance(column_id, str):
    grouper.append(column_id)
elif isinstance(column_id, list):
    grouper.extend(column_id)

if column_sort is not None:
# Require no Nans in column
Expand All @@ -510,8 +526,10 @@ def roll_time_series(
# if rolling is enabled, the data should be uniformly sampled in this column
# Build the differences between consecutive time sort values

differences = df.groupby(grouper)[column_sort].apply(
lambda x: x.values[:-1] - x.values[1:]
differences = (
df.groupby(grouper)[column_sort]
.apply(lambda x: x.values[:-1] - x.values[1:])
.dropna()
)
# Write all of them into one big list
differences = sum(map(list, differences), [])
Expand Down Expand Up @@ -575,7 +593,9 @@ def roll_time_series(
distributor.close()

df_shift = pd.concat(shifted_chunks, ignore_index=True)

# drop the initial column_id if it has not already been overwritten, as it is not needed for feature calculation and is already included in the id
if column_id != "id":
df_shift = df_shift.drop(column_id, axis=1)
return df_shift.sort_values(by=["id", column_sort or "sort"])


Expand Down