blue-yonder · mmcux · Aug 19, 2021 · Aug 19, 2021 · Aug 20, 2021
diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py
@@ -145,6 +145,74 @@ def test_assert_single_row(self):
             n_jobs=0,
         )
 
+    def test_multiple_index(self):
+        """Roll a frame whose time series are identified by two id columns."""
+        xy_class = pd.DataFrame(
+            {"a": [1, 2], "b": [5, 6], "time": range(2), "first_id": ["x", "y"]}
+        )
+        yx_class = pd.DataFrame(
+            {"a": [3, 4], "b": [7, 8], "time": range(2), "first_id": ["y", "x"]}
+        )
+        # stack both frames, then duplicate the result once per second_id value
+        df_intermediate = pd.concat([xy_class, yx_class])
+        a_class = df_intermediate.copy()
+        a_class["second_id"] = "a"
+        b_class = df_intermediate  # same object, no copy: it is not used again
+        b_class["second_id"] = "b"
+        df_full = pd.concat([a_class, b_class], ignore_index=True)
+        # the bare string below is a no-op statement that only sketches df_full
+        """ df_full is
+            a	b	 time  first_id	second_id
+        0	1	5  	    0	    x	    a
+        1	2	6	    1   	y	    a
+        2	3	7	    0   	y	    a
+        3	4	8	    1   	x	    a
+        4	1	5	    0   	x	    b
+        5	2	6	    1   	y	    b
+        6	3	7	    0   	y	    b
+        7	4	8	    1   	x	    b
+        """
+        correct_indices = [  # one (first_id, second_id, timeshift) tuple per row
+            ("x", "a", 0),
+            ("x", "a", 1),
+            ("x", "a", 1),
+            ("x", "b", 0),
+            ("x", "b", 1),
+            ("x", "b", 1),
+            ("y", "a", 0),
+            ("y", "a", 1),
+            ("y", "a", 1),
+            ("y", "b", 0),
+            ("y", "b", 1),
+            ("y", "b", 1),
+        ]
+        correct_values_a = [1.0, 1.0, 4.0, 1.0, 1.0, 4.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0]
+        correct_values_b = [  # column "b", same row order as correct_indices
+            5.0,
+            5.0,
+            8.0,
+            5.0,
+            5.0,
+            8.0,
+            7.0,
+            7.0,
+            6.0,
+            7.0,
+            7.0,
+            6.0,
+        ]
+        df = dataframe_functions.roll_time_series(
+            df_full,
+            column_id=["first_id", "second_id"],
+            column_sort="time",
+            column_kind=None,
+            rolling_direction=1,
+            n_jobs=0,
+        )
+        self.assertListEqual(list(df["id"]), correct_indices)
+        self.assertListEqual(list(df["a"].values), correct_values_a)
+        self.assertListEqual(list(df["b"].values), correct_values_b)
+
     def test_positive_rolling(self):
         first_class = pd.DataFrame(
             {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}

diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py
@@ -343,7 +343,14 @@ def _f(x):
         else:
             timeshift_value = timeshift - 1
         # and now create new ones ids out of the old ones
-        df_temp["id"] = df_temp[column_id].apply(lambda row: (row, timeshift_value))
+        if isinstance(column_id, list):  # several id columns
+            df_temp["id"] = (
+                df_temp[column_id]
+                .apply(tuple, axis=1)  # one tuple of id values per row
+                .apply(lambda row: row + (timeshift_value,))  # append the timeshift
+            )
+        else:  # single id column: the new id is the pair (old id, timeshift)
+            df_temp["id"] = df_temp[column_id].apply(lambda row: (row, timeshift_value))
 
         return df_temp
 
@@ -394,8 +401,8 @@ def roll_time_series(
     :type df_or_dict: pandas.DataFrame or dict
 
     :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
-        It is not allowed to have NaN values in this column.
-    :type column_id: basestring
+        It is not allowed to have NaN values in this column. A list of column names can be passed as well; together they identify one time series.
+    :type column_id: basestring or list of basestring
 
     :param column_sort: if not None, sort the rows by this column. It is not allowed to
         have NaN values in this column. If not given, will be filled by an increasing number,
@@ -482,7 +489,14 @@ def roll_time_series(
             "Your time series container has zero or one rows!. Can not perform rolling."
         )
 
-    if column_id is not None:
+    if isinstance(column_id, list):  # several columns jointly identify a series
+        if not all(item in df.columns for item in column_id):  # NOTE(review): [] passes
+            raise AttributeError(
+                "The given columns for the id are not present in the data."
+            )
+        if len(column_id) == 1:  # a one-element list is handled like a plain name
+            column_id = column_id[0]
+    elif isinstance(column_id, str):
         if column_id not in df:
             raise AttributeError(
                 "The given column for the id is not present in the data."
@@ -491,13 +505,15 @@ def roll_time_series(
         raise ValueError(
             "You have to set the column_id which contains the ids of the different time series"
         )
-
-    if column_kind is not None:
-        grouper = [column_kind, column_id]
-    else:
-        grouper = [
-            column_id,
-        ]
+    # Group by the kind column first (if given), then by every id column.
+    # list.append() returns None, so the two steps must not be chained.
+    grouper = []
+    if column_kind is not None:
+        grouper.append(column_kind)
+    if isinstance(column_id, list):
+        grouper.extend(column_id)
+    else:
+        grouper.append(column_id)
 
     if column_sort is not None:
         # Require no Nans in column
@@ -510,8 +526,10 @@ def roll_time_series(
             # if rolling is enabled, the data should be uniformly sampled in this column
             # Build the differences between consecutive time sort values
 
-            differences = df.groupby(grouper)[column_sort].apply(
-                lambda x: x.values[:-1] - x.values[1:]
+            differences = (
+                df.groupby(grouper)[column_sort]
+                .apply(lambda x: x.values[:-1] - x.values[1:])  # each value minus its successor
+                .dropna()  # NOTE(review): presumably discards groups without a result — confirm
+            )
             # Write all of them into one big list
             differences = sum(map(list, differences), [])
@@ -575,7 +593,9 @@ def roll_time_series(
     distributor.close()
 
     df_shift = pd.concat(shifted_chunks, ignore_index=True)
-
+    # drop the initial id column(s): their values are contained in the new "id"; a column named "id" was overwritten and must stay
+    id_columns = [column_id] if isinstance(column_id, str) else column_id
+    df_shift = df_shift.drop([c for c in id_columns if c != "id"], axis=1)
     return df_shift.sort_values(by=["id", column_sort or "sort"])