
Commit d899d02

authored Mar 7, 2025
Merge pull request #607 from WenjieDu/(fix)timemixer
Fix x and x_mark shape not consistent bug in forecasting TimeMixer
2 parents af9de8b + 1e82e65 commit d899d02

File tree: 5 files changed, +38 −40 lines changed

pypots/forecasting/timemixer/core.py
pypots/forecasting/timemixer/model.py
pypots/nn/modules/timemixer/backbone.py
tests/forecasting/timemixer.py
tests/global_test_config.py

pypots/forecasting/timemixer/core.py (+9 −13)
@@ -10,7 +10,6 @@
 import torch
 import torch.nn as nn
 
-from ...nn.functional import nonstationary_norm, nonstationary_denorm
 from ...nn.functional.error import calc_mse
 from ...nn.modules.timemixer import BackboneTimeMixer
 
@@ -33,13 +32,12 @@ def __init__(
         moving_avg: int,
         downsampling_layers: int,
         downsampling_window: int,
-        apply_nonstationary_norm: bool = False,
+        use_norm: bool = False,
     ):
         super().__init__()
 
         self.n_pred_steps = n_pred_steps
         self.n_pred_features = n_pred_features
-        self.apply_nonstationary_norm = apply_nonstationary_norm
 
         assert term in ["long", "short"], "forecasting term should be either 'long' or 'short'"
         self.model = BackboneTimeMixer(
@@ -60,13 +58,15 @@ def __init__(
             downsampling_window=downsampling_window,
             downsampling_method="avg",
             use_future_temporal_feature=False,
+            use_norm=use_norm,
         )
 
         # for the imputation task, the output dim is the same as input dim
         self.output_projection = nn.Linear(n_features, n_pred_features)
 
     def forward(self, inputs: dict) -> dict:
-        X, missing_mask = inputs["X"], inputs["missing_mask"]
+        X = inputs["X"]
+        # missing_mask = inputs["missing_mask"]
 
         if self.training:
             X_pred, X_pred_missing_mask = inputs["X_pred"], inputs["X_pred_missing_mask"]
@@ -77,16 +77,12 @@ def forward(self, inputs: dict) -> dict:
                 torch.ones(batch_size, self.n_pred_steps, self.n_pred_features),
             )
 
-        if self.apply_nonstationary_norm:
-            # Normalization from Non-stationary Transformer
-            X, means, stdev = nonstationary_norm(X, missing_mask)
-
         # TimeMixer processing
-        enc_out = self.model.forecast(X, missing_mask)
-
-        if self.apply_nonstationary_norm:
-            # De-Normalization from Non-stationary Transformer
-            enc_out = nonstationary_denorm(enc_out, means, stdev)
+        # WDU: missing_mask should not be passed into the model's forward processing, because the official
+        # implementation does not accept POTS (partially observed time series) for the forecasting task.
+        # Passing it in triggers the "x and x_mark shape not consistent" bug.
+        # enc_out = self.model.forecast(X, missing_mask)
+        enc_out = self.model.forecast(X, None)
 
         # project back to the original data space
         forecasting_result = self.output_projection(enc_out)
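
Why the fix works: the second positional argument of BackboneTimeMixer.forecast is x_mark_enc, the slot for time-covariate features, not for an observation mask. Assuming the backbone mirrors the official TimeMixer's multi-scale preprocessing (avg-pooling x while stride-slicing x_mark), the two operations disagree in length whenever an intermediate scale is odd, which is a plausible source of the "x and x_mark shape not consistent" error. A toy reproduction of that length mismatch, with illustrative shapes not taken from the commit:

    import torch

    # Hypothetical shapes: an odd-length scale with downsampling window 2.
    T, w = 7, 2
    x = torch.randn(8, T, 5)       # (batch, steps, features)
    mask = torch.ones_like(x)      # missing_mask shares x's shape

    # avg-pooling floors the length; stride-slicing ceils it
    x_down = torch.nn.AvgPool1d(w)(x.permute(0, 2, 1)).permute(0, 2, 1)
    mask_down = mask[:, ::w, :]    # what happens if the mask rides along as x_mark

    print(x_down.shape)            # torch.Size([8, 3, 5])
    print(mask_down.shape)         # torch.Size([8, 4, 5]) -- inconsistent with x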

pypots/forecasting/timemixer/model.py (+5 −7)
@@ -70,10 +70,8 @@ class TimeMixer(BaseNNForecaster):
     downsampling_window :
         The window size for downsampling.
 
-    apply_nonstationary_norm :
-        Whether to apply non-stationary normalization to the input data for TimeMixer.
-        Please refer to :cite:`liu2022nonstationary` for details about non-stationary normalization,
-        which is not the idea of the original TimeMixer paper. Hence, we make it optional and default not to use here.
+    use_norm :
+        Whether to apply RevIN to the input data for TimeMixer.
 
     batch_size :
         The batch size for training and evaluating the model.
@@ -143,7 +141,7 @@ def __init__(
         moving_avg: int = 5,
         downsampling_layers: int = 3,
         downsampling_window: int = 2,
-        apply_nonstationary_norm: bool = False,
+        use_norm: bool = False,
         batch_size: int = 32,
         epochs: int = 100,
         patience: Optional[int] = None,
@@ -184,7 +182,7 @@ def __init__(
         self.moving_avg = moving_avg
         self.downsampling_layers = downsampling_layers
         self.downsampling_window = downsampling_window
-        self.apply_nonstationary_norm = apply_nonstationary_norm
+        self.use_norm = use_norm
 
         # set up the model
         self.model = _TimeMixer(
@@ -203,7 +201,7 @@ def __init__(
             self.moving_avg,
             self.downsampling_layers,
             self.downsampling_window,
-            self.apply_nonstationary_norm,
+            self.use_norm,
         )
         self._print_model_size()
         self._send_model_to_given_device()
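
For reference, RevIN (reversible instance normalization) standardizes each series with its own statistics on the way into the model and restores those statistics on the way out. A minimal sketch of the idea, not the library's actual RevIN layer (which, as the backbone diff below shows, also accepts an extra positional argument and may carry learnable affine parameters):

    import torch

    class SimpleRevIN(torch.nn.Module):
        # Sketch only; n_features mirrors the RevIN(n_features) call in the diff
        # below but is unused in this simplified version.
        def __init__(self, n_features: int, eps: float = 1e-5):
            super().__init__()
            self.eps = eps

        def forward(self, x: torch.Tensor, mode: str = "norm") -> torch.Tensor:
            if mode == "norm":
                # keep per-instance statistics so the output can be denormalized
                self.means = x.mean(dim=1, keepdim=True)
                self.stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + self.eps)
                return (x - self.means) / self.stdev
            return x * self.stdev + self.means  # mode == "denorm"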

pypots/nn/modules/timemixer/backbone.py (+20 −17)
@@ -35,6 +35,7 @@ def __init__(
         downsampling_window: int,
         downsampling_method: str,
         use_future_temporal_feature: bool,
+        use_norm: bool = False,
         embed="fixed",
         freq="h",
         n_classes=None,
@@ -50,6 +51,7 @@ def __init__(
         self.downsampling_window = downsampling_window
         self.downsampling_layers = downsampling_layers
         self.downsampling_method = downsampling_method
+        self.use_norm = use_norm
         self.use_future_temporal_feature = use_future_temporal_feature
 
         assert downsampling_method in ["max", "avg", "conv"], "downsampling_method must be in ['max', 'avg', 'conv']"
@@ -74,12 +76,13 @@ def __init__(
         )
         self.preprocess = SeriesDecompositionBlock(moving_avg)
 
-        if self.channel_independence == 1:
+        if self.channel_independence:
             self.enc_embedding = DataEmbedding(1, d_model, embed, freq, dropout, with_pos=False)
         else:
             self.enc_embedding = DataEmbedding(n_features, d_model, embed, freq, dropout, with_pos=False)
 
-        self.normalize_layers = torch.nn.ModuleList([RevIN(n_features) for _ in range(downsampling_layers + 1)])
+        if self.use_norm:
+            self.normalize_layers = torch.nn.ModuleList([RevIN(n_features) for _ in range(downsampling_layers + 1)])
 
         if task_name == "long_term_forecast" or task_name == "short_term_forecast":
             self.predict_layers = torch.nn.ModuleList(
@@ -92,7 +95,7 @@ def __init__(
                 ]
             )
 
-            if self.channel_independence == 1:
+            if self.channel_independence:
                 self.projection_layer = nn.Linear(d_model, 1, bias=True)
             else:
                 self.projection_layer = nn.Linear(d_model, n_pred_features, bias=True)
@@ -117,7 +120,7 @@ def __init__(
                 ]
             )
         elif task_name == "imputation" or task_name == "anomaly_detection":
-            if self.channel_independence == 1:
+            if self.channel_independence:
                 self.projection_layer = nn.Linear(d_model, 1, bias=True)
             else:
                 self.projection_layer = nn.Linear(d_model, n_pred_features, bias=True)
@@ -137,7 +140,7 @@ def out_projection(self, dec_out, i, out_res):
         return dec_out
 
     def pre_enc(self, x_list):
-        if self.channel_independence == 1:
+        if self.channel_independence:
            return x_list, None
        else:
            out1_list = []
@@ -197,7 +200,7 @@ def __multi_scale_process_inputs(self, x_enc, x_mark_enc):
 
     def forecast(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None):
         if self.use_future_temporal_feature:
-            if self.channel_independence == 1:
+            if self.channel_independence:
                 B, T, N = x_enc.size()
                 x_mark_dec = x_mark_dec.repeat(N, 1, 1)
                 self.x_mark_dec = self.enc_embedding(None, x_mark_dec)
@@ -211,8 +214,8 @@ def forecast(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None):
         if x_mark_enc is not None:
             for i, x, x_mark in zip(range(len(x_enc)), x_enc, x_mark_enc):
                 B, T, N = x.size()
-                x = self.normalize_layers[i](x, x_mark, mode="norm")
-                if self.channel_independence == 1:
+                x = self.normalize_layers[i](x, x_mark, mode="norm") if self.use_norm else x
+                if self.channel_independence:
                     x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
                     x_mark = x_mark.repeat(N, 1, 1)
                 x_list.append(x)
@@ -223,8 +226,8 @@ def forecast(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None):
                 x_enc,
             ):
                 B, T, N = x.size()
-                x = self.normalize_layers[i](x, mode="norm")
-                if self.channel_independence == 1:
+                x = self.normalize_layers[i](x, mode="norm") if self.use_norm else x
+                if self.channel_independence:
                     x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
                 x_list.append(x)

@@ -248,12 +251,12 @@ def forecast(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None):
             dec_out_list = self.future_multi_mixing(B, enc_out_list, x_list)
 
             dec_out = torch.stack(dec_out_list, dim=-1).sum(-1)
-            dec_out = self.normalize_layers[0](dec_out, mode="denorm")
+            dec_out = self.normalize_layers[0](dec_out, mode="denorm") if self.use_norm else dec_out
             return dec_out
 
     def future_multi_mixing(self, B, enc_out_list, x_list):
         dec_out_list = []
-        if self.channel_independence == 1:
+        if self.channel_independence:
             x_list = x_list[0]
             for i, enc_out in zip(range(len(x_list)), enc_out_list):
                 dec_out = self.predict_layers[i](enc_out.permute(0, 2, 1)).permute(0, 2, 1)  # align temporal dimension
@@ -310,8 +313,8 @@ def anomaly_detection(self, x_enc):
                 x_enc,
             ):
                 B, T, N = x.size()
-                x = self.normalize_layers[i](x, "norm")
-                if self.channel_independence == 1:
+                x = self.normalize_layers[i](x, "norm") if self.use_norm else x
+                if self.channel_independence:
                     x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
                 x_list.append(x)

@@ -328,7 +331,7 @@ def anomaly_detection(self, x_enc):
         dec_out = self.projection_layer(enc_out_list[0])
         dec_out = dec_out.reshape(B, self.c_out, -1).permute(0, 2, 1).contiguous()
 
-        dec_out = self.normalize_layers[0](dec_out, "denorm")
+        dec_out = self.normalize_layers[0](dec_out, "denorm") if self.use_norm else dec_out
         return dec_out
 
     def imputation(self, x_enc, x_mark_enc):
@@ -341,15 +344,15 @@ def imputation(self, x_enc, x_mark_enc):
         if x_mark_enc is not None:
             for i, x, x_mark in zip(range(len(x_enc)), x_enc, x_mark_enc):
                 B, T, N = x.size()
-                if self.channel_independence == 1:
+                if self.channel_independence:
                     x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
                 x_list.append(x)
                 x_mark = x_mark.repeat(N, 1, 1)
                 x_mark_list.append(x_mark)
         else:
             for i, x in zip(range(len(x_enc)), x_enc):
                 B, T, N = x.size()
-                if self.channel_independence == 1:
+                if self.channel_independence:
                     x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
                 x_list.append(x)
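
The channel-independence branches above all funnel through the same reshape: each of the N variates is folded into the batch dimension and treated as a univariate series of shape (T, 1). A quick illustration of what that reshape does, with toy shapes:

    import torch

    B, T, N = 2, 7, 5
    x = torch.randn(B, T, N)
    x_ci = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)
    print(x_ci.shape)  # torch.Size([10, 7, 1]): N variates folded into the batch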

tests/forecasting/timemixer.py (+2 −1)
@@ -54,7 +54,8 @@ class TestTimeMixer(unittest.TestCase):
         d_model=32,
         d_ffn=32,
         moving_avg=25,
-        downsampling_window=1,
+        downsampling_window=2,
+        use_norm=True,
         dropout=0.1,
         epochs=EPOCHS,
         saving_path=saving_path,
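
Pulled together, a hedged usage sketch of the updated test configuration; it assumes TimeMixer is exported from pypots.forecasting like other PyPOTS forecasters, and any required hyperparameters not visible in this commit are omitted:

    from pypots.forecasting import TimeMixer

    model = TimeMixer(
        n_steps=14,        # N_STEPS from the updated global test config below
        n_features=5,      # N_FEATURES
        n_pred_steps=2,    # N_PRED_STEPS
        n_pred_features=5,
        term="short",      # must be "long" or "short" per the assert in core.py
        d_model=32,
        d_ffn=32,
        moving_avg=25,
        downsampling_window=2,
        use_norm=True,     # the renamed switch gating RevIN
        dropout=0.1,
        epochs=2,
        # remaining constructor arguments omitted from this sketch
    )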

tests/global_test_config.py (+2 −2)
@@ -20,8 +20,8 @@
 # set the number of epochs for all model training
 EPOCHS = 2
 # set the number of prediction steps for forecasting models
-N_STEPS = 12
-N_PRED_STEPS = 3
+N_STEPS = 14
+N_PRED_STEPS = 2
 N_FEATURES = 5
 # tensorboard and model files saving directory
 RESULT_SAVING_DIR = "testing_results"
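
As a sanity check on the new value: with downsampling_window=2 from the test above and the forecaster's default of three downsampling layers (see model.py), avg-pool downsampling shortens a 14-step input roughly as follows, assuming the length floors at each scale:

    n_steps, window, layers = 14, 2, 3
    lengths = [n_steps]
    for _ in range(layers):
        lengths.append(lengths[-1] // window)  # AvgPool1d(kernel=stride=window) floors
    print(lengths)  # [14, 7, 3, 1]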
