diff --git a/nbs/common.modules.ipynb b/nbs/common.modules.ipynb index 307fd96e4..370d5e463 100644 --- a/nbs/common.modules.ipynb +++ b/nbs/common.modules.ipynb @@ -154,7 +154,7 @@ "\n", "\n", "class CausalConv1d(nn.Module):\n", - " \"\"\" Causal Convolution 1d\n", + " r\"\"\" Causal Convolution 1d\n", "\n", " Receives `x` input of dim [N,C_in,T], and computes a causal convolution\n", " in the time dimension. Skipping the H steps of the forecast horizon, through\n", diff --git a/nbs/common.scalers.ipynb b/nbs/common.scalers.ipynb index f49714a6b..2cf8fc53f 100644 --- a/nbs/common.scalers.ipynb +++ b/nbs/common.scalers.ipynb @@ -193,7 +193,7 @@ "source": [ "#| export\n", "def minmax_statistics(x, mask, eps=1e-6, dim=-1):\n", - " \"\"\" MinMax Scaler\n", + " r\"\"\" MinMax Scaler\n", "\n", " Standardizes temporal features by ensuring its range dweels between\n", " [0,1] range. This transformation is often used as an alternative \n", @@ -264,7 +264,7 @@ "source": [ "#| export\n", "def minmax1_statistics(x, mask, eps=1e-6, dim=-1):\n", - " \"\"\" MinMax1 Scaler\n", + " r\"\"\" MinMax1 Scaler\n", "\n", " Standardizes temporal features by ensuring its range dweels between\n", " [-1,1] range. This transformation is often used as an alternative \n", @@ -337,7 +337,7 @@ "source": [ "#| export\n", "def std_statistics(x, mask, dim=-1, eps=1e-6):\n", - " \"\"\" Standard Scaler\n", + " r\"\"\" Standard Scaler\n", "\n", " Standardizes features by removing the mean and scaling\n", " to unit variance along the `dim` dimension. \n", @@ -400,7 +400,7 @@ "source": [ "#| export\n", "def robust_statistics(x, mask, dim=-1, eps=1e-6):\n", - " \"\"\" Robust Median Scaler\n", + " r\"\"\" Robust Median Scaler\n", "\n", " Standardizes features by removing the median and scaling\n", " with the mean absolute deviation (mad) a robust estimator of variance.\n", @@ -475,7 +475,7 @@ "source": [ "#| export\n", "def invariant_statistics(x, mask, dim=-1, eps=1e-6):\n", - " \"\"\" Invariant Median Scaler\n", + " r\"\"\" Invariant Median Scaler\n", "\n", " Standardizes features by removing the median and scaling\n", " with the mean absolute deviation (mad) a robust estimator of variance.\n", @@ -615,7 +615,7 @@ "source": [ "#| export\n", "class TemporalNorm(nn.Module):\n", - " \"\"\" Temporal Normalization\n", + " r\"\"\" Temporal Normalization\n", "\n", " Standardization of the features is a common requirement for many \n", " machine learning estimators, and it is commonly achieved by removing \n", diff --git a/nbs/losses.pytorch.ipynb b/nbs/losses.pytorch.ipynb index 9fbf564dc..bc4564909 100644 --- a/nbs/losses.pytorch.ipynb +++ b/nbs/losses.pytorch.ipynb @@ -211,7 +211,7 @@ "source": [ "#| export\n", "class MAE(BasePointLoss):\n", - " \"\"\"Mean Absolute Error\n", + " r\"\"\"Mean Absolute Error\n", "\n", " Calculates Mean Absolute Error between\n", " `y` and `y_hat`. MAE measures the relative prediction\n", @@ -296,7 +296,7 @@ "source": [ "#| export\n", "class MSE(BasePointLoss):\n", - " \"\"\" Mean Squared Error\n", + " r\"\"\" Mean Squared Error\n", "\n", " Calculates Mean Squared Error between\n", " `y` and `y_hat`. MSE measures the relative prediction\n", @@ -382,7 +382,7 @@ "source": [ "#| export\n", "class RMSE(BasePointLoss):\n", - " \"\"\" Root Mean Squared Error\n", + " r\"\"\" Root Mean Squared Error\n", "\n", " Calculates Root Mean Squared Error between\n", " `y` and `y_hat`. 
RMSE measures the relative prediction\n", @@ -482,7 +482,7 @@ "source": [ "#| export\n", "class MAPE(BasePointLoss):\n", - " \"\"\" Mean Absolute Percentage Error\n", + " r\"\"\" Mean Absolute Percentage Error\n", "\n", " Calculates Mean Absolute Percentage Error between\n", " `y` and `y_hat`. MAPE measures the relative prediction\n", @@ -574,7 +574,7 @@ "source": [ "#| export\n", "class SMAPE(BasePointLoss):\n", - " \"\"\" Symmetric Mean Absolute Percentage Error\n", + " r\"\"\" Symmetric Mean Absolute Percentage Error\n", "\n", " Calculates Symmetric Mean Absolute Percentage Error between\n", " `y` and `y_hat`. SMAPE measures the relative prediction\n", @@ -669,7 +669,7 @@ "source": [ "#| export\n", "class MASE(BasePointLoss):\n", - " \"\"\" Mean Absolute Scaled Error \n", + " r\"\"\" Mean Absolute Scaled Error \n", " Calculates the Mean Absolute Scaled Error between\n", " `y` and `y_hat`. MASE measures the relative prediction\n", " accuracy of a forecasting method by comparinng the mean absolute errors\n", @@ -765,7 +765,7 @@ "source": [ "#| export\n", "class relMSE(BasePointLoss):\n", - " \"\"\"Relative Mean Squared Error\n", + " r\"\"\"Relative Mean Squared Error\n", " Computes Relative Mean Squared Error (relMSE), as proposed by Hyndman & Koehler (2006)\n", " as an alternative to percentage errors, to avoid measure unstability.\n", " $$ \\mathrm{relMSE}(\\\\mathbf{y}, \\\\mathbf{\\hat{y}}, \\\\mathbf{\\hat{y}}^{benchmark}) =\n", @@ -867,7 +867,7 @@ "source": [ "#| export\n", "class QuantileLoss(BasePointLoss):\n", - " \"\"\" Quantile Loss\n", + " r\"\"\" Quantile Loss\n", "\n", " Computes the quantile loss between `y` and `y_hat`.\n", " QL measures the deviation of a quantile forecast.\n", @@ -993,7 +993,7 @@ "source": [ "#| export\n", "class MQLoss(BasePointLoss):\n", - " \"\"\" Multi-Quantile loss\n", + " r\"\"\" Multi-Quantile loss\n", "\n", " Calculates the Multi-Quantile loss (MQL) between `y` and `y_hat`.\n", " MQL calculates the average multi-quantile Loss for\n", @@ -1203,7 +1203,7 @@ "\n", "\n", "class IQLoss(QuantileLoss):\n", - " \"\"\"Implicit Quantile Loss\n", + " r\"\"\"Implicit Quantile Loss\n", "\n", " Computes the quantile loss between `y` and `y_hat`, with the quantile `q` provided as an input to the network. \n", " IQL measures the deviation of a quantile forecast.\n", @@ -1485,7 +1485,7 @@ "\n", "\n", "class Tweedie(Distribution):\n", - " \"\"\" Tweedie Distribution\n", + " r\"\"\" Tweedie Distribution\n", "\n", " The Tweedie distribution is a compound probability, special case of exponential\n", " dispersion models EDMs defined by its mean-variance relationship.\n", @@ -2629,7 +2629,7 @@ " y: torch.Tensor,\n", " distr_args: torch.Tensor,\n", " mask: Union[torch.Tensor, None] = None):\n", - " \"\"\"\n", + " r\"\"\"\n", " Computes the negative log-likelihood objective function. \n", " To estimate the following predictive distribution:\n", "\n", @@ -2753,7 +2753,7 @@ "source": [ "#| export\n", "class PMM(torch.nn.Module):\n", - " \"\"\" Poisson Mixture Mesh\n", + " r\"\"\" Poisson Mixture Mesh\n", "\n", " This Poisson Mixture statistical model assumes independence across groups of \n", " data $\\mathcal{G}=\\{[g_{i}]\\}$, and estimates relationships within the group.\n", @@ -2928,7 +2928,7 @@ " y: torch.Tensor,\n", " distr_args: torch.Tensor,\n", " mask: Union[torch.Tensor, None] = None):\n", - " \"\"\"\n", + " r\"\"\"\n", " Computes the negative log-likelihood objective function. 
\n", " To estimate the following predictive distribution:\n", "\n", @@ -3104,7 +3104,7 @@ "source": [ "#| export\n", "class GMM(torch.nn.Module):\n", - " \"\"\" Gaussian Mixture Mesh\n", + " r\"\"\" Gaussian Mixture Mesh\n", "\n", " This Gaussian Mixture statistical model assumes independence across groups of \n", " data $\\mathcal{G}=\\{[g_{i}]\\}$, and estimates relationships within the group.\n", @@ -3283,7 +3283,7 @@ " y: torch.Tensor,\n", " distr_args: torch.Tensor,\n", " mask: Union[torch.Tensor, None] = None):\n", - " \"\"\"\n", + " r\"\"\"\n", " Computes the negative log-likelihood objective function. \n", " To estimate the following predictive distribution:\n", "\n", @@ -3460,7 +3460,7 @@ "source": [ "#| export\n", "class NBMM(torch.nn.Module):\n", - " \"\"\" Negative Binomial Mixture Mesh\n", + " r\"\"\" Negative Binomial Mixture Mesh\n", "\n", " This N. Binomial Mixture statistical model assumes independence across groups of \n", " data $\\mathcal{G}=\\{[g_{i}]\\}$, and estimates relationships within the group.\n", @@ -3641,7 +3641,7 @@ " y: torch.Tensor,\n", " distr_args: torch.Tensor,\n", " mask: Union[torch.Tensor, None] = None):\n", - " \"\"\"\n", + " r\"\"\"\n", " Computes the negative log-likelihood objective function. \n", " To estimate the following predictive distribution:\n", "\n", @@ -3790,7 +3790,7 @@ "source": [ "#| export\n", "class HuberLoss(BasePointLoss):\n", - " \"\"\" Huber Loss\n", + " r\"\"\" Huber Loss\n", "\n", " The Huber loss, employed in robust regression, is a loss function that \n", " exhibits reduced sensitivity to outliers in data when compared to the \n", @@ -3887,7 +3887,7 @@ "source": [ "#| export\n", "class TukeyLoss(BasePointLoss):\n", - " \"\"\" Tukey Loss\n", + " r\"\"\" Tukey Loss\n", "\n", " The Tukey loss function, also known as Tukey's biweight function, is a \n", " robust statistical loss function used in robust statistics. Tukey's loss exhibits\n", @@ -4019,7 +4019,7 @@ "source": [ "#| export\n", "class HuberQLoss(BasePointLoss):\n", - " \"\"\" Huberized Quantile Loss\n", + " r\"\"\" Huberized Quantile Loss\n", "\n", " The Huberized quantile loss is a modified version of the quantile loss function that\n", " combines the advantages of the quantile loss and the Huber loss. It is commonly used\n", @@ -4125,7 +4125,7 @@ "source": [ "#| export\n", "class HuberMQLoss(BasePointLoss):\n", - " \"\"\" Huberized Multi-Quantile loss\n", + " r\"\"\" Huberized Multi-Quantile loss\n", "\n", " The Huberized Multi-Quantile loss (HuberMQL) is a modified version of the multi-quantile loss function \n", " that combines the advantages of the quantile loss and the Huber loss. 
HuberMQL is commonly used in regression \n", @@ -4448,7 +4448,7 @@ "source": [ "#| export\n", "class Accuracy(BasePointLoss):\n", - " \"\"\" Accuracy\n", + " r\"\"\" Accuracy\n", "\n", " Computes the accuracy between categorical `y` and `y_hat`.\n", " This evaluation metric is only meant for evalution, as it\n", @@ -4535,7 +4535,7 @@ "source": [ "#| export\n", "class sCRPS(BasePointLoss):\n", - " \"\"\"Scaled Continues Ranked Probability Score\n", + " r\"\"\"Scaled Continues Ranked Probability Score\n", "\n", " Calculates a scaled variation of the CRPS, as proposed by Rangapuram (2021),\n", " to measure the accuracy of predicted quantiles `y_hat` compared to the observation `y`.\n", diff --git a/nbs/models.hint.ipynb b/nbs/models.hint.ipynb index 49398c0a6..f53f4b4c3 100644 --- a/nbs/models.hint.ipynb +++ b/nbs/models.hint.ipynb @@ -83,7 +83,7 @@ "source": [ "#| export\n", "def get_bottomup_P(S: np.ndarray):\n", - " \"\"\"BottomUp Reconciliation Matrix.\n", + " r\"\"\"BottomUp Reconciliation Matrix.\n", "\n", " Creates BottomUp hierarchical \\\"projection\\\" matrix is defined as:\n", " $$\\mathbf{P}_{\\\\text{BU}} = [\\mathbf{0}_{\\mathrm{[b],[a]}}\\;|\\;\\mathbf{I}_{\\mathrm{[b][b]}}]$$ \n", @@ -106,7 +106,7 @@ " return P\n", "\n", "def get_mintrace_ols_P(S: np.ndarray):\n", - " \"\"\"MinTraceOLS Reconciliation Matrix.\n", + " r\"\"\"MinTraceOLS Reconciliation Matrix.\n", "\n", " Creates MinTraceOLS reconciliation matrix as proposed by Wickramasuriya et al.\n", "\n", @@ -137,7 +137,7 @@ " return P\n", "\n", "def get_mintrace_wls_P(S: np.ndarray):\n", - " \"\"\"MinTraceOLS Reconciliation Matrix.\n", + " r\"\"\"MinTraceOLS Reconciliation Matrix.\n", "\n", " Creates MinTraceOLS reconciliation matrix as proposed by Wickramasuriya et al.\n", " Depending on a weighted GLS estimator and an estimator of the covariance matrix of the coherency errors $\\mathbf{W}_{h}$.\n", diff --git a/neuralforecast/common/_modules.py b/neuralforecast/common/_modules.py index 3b7d14ec2..d001b1679 100644 --- a/neuralforecast/common/_modules.py +++ b/neuralforecast/common/_modules.py @@ -82,7 +82,7 @@ def forward(self, x): class CausalConv1d(nn.Module): - """Causal Convolution 1d + r"""Causal Convolution 1d Receives `x` input of dim [N,C_in,T], and computes a causal convolution in the time dimension. Skipping the H steps of the forecast horizon, through diff --git a/neuralforecast/common/_scalers.py b/neuralforecast/common/_scalers.py index f11187d21..43fe6a421 100644 --- a/neuralforecast/common/_scalers.py +++ b/neuralforecast/common/_scalers.py @@ -1,3 +1,5 @@ +"""Temporal normalization has proven to be essential in neural forecasting tasks, as it enables network's non-linearities to express themselves. Forecasting scaling methods take particular interest in the temporal dimension where most of the variance dwells, contrary to other deep learning techniques like `BatchNorm` that normalizes across batch and temporal dimensions, and `LayerNorm` that normalizes across the feature dimension. Currently we support the following techniques: `std`, `median`, `norm`, `norm1`, `invariant`, `revin`.""" + # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/common.scalers.ipynb. # %% auto 0 @@ -56,7 +58,7 @@ def masked_mean(x, mask, dim=-1, keepdim=True): # %% ../../nbs/common.scalers.ipynb 14 def minmax_statistics(x, mask, eps=1e-6, dim=-1): - """MinMax Scaler + r"""MinMax Scaler Standardizes temporal features by ensuring its range dweels between [0,1] range. 
This transformation is often used as an alternative @@ -106,7 +108,7 @@ def inv_minmax_scaler(z, x_min, x_range): # %% ../../nbs/common.scalers.ipynb 17 def minmax1_statistics(x, mask, eps=1e-6, dim=-1): - """MinMax1 Scaler + r"""MinMax1 Scaler Standardizes temporal features by ensuring its range dweels between [-1,1] range. This transformation is often used as an alternative @@ -158,7 +160,7 @@ def inv_minmax1_scaler(z, x_min, x_range): # %% ../../nbs/common.scalers.ipynb 20 def std_statistics(x, mask, dim=-1, eps=1e-6): - """Standard Scaler + r"""Standard Scaler Standardizes features by removing the mean and scaling to unit variance along the `dim` dimension. @@ -196,7 +198,7 @@ def inv_std_scaler(z, x_mean, x_std): # %% ../../nbs/common.scalers.ipynb 23 def robust_statistics(x, mask, dim=-1, eps=1e-6): - """Robust Median Scaler + r"""Robust Median Scaler Standardizes features by removing the median and scaling with the mean absolute deviation (mad) a robust estimator of variance. @@ -246,7 +248,7 @@ def inv_robust_scaler(z, x_median, x_mad): # %% ../../nbs/common.scalers.ipynb 26 def invariant_statistics(x, mask, dim=-1, eps=1e-6): - """Invariant Median Scaler + r"""Invariant Median Scaler Standardizes features by removing the median and scaling with the mean absolute deviation (mad) a robust estimator of variance. @@ -328,7 +330,7 @@ def inv_identity_scaler(z, x_shift, x_scale): # %% ../../nbs/common.scalers.ipynb 33 class TemporalNorm(nn.Module): - """Temporal Normalization + r"""Temporal Normalization Standardization of the features is a common requirement for many machine learning estimators, and it is commonly achieved by removing diff --git a/neuralforecast/losses/pytorch.py b/neuralforecast/losses/pytorch.py index e1b4477a9..1d710d3ee 100644 --- a/neuralforecast/losses/pytorch.py +++ b/neuralforecast/losses/pytorch.py @@ -1,3 +1,5 @@ +"""NeuralForecast contains a collection PyTorch Loss classes aimed to be used during the models' optimization.""" + # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/losses.pytorch.ipynb. # %% auto 0 @@ -101,7 +103,7 @@ def _compute_weights(self, y, mask): # %% ../../nbs/losses.pytorch.ipynb 11 class MAE(BasePointLoss): - """Mean Absolute Error + r"""Mean Absolute Error Calculates Mean Absolute Error between `y` and `y_hat`. MAE measures the relative prediction @@ -143,7 +145,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 16 class MSE(BasePointLoss): - """Mean Squared Error + r"""Mean Squared Error Calculates Mean Squared Error between `y` and `y_hat`. MSE measures the relative prediction @@ -185,7 +187,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 21 class RMSE(BasePointLoss): - """Root Mean Squared Error + r"""Root Mean Squared Error Calculates Root Mean Squared Error between `y` and `y_hat`. RMSE measures the relative prediction @@ -231,7 +233,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 27 class MAPE(BasePointLoss): - """Mean Absolute Percentage Error + r"""Mean Absolute Percentage Error Calculates Mean Absolute Percentage Error between `y` and `y_hat`. MAPE measures the relative prediction @@ -279,7 +281,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 32 class SMAPE(BasePointLoss): - """Symmetric Mean Absolute Percentage Error + r"""Symmetric Mean Absolute Percentage Error Calculates Symmetric Mean Absolute Percentage Error between `y` and `y_hat`. 
SMAPE measures the relative prediction @@ -329,7 +331,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 37 class MASE(BasePointLoss): - """Mean Absolute Scaled Error + r"""Mean Absolute Scaled Error Calculates the Mean Absolute Scaled Error between `y` and `y_hat`. MASE measures the relative prediction accuracy of a forecasting method by comparinng the mean absolute errors @@ -385,7 +387,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 42 class relMSE(BasePointLoss): - """Relative Mean Squared Error + r"""Relative Mean Squared Error Computes Relative Mean Squared Error (relMSE), as proposed by Hyndman & Koehler (2006) as an alternative to percentage errors, to avoid measure unstability. $$ \mathrm{relMSE}(\\mathbf{y}, \\mathbf{\hat{y}}, \\mathbf{\hat{y}}^{benchmark}) = @@ -437,7 +439,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 47 class QuantileLoss(BasePointLoss): - """Quantile Loss + r"""Quantile Loss Computes the quantile loss between `y` and `y_hat`. QL measures the deviation of a quantile forecast. @@ -514,7 +516,7 @@ def quantiles_to_outputs(quantiles): # %% ../../nbs/losses.pytorch.ipynb 53 class MQLoss(BasePointLoss): - """Multi-Quantile loss + r"""Multi-Quantile loss Calculates the Multi-Quantile loss (MQL) between `y` and `y_hat`. MQL calculates the average multi-quantile Loss for @@ -661,7 +663,7 @@ def forward(self, tau: torch.Tensor) -> torch.Tensor: class IQLoss(QuantileLoss): - """Implicit Quantile Loss + r"""Implicit Quantile Loss Computes the quantile loss between `y` and `y_hat`, with the quantile `q` provided as an input to the network. IQL measures the deviation of a quantile forecast. @@ -882,7 +884,7 @@ def est_beta(mu, rho): class Tweedie(Distribution): - """Tweedie Distribution + r"""Tweedie Distribution The Tweedie distribution is a compound probability, special case of exponential dispersion models EDMs defined by its mean-variance relationship. @@ -2022,7 +2024,7 @@ def __call__( distr_args: torch.Tensor, mask: Union[torch.Tensor, None] = None, ): - """ + r""" Computes the negative log-likelihood objective function. To estimate the following predictive distribution: @@ -2051,7 +2053,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 75 class PMM(torch.nn.Module): - """Poisson Mixture Mesh + r"""Poisson Mixture Mesh This Poisson Mixture statistical model assumes independence across groups of data $\mathcal{G}=\{[g_{i}]\}$, and estimates relationships within the group. @@ -2244,7 +2246,7 @@ def __call__( distr_args: torch.Tensor, mask: Union[torch.Tensor, None] = None, ): - """ + r""" Computes the negative log-likelihood objective function. To estimate the following predictive distribution: @@ -2277,7 +2279,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 83 class GMM(torch.nn.Module): - """Gaussian Mixture Mesh + r"""Gaussian Mixture Mesh This Gaussian Mixture statistical model assumes independence across groups of data $\mathcal{G}=\{[g_{i}]\}$, and estimates relationships within the group. @@ -2474,7 +2476,7 @@ def __call__( distr_args: torch.Tensor, mask: Union[torch.Tensor, None] = None, ): - """ + r""" Computes the negative log-likelihood objective function. To estimate the following predictive distribution: @@ -2506,7 +2508,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 91 class NBMM(torch.nn.Module): - """Negative Binomial Mixture Mesh + r"""Negative Binomial Mixture Mesh This N. 
Binomial Mixture statistical model assumes independence across groups of data $\mathcal{G}=\{[g_{i}]\}$, and estimates relationships within the group. @@ -2707,7 +2709,7 @@ def __call__( distr_args: torch.Tensor, mask: Union[torch.Tensor, None] = None, ): - """ + r""" Computes the negative log-likelihood objective function. To estimate the following predictive distribution: @@ -2733,7 +2735,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 98 class HuberLoss(BasePointLoss): - """ Huber Loss + r""" Huber Loss The Huber loss, employed in robust regression, is a loss function that exhibits reduced sensitivity to outliers in data when compared to the @@ -2786,7 +2788,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 103 class TukeyLoss(BasePointLoss): - """ Tukey Loss + r""" Tukey Loss The Tukey loss function, also known as Tukey's biweight function, is a robust statistical loss function used in robust statistics. Tukey's loss exhibits @@ -2879,7 +2881,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 108 class HuberQLoss(BasePointLoss): - """Huberized Quantile Loss + r"""Huberized Quantile Loss The Huberized quantile loss is a modified version of the quantile loss function that combines the advantages of the quantile loss and the Huber loss. It is commonly used @@ -2944,7 +2946,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 113 class HuberMQLoss(BasePointLoss): - """Huberized Multi-Quantile loss + r"""Huberized Multi-Quantile loss The Huberized Multi-Quantile loss (HuberMQL) is a modified version of the multi-quantile loss function that combines the advantages of the quantile loss and the Huber loss. HuberMQL is commonly used in regression @@ -3172,7 +3174,7 @@ def domain_map(self, y_hat): # %% ../../nbs/losses.pytorch.ipynb 124 class Accuracy(BasePointLoss): - """Accuracy + r"""Accuracy Computes the accuracy between categorical `y` and `y_hat`. This evaluation metric is only meant for evalution, as it @@ -3226,7 +3228,7 @@ def __call__( # %% ../../nbs/losses.pytorch.ipynb 128 class sCRPS(BasePointLoss): - """Scaled Continues Ranked Probability Score + r"""Scaled Continues Ranked Probability Score Calculates a scaled variation of the CRPS, as proposed by Rangapuram (2021), to measure the accuracy of predicted quantiles `y_hat` compared to the observation `y`. diff --git a/neuralforecast/models/hint.py b/neuralforecast/models/hint.py index c0ab931f6..df86812a0 100644 --- a/neuralforecast/models/hint.py +++ b/neuralforecast/models/hint.py @@ -11,7 +11,7 @@ # %% ../../nbs/models.hint.ipynb 7 def get_bottomup_P(S: np.ndarray): - """BottomUp Reconciliation Matrix. + r"""BottomUp Reconciliation Matrix. Creates BottomUp hierarchical \"projection\" matrix is defined as: $$\mathbf{P}_{\\text{BU}} = [\mathbf{0}_{\mathrm{[b],[a]}}\;|\;\mathbf{I}_{\mathrm{[b][b]}}]$$ @@ -35,7 +35,7 @@ def get_bottomup_P(S: np.ndarray): def get_mintrace_ols_P(S: np.ndarray): - """MinTraceOLS Reconciliation Matrix. + r"""MinTraceOLS Reconciliation Matrix. Creates MinTraceOLS reconciliation matrix as proposed by Wickramasuriya et al. @@ -67,7 +67,7 @@ def get_mintrace_ols_P(S: np.ndarray): def get_mintrace_wls_P(S: np.ndarray): - """MinTraceOLS Reconciliation Matrix. + r"""MinTraceOLS Reconciliation Matrix. Creates MinTraceOLS reconciliation matrix as proposed by Wickramasuriya et al. Depending on a weighted GLS estimator and an estimator of the covariance matrix of the coherency errors $\mathbf{W}_{h}$.
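The substantive change throughout this patch is the `r` prefix on every docstring that embeds LaTeX. A minimal, standalone sketch of why the prefix matters (not part of the patch; the function names below are hypothetical): in a plain docstring Python processes backslash escapes, so `\t` inside `\text{...}` silently becomes a tab and unrecognized sequences such as `\m` trigger invalid-escape warnings on current interpreters, while a raw docstring keeps the LaTeX verbatim.

def plain_doc():
    """Projection: $\\mathbf{P}_{\text{BU}}$"""  # "\t" is interpreted as a tab character

def raw_doc():
    r"""Projection: $\mathbf{P}_{\text{BU}}$"""  # raw literal: backslashes kept verbatim

print("\t" in plain_doc.__doc__)  # True  -> the rendered math is silently corrupted
print("\t" in raw_doc.__doc__)    # False -> the LaTeX survives for the docs renderer

Under the raw prefix, LaTeX commands that happen to collide with string escapes (`\text`, `\theta`, `\nabla`, `\frac`, ...) are passed through to the documentation renderer unchanged instead of being turned into tabs, newlines, form feeds, or warnings.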