|
| 1 | + |
| 2 | +import numpy as np |
| 3 | +from typing import Optional, Tuple |
| 4 | +from sklearn.exceptions import NotFittedError |
| 5 | + |
| 6 | +from dtaianomaly.utils import get_dimension |
| 7 | +from dtaianomaly.preprocessing.Preprocessor import Preprocessor |
| 8 | + |
| 9 | + |
class RobustScaler(Preprocessor):
    """
    Scale the time series using robust statistics.

    The :py:class:`~dtaianomaly.preprocessing.RobustScaler` is similar to
    :py:class:`~dtaianomaly.preprocessing.StandardScaler`, but uses robust
    statistics rather than mean and standard deviation. The center of the data
    is computed via the median, and the scale is computed as the range between
    two quantiles (by default uses the IQR). This ensures that scaling is less
    affected by outliers.

    For a time series :math:`x`, center :math:`c` and scale :math:`s`, observation
    :math:`x_i` is scaled to observation :math:`y_i` using the following equation:

    .. math::

       y_i = \\frac{x_i - c}{s}

    Notice the similarity with the formula for standard scaling. For multivariate
    time series, each attribute is scaled independently, each with an independent
    scale and center.

    Parameters
    ----------
    quantile_range: tuple of (float, float), default = (25.0, 75.0)
        Quantile range used to compute the ``scale_`` of the robust scaler.
        By default, this is equal to the Inter Quantile Range (IQR). The first
        value of the quantile range corresponds to the smallest quantile, the
        second value corresponds to the larger quantile. If the first value is
        not smaller than the second value, an error will be thrown. The values
        must also both be in the range [0, 100].

    Attributes
    ----------
    center_: array-like of shape (n_attributes)
        The median value in each attribute of the training data.
    scale_: array-like of shape (n_attributes)
        The quantile range for each attribute of the training data.

    Raises
    ------
    NotFittedError
        If the `transform` method is called before fitting this RobustScaler.
    """
    quantile_range: Tuple[float, float]
    center_: np.ndarray
    scale_: np.ndarray

    def __init__(self, quantile_range: Tuple[float, float] = (25.0, 75.0)):
        # Validate the quantile range eagerly so that misconfiguration fails at
        # construction time rather than at fit time. Note: bool is a subclass of
        # int, so it must be excluded explicitly.
        if not isinstance(quantile_range, tuple):
            raise TypeError("`quantile_range` should be tuple")
        if len(quantile_range) != 2:
            raise ValueError("'quantile_range' should consist of exactly two values (length of 2)")
        if not isinstance(quantile_range[0], (float, int)) or isinstance(quantile_range[0], bool):
            raise TypeError("The first element `quantile_range` should be a float or int")
        if not isinstance(quantile_range[1], (float, int)) or isinstance(quantile_range[1], bool):
            raise TypeError("The second element `quantile_range` should be a float or int")
        if quantile_range[0] < 0.0:
            raise ValueError("the first element in 'quantile_range' must be at least 0.0")
        if quantile_range[1] > 100.0:
            raise ValueError("the second element in 'quantile_range' must be at most 100.0")
        if not quantile_range[0] < quantile_range[1]:
            raise ValueError("the first element in 'quantile_range' must be at smaller than the second element in 'quantile_range'")
        self.quantile_range = quantile_range

    def _fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> 'RobustScaler':
        """
        Compute the per-attribute center (median) and scale (quantile range)
        of ``X``. NaN-aware functions are used throughout so that missing
        values do not poison the fitted statistics.
        """
        if get_dimension(X) == 1:
            # Univariate case: statistics collapse to a single value, wrapped
            # in a length-1 array so _transform can broadcast uniformly.
            self.center_ = np.array([np.nanmedian(X)])
            # Use nanpercentile (not percentile) for consistency with nanmedian:
            # otherwise a single NaN would make scale_ NaN while center_ stays robust.
            q_min = np.nanpercentile(X, q=self.quantile_range[0])
            q_max = np.nanpercentile(X, q=self.quantile_range[1])
            self.scale_ = np.array([q_max - q_min])
        else:
            # Multivariate case: one center and scale per attribute (column).
            self.center_ = np.nanmedian(X, axis=0)
            q_min = np.nanpercentile(X, q=self.quantile_range[0], axis=0)
            q_max = np.nanpercentile(X, q=self.quantile_range[1], axis=0)
            self.scale_ = q_max - q_min
        return self

    def _transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """
        Scale ``X`` using the fitted center and scale: ``(X - center_) / scale_``.

        Observations that scale to NaN (e.g. a constant attribute whose scale
        is 0, giving 0/0) are returned unchanged. ``y`` is passed through
        untouched.

        Raises
        ------
        NotFittedError
            If ``fit`` was not called before ``transform``.
        AttributeError
            If the number of attributes in ``X`` differs from the fitted data.
        """
        if not (hasattr(self, 'center_') and hasattr(self, 'scale_')):
            raise NotFittedError(f'Call `fit` before using transform on {str(self)}')
        # The number of attributes is 1 for a univariate series, otherwise the
        # number of columns (not X.shape[0], which counts samples).
        n_attributes = 1 if len(X.shape) == 1 else X.shape[1]
        if n_attributes != self.center_.shape[0]:
            raise AttributeError(f'Trying to robust scale a time series with {n_attributes} attributes while it was fitted on {self.center_.shape[0]} attributes!')

        X_ = (X - self.center_) / self.scale_
        # Keep the original value wherever scaling produced NaN.
        return np.where(np.isnan(X_), X, X_), y
0 commit comments