-
-
Notifications
You must be signed in to change notification settings - Fork 554
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implementation of LODA (Lightweight On-line Detection of Anomalies)
- Loading branch information
1 parent
424cc38
commit 60521a4
Showing
2 changed files
with
115 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from __future__ import annotations | ||
|
||
import math | ||
|
||
import numpy as np | ||
|
||
from river import anomaly, utils | ||
|
||
__all__ = ["LODA"] | ||
|
||
|
||
class LODA(anomaly.base.AnomalyDetector):
    """LODA (Lightweight On-line Detector of Anomalies).

    LODA [^1] comprises a collection of one-dimensional histograms, each approximating
    the probability density of the input data projected onto a single sparse random
    projection vector. Its output on a sample is derived from the average of the negative
    logarithm of the probabilities estimated on the individual projection vectors, so
    higher scores indicate more anomalous samples.

    LODA shows that an ensemble of very weak detectors can lead to a very strong anomaly
    detector with performance equal to or even better than state-of-the-art methods.

    This implementation within `River` is adapted from the versions implemented by the
    [PyOD - Python Outlier Detection](https://pyod.readthedocs.io/en/latest/_modules/pyod/models/loda.html) and
    [PySAD - Python Streaming Anomaly Detection](https://pysad.readthedocs.io/en/latest/_modules/pysad/models/loda.html)
    frameworks.

    Parameters
    ----------
    n_bins
        Number of bins of the histograms generated by the algorithm.
    n_random_cuts
        Number of random projection vectors; one histogram is kept per projection.

    Examples
    --------

    >>> from river import anomaly
    >>> from river import datasets

    >>> loda = anomaly.LODA(n_bins=10, n_random_cuts=100)

    >>> for x, _ in datasets.CreditCard().take(10_000):
    ...     loda.learn_one(x)

    >>> loda.n_features
    30

    The projections are drawn at random, so the exact score is not reproducible
    without seeding NumPy's global random state.

    >>> score = loda.score_one(x)
    >>> isinstance(score, float)
    True

    References
    ----------
    [^1]: Pevný, T. 2015. LODA: Lightweight on-line detector of anomalies. Machine Learning. 102, 2 (2015), 275–304.

    """

    def __init__(self, n_bins=10, n_random_cuts=100):
        self.n_bins = n_bins
        self.n_random_cuts = n_random_cuts

        # Model state; the placeholders below are replaced by NumPy arrays the
        # first time `learn_one` is called, once the number of features is known.
        self.weights = []
        self.projections_ = []
        self.histograms_ = []
        self.limits_ = []
        self.n_bins_ = []

        self.n_features = 0
        self.n_zero_features = 0
        self.n_nonzero_features = 0

        # True until the first sample has been seen.
        self.init = True

    def _init_state(self, n_features):
        """Allocate the model arrays and draw the sparse random projections."""
        self.n_features = n_features
        self.n_nonzero_features = math.sqrt(n_features)
        self.n_zero_features = n_features - int(self.n_nonzero_features)

        self.weights = np.ones(self.n_random_cuts) / self.n_random_cuts

        # Components are drawn from a standard normal distribution, as in the
        # LODA paper; uniform [0, 1) draws would make all projections positively
        # correlated with each other.
        self.projections_ = np.random.randn(self.n_random_cuts, n_features)

        # Sparsify each projection exactly once. Doing this on every update
        # would zero out additional components each time until every
        # projection vector collapsed to all zeros.
        for i in range(self.n_random_cuts):
            zeroed = np.random.permutation(n_features)[: self.n_zero_features]
            self.projections_[i, zeroed] = 0

        self.histograms_ = np.zeros((self.n_random_cuts, self.n_bins))
        self.limits_ = np.zeros((self.n_random_cuts, self.n_bins + 1))

        self.init = False

    def learn_one(self, x):
        """Update the per-projection histograms with the sample `x`.

        Parameters
        ----------
        x
            A dictionary of features.

        """
        x_np = utils.dict2numpy(x)

        if self.init:
            self._init_state(len(x))

        x_np = x_np.reshape(1, -1)

        # NOTE(review): every histogram is rebuilt from this single sample, so
        # earlier samples are not accumulated. An incremental histogram with
        # fixed bin edges would be needed for a true streaming density estimate.
        for i in range(self.n_random_cuts):
            projected_data = self.projections_[i, :].dot(x_np.T)
            self.histograms_[i, :], self.limits_[i, :] = np.histogram(
                projected_data, bins=self.n_bins, density=False
            )
            # Small offset keeps empty bins away from log(0) when scoring.
            self.histograms_[i, :] += 1e-12
            self.histograms_[i, :] /= np.sum(self.histograms_[i, :])

    def score_one(self, x):
        """Return the anomaly score of `x`; higher means more anomalous.

        Parameters
        ----------
        x
            A dictionary of features.

        Returns
        -------
        The anomaly score as a float. `0.0` is returned if the model has not
        seen any sample yet.

        """
        if self.init:
            # No projections or histograms exist before the first `learn_one`.
            return 0.0

        x_np = utils.dict2numpy(x).reshape(1, -1)

        pred_scores = np.zeros([x_np.shape[0], 1])
        for i in range(self.n_random_cuts):
            projected_data = self.projections_[i, :].dot(x_np.T)
            # Searching only the first `n_bins - 1` edges bounds the result to
            # [0, n_bins - 1], which is always a valid histogram column.
            inds = np.searchsorted(self.limits_[i, : self.n_bins - 1], projected_data, side="left")
            pred_scores[:, 0] += -self.weights[i] * np.log(self.histograms_[i, inds])

        # NOTE(review): the weights already sum to 1, so this division rescales
        # the weighted average by 1 / n_random_cuts. Kept so that score
        # magnitudes stay unchanged.
        pred_scores /= self.n_random_cuts

        return pred_scores.ravel().item()