|
10 | 10 | from visualizeFit import visualizeFit
|
11 | 11 | from show import show
|
12 | 12 |
|
13 |
| -## Machine Learning Online Class |
| 13 | +# Machine Learning Online Class |
14 | 14 | # Exercise 8 | Anomaly Detection and Collaborative Filtering
|
15 | 15 | #
|
16 | 16 | # Instructions
|
|
25 | 25 | #
|
26 | 26 | # For this exercise, you will not need to change any code in this file,
|
27 | 27 | # or any other files other than those mentioned above.
|
28 |
| -# |
29 | 28 |
|
30 |
| -## ================== Part 1: Load Example Dataset =================== |
| 29 | + |
| 30 | +# ================== Part 1: Load Example Dataset =================== |
31 | 31 | # We start this exercise by using a small dataset that is easy to
|
32 | 32 | # visualize.
|
33 | 33 | #
|
34 | 34 | # Our example case consists of 2 network server statistics across
|
35 | 35 | # several machines: the latency and throughput of each machine.
|
36 | 36 | # This exercise will help us find possibly faulty (or very fast) machines.
|
37 |
| -# |
38 | 37 |
|
print('Visualizing example dataset for outlier detection.')

# Load the first example dataset; this populates the variables
# X, Xval, yval used by the rest of the script.
dataset = scipy.io.loadmat('ex8data1.mat')
X = dataset['X']
Xval = dataset['Xval']
yval = dataset['yval'].flatten()

# Scatter-plot the two server statistics (latency vs. throughput)
plt.plot(X[:, 0], X[:, 1], 'bx')
plt.axis([0, 30, 0, 30])
plt.xlabel('Latency (ms)')
|
|
54 | 53 | input('Program paused. Press Enter to continue...')
|
55 | 54 |
|
56 | 55 |
|
57 |
| -## ================== Part 2: Estimate the dataset statistics =================== |
| 56 | +# ================== Part 2: Estimate the dataset statistics =================== |
58 | 57 | # For this exercise, we assume a Gaussian distribution for the dataset.
|
59 | 58 | #
|
60 | 59 | # We first estimate the parameters of our assumed Gaussian distribution,
|
61 | 60 | # then compute the probabilities for each of the points and then visualize
|
62 | 61 | # both the overall distribution and where each of the points falls in
|
63 | 62 | # terms of that distribution.
|
64 |
| -# |
| 63 | + |
65 | 64 | print('Visualizing Gaussian fit.')
|
66 | 65 |
|
67 |
| -# Estimate my and sigma2 |
| 66 | +# Estimate mu and sigma2 |
68 | 67 | mu, sigma2 = estimateGaussian(X)
|
69 | 68 |
|
70 |
| -# Returns the density of the multivariate normal at each data point (row) |
71 |
| -# of X |
| 69 | +# Returns the density of the multivariate normal at each data point (row) |
| 70 | +# of X |
72 | 71 | p = multivariateGaussian(X, mu, sigma2)
|
73 | 72 |
|
74 |
| -# Visualize the fit |
| 73 | +# Visualize the fit |
75 | 74 | visualizeFit(X, mu, sigma2)
|
76 | 75 | plt.xlabel('Latency (ms)')
|
77 | 76 | plt.ylabel('Throughput (mb/s)')
|
78 | 77 | show()
|
79 | 78 |
|
80 | 79 | input('Program paused. Press Enter to continue...')
|
81 | 80 |
|
82 |
| -## ================== Part 3: Find Outliers =================== |
| 81 | +# ================== Part 3: Find Outliers =================== |
83 | 82 | # Now you will find a good epsilon threshold using a cross-validation set
|
84 | 83 | # probabilities given the estimated Gaussian distribution
|
85 |
| -# |
86 | 84 |
|
87 | 85 | pval = multivariateGaussian(Xval, mu, sigma2)
|
88 | 86 |
|
|
print('Best F1 on Cross Validation Set: %f' % F1)
print(' (you should see a value epsilon of about 8.99e-05)')

# Find the outliers in the training set and plot them.
# The comparison already yields a boolean mask, so the redundant
# np.where(cond, True, False) wrapper is unnecessary.
outliers = p < epsilon

# Draw a red circle around those outliers
plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2,
         markersize=10, fillstyle='none', markeredgewidth=1)
show()

input('Program paused. Press Enter to continue...')
|
102 | 101 |
|
103 |
| -## ================== Part 4: Multidimensional Outliers =================== |
| 102 | +# ================== Part 4: Multidimensional Outliers =================== |
104 | 103 | # We will now use the code from the previous part and apply it to a
|
105 | 104 | # harder problem in which more features describe each datapoint and only
|
106 | 105 | # some features indicate whether a point is an outlier.
|
107 |
| -# |
108 | 106 |
|
109 |
| -# Loads the second dataset. You should now have the |
110 |
| -# variables X, Xval, yval in your environment |
| 107 | +# Loads the second dataset. You should now have the |
| 108 | +# variables X, Xval, yval in your environment |
111 | 109 | data = scipy.io.loadmat('ex8data2.mat')
|
112 | 110 | X = data['X']
|
113 | 111 | Xval = data['Xval']
|
114 | 112 | yval = data['yval'].flatten()
|
115 | 113 |
|
116 |
| -# Apply the same steps to the larger dataset |
| 114 | +# Apply the same steps to the larger dataset |
117 | 115 | mu, sigma2 = estimateGaussian(X)
|
118 | 116 |
|
119 |
| -# Training set |
| 117 | +# Training set |
120 | 118 | p = multivariateGaussian(X, mu, sigma2)
|
121 | 119 |
|
122 |
| -# Cross-validation set |
| 120 | +# Cross-validation set |
123 | 121 | pval = multivariateGaussian(Xval, mu, sigma2)
|
124 | 122 |
|
125 |
| -# Find the best threshold |
| 123 | +# Find the best threshold |
126 | 124 | epsilon, F1 = selectThreshold(yval, pval)
|
127 | 125 |
|
128 | 126 | print('Best epsilon found using cross-validation: %e' % epsilon)
|
|
0 commit comments