Skip to content

Commit 1e17b41

Browse files
committed
Update ex8
1 parent 1466951 commit 1e17b41

File tree

9 files changed

+101
-106
lines changed

9 files changed

+101
-106
lines changed

ex8/checkCostFunction.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
23
from ex4.computeNumericalGradient import computeNumericalGradient
34
from cofiCostFunc import cofiCostFunc
45

@@ -11,7 +12,7 @@ def checkCostFunction(Lambda=0):
1112
computations should result in very similar values.
1213
"""
1314

14-
## Create small problem
15+
# Create small problem
1516
X_t = np.random.rand(4, 3)
1617
Theta_t = np.random.rand(5, 3)
1718

@@ -21,14 +22,14 @@ def checkCostFunction(Lambda=0):
2122
R = np.zeros(Y.shape)
2223
R[np.where(Y != 0, True, False)] = 1
2324

24-
## Run Gradient Checking
25+
# Run Gradient Checking
2526
X = np.random.random_sample(X_t.shape)
2627
Theta = np.random.random_sample(Theta_t.shape)
2728
num_users = Y.shape[1]
2829
num_movies = Y.shape[0]
2930
num_features = Theta_t.shape[1]
3031

31-
# Unroll parameters
32+
# Unroll parameters
3233
params = np.hstack((X.T.flatten(), Theta.T.flatten()))
3334

3435
costFunc = lambda t: cofiCostFunc(t, Y, R, num_users, num_movies, num_features, Lambda)
@@ -41,15 +42,14 @@ def costFunc_w(t):
4142

4243
cost, grad = cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda)
4344

44-
4545
print(np.column_stack((numgrad, grad)))
4646

47-
print('The above two columns you get should be very similar.\n' \
48-
'(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')
47+
print('The above two columns you get should be very similar.\n'
48+
'(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')
4949

5050
diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
5151

52-
print('If your backpropagation implementation is correct, then\n ' \
53-
'the relative difference will be small (less than 1e-9). \n' \
52+
print('If your backpropagation implementation is correct, then\n '
53+
'the relative difference will be small (less than 1e-9). \n'
5454
'\nRelative Difference: %g\n' % diff)
5555

ex8/cofiCostFunc.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22

33

44
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda):
5-
"""returns the cost and gradient for the
5+
"""returns the cost and gradient for the collaborative filtering problem.
66
"""
77

88
# Unfold the X and Theta matrices from params
99
X = np.array(params[:num_movies*num_features]).reshape(num_features, num_movies).T.copy()
1010
Theta = np.array(params[num_movies*num_features:]).reshape(num_features, num_users).T.copy()
1111

12-
1312
# You need to return the following values correctly
1413
J = 0
1514
X_grad = np.zeros(X.shape)
@@ -38,6 +37,6 @@ def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda):
3837
# partial derivatives w.r.t. each element of Theta
3938
# =============================================================
4039

41-
grad = np.hstack((X_grad.T.flatten(),Theta_grad.T.flatten()))
40+
grad = np.hstack((X_grad.T.flatten(), Theta_grad.T.flatten()))
4241

4342
return J, grad

ex8/estimateGaussian.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ def estimateGaussian(X):
99
The output is an n-dimensional vector mu, the mean of the data set
1010
and the variances sigma^2, an n x 1 vector
1111
"""
12+
1213
m = len(X)
1314

1415
# ====================== YOUR CODE HERE ======================

ex8/ex8.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from visualizeFit import visualizeFit
1111
from show import show
1212

13-
## Machine Learning Online Class
13+
# Machine Learning Online Class
1414
# Exercise 8 | Anomaly Detection and Collaborative Filtering
1515
#
1616
# Instructions
@@ -25,27 +25,26 @@
2525
#
2626
# For this exercise, you will not need to change any code in this file,
2727
# or any other files other than those mentioned above.
28-
#
2928

30-
## ================== Part 1: Load Example Dataset ===================
29+
30+
# ================== Part 1: Load Example Dataset ===================
3131
# We start this exercise by using a small dataset that is easy to
3232
# visualize.
3333
#
3434
# Our example case consists of 2 network server statistics across
3535
# several machines: the latency and throughput of each machine.
3636
# This exercise will help us find possibly faulty (or very fast) machines.
37-
#
3837

3938
print('Visualizing example dataset for outlier detection.')
4039

41-
# The following command loads the dataset. You should now have the
42-
# variables X, Xval, yval in your environment
40+
# The following command loads the dataset. You should now have the
41+
# variables X, Xval, yval in your environment
4342
data = scipy.io.loadmat('ex8data1.mat')
4443
X = data['X']
4544
Xval = data['Xval']
4645
yval = data['yval'].flatten()
4746

48-
# Visualize the example dataset
47+
# Visualize the example dataset
4948
plt.plot(X[:, 0], X[:, 1], 'bx')
5049
plt.axis([0, 30, 0, 30])
5150
plt.xlabel('Latency (ms)')
@@ -54,35 +53,34 @@
5453
input('Program paused. Press Enter to continue...')
5554

5655

57-
## ================== Part 2: Estimate the dataset statistics ===================
56+
# ================== Part 2: Estimate the dataset statistics ===================
5857
# For this exercise, we assume a Gaussian distribution for the dataset.
5958
#
6059
# We first estimate the parameters of our assumed Gaussian distribution,
6160
# then compute the probabilities for each of the points and then visualize
6261
# both the overall distribution and where each of the points falls in
6362
# terms of that distribution.
64-
#
63+
6564
print('Visualizing Gaussian fit.')
6665

67-
# Estimate my and sigma2
66+
# Estimate mu and sigma2
6867
mu, sigma2 = estimateGaussian(X)
6968

70-
# Returns the density of the multivariate normal at each data point (row)
71-
# of X
69+
# Returns the density of the multivariate normal at each data point (row)
70+
# of X
7271
p = multivariateGaussian(X, mu, sigma2)
7372

74-
# Visualize the fit
73+
# Visualize the fit
7574
visualizeFit(X, mu, sigma2)
7675
plt.xlabel('Latency (ms)')
7776
plt.ylabel('Throughput (mb/s)')
7877
show()
7978

8079
input('Program paused. Press Enter to continue...')
8180

82-
## ================== Part 3: Find Outliers ===================
81+
# ================== Part 3: Find Outliers ===================
8382
# Now you will find a good epsilon threshold using the cross-validation set
8483
# probabilities given the estimated Gaussian distribution
85-
#
8684

8785
pval = multivariateGaussian(Xval, mu, sigma2)
8886

@@ -91,38 +89,38 @@
9189
print('Best F1 on Cross Validation Set: %f' % F1)
9290
print(' (you should see a value epsilon of about 8.99e-05)')
9391

94-
# Find the outliers in the training set and plot the
92+
# Find the outliers in the training set and plot them
9593
outliers = np.where(p < epsilon, True, False)
9694

97-
# Draw a red circle around those outliers
98-
plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1)
95+
# Draw a red circle around those outliers
96+
plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2,
97+
markersize=10, fillstyle='none', markeredgewidth=1)
9998
show()
10099

101100
input('Program paused. Press Enter to continue...')
102101

103-
## ================== Part 4: Multidimensional Outliers ===================
102+
# ================== Part 4: Multidimensional Outliers ===================
104103
# We will now use the code from the previous part and apply it to a
105104
# harder problem in which more features describe each datapoint and only
106105
# some features indicate whether a point is an outlier.
107-
#
108106

109-
# Loads the second dataset. You should now have the
110-
# variables X, Xval, yval in your environment
107+
# Loads the second dataset. You should now have the
108+
# variables X, Xval, yval in your environment
111109
data = scipy.io.loadmat('ex8data2.mat')
112110
X = data['X']
113111
Xval = data['Xval']
114112
yval = data['yval'].flatten()
115113

116-
# Apply the same steps to the larger dataset
114+
# Apply the same steps to the larger dataset
117115
mu, sigma2 = estimateGaussian(X)
118116

119-
# Training set
117+
# Training set
120118
p = multivariateGaussian(X, mu, sigma2)
121119

122-
# Cross-validation set
120+
# Cross-validation set
123121
pval = multivariateGaussian(Xval, mu, sigma2)
124122

125-
# Find the best threshold
123+
# Find the best threshold
126124
epsilon, F1 = selectThreshold(yval, pval)
127125

128126
print('Best epsilon found using cross-validation: %e' % epsilon)

0 commit comments

Comments
 (0)