diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2358084 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.pyc +.idea +iterate.dat +.DS_Store +*~ +token.txt +.git_ diff --git a/Submission.py b/Submission.py new file mode 100644 index 0000000..897ab52 --- /dev/null +++ b/Submission.py @@ -0,0 +1,98 @@ +from urllib import urlencode +from urllib2 import urlopen +from json import loads, dumps +from collections import OrderedDict +import numpy as np +import os + +from getpass import getpass + + +class Submission(): + + def __init__(self, homework, part_names, srcs, output): + self.__homework = homework + self.__part_names = part_names + self.__srcs = srcs + self.__output = output + self.__submit_url = 'https://www-origin.coursera.org/api/onDemandProgrammingImmediateFormSubmissions.v1' + self.__login = None + self.__password = None + + def submit(self): + print '==\n== Submitting Solutions | Programming Exercise %s\n==' % self.__homework + self.login_prompt() + + parts = OrderedDict() + for part_id, _ in enumerate(self.__srcs,1): + parts[str(part_id)] = {'output': self.__output(part_id)} + + result, response = self.request(parts) + + response = loads(response) + print '==' + print '== %43s | %9s | %-s' % ('Part Name', 'Score', 'Feedback') + print '== %43s | %9s | %-s' % ('---------', '-----', '--------') + for part in parts: + partFeedback = response['partFeedbacks'][part] + partEvaluation = response['partEvaluations'][part] + score = '%d / %3d' % (partEvaluation['score'], partEvaluation['maxScore']) + print '== %43s | %9s | %-s' % (self.__part_names[int(part)-1], score, partFeedback) + + evaluation = response['evaluation'] + totalScore = '%d / %d' % (evaluation['score'], evaluation['maxScore']) + print '== --------------------------------' + print '== %43s | %9s | %-s\n' % (' ', totalScore, ' ') + print '==' + + if not os.path.isfile('token.txt'): + with open('token.txt', 'w') as f: + f.write(self.__login + '\n') + f.writelines(self.__password) + + + def login_prompt(self): + try: + with open('token.txt', 'r') as f: + self.__login = f.readline().strip() + self.__password = f.readline().strip() + except IOError: + pass + + if self.__login is not None and self.__password is not None: + reenter = raw_input('Use token from last successful submission (%s)? 
(Y/n): ' % self.__login) + + if reenter == '' or reenter[0] == 'Y' or reenter[0] == 'y': + return + + if os.path.isfile('token.txt'): + os.remove('token.txt') + self.__login = raw_input('login (Email address): ') + self.__password = getpass('Password: ') + + def request(self, parts): + + params = { + 'assignmentSlug': self.__homework, + 'secret': self.__password, + 'parts': parts, + 'submitterEmail': self.__login} + + params = urlencode({'jsonBody': dumps(params)}) + f = urlopen(self.__submit_url, params) + try: + return 0, f.read() + finally: + f.close() + +def sprintf(fmt, arg): + "emulates (part of) Octave sprintf function" + if isinstance(arg, tuple): + # for multiple return values, only use the first one + arg = arg[0] + + if isinstance(arg, (np.ndarray, list)): + # concatenates all elements, column by column + return ' '.join(fmt % e for e in np.asarray(arg).ravel('F')) + else: + return fmt % arg \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ex1/__init__.py b/ex1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ex1/computeCost.py b/ex1/computeCost.py new file mode 100644 index 0000000..2db3ff2 --- /dev/null +++ b/ex1/computeCost.py @@ -0,0 +1,20 @@ +import numpy as np + +def computeCost(X, y, theta): + """ + computes the cost of using theta as the parameter for linear + regression to fit the data points in X and y + """ + m = y.size + J = 0 + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost of a particular choice of theta +# You should set J to the cost. + + +# ========================================================================= + + return J + + diff --git a/ex1/computeCostMulti.py b/ex1/computeCostMulti.py new file mode 100644 index 0000000..b98155b --- /dev/null +++ b/ex1/computeCostMulti.py @@ -0,0 +1,16 @@ +def computeCostMulti(X, y, theta): + """ + Compute cost for linear regression with multiple variables + J = computeCost(X, y, theta) computes the cost of using theta as the + parameter for linear regression to fit the data points in X and y + """ + m = y.size + J = 0 +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost of a particular choice of theta +# You should set J to the cost. + + +# ========================================================================= + + return J diff --git a/ex1/ex1.py b/ex1/ex1.py new file mode 100644 index 0000000..7cdb995 --- /dev/null +++ b/ex1/ex1.py @@ -0,0 +1,161 @@ +from matplotlib import use, cm +use('TkAgg') +import matplotlib.pyplot as plt +import numpy as np +from mpl_toolkits.mplot3d import axes3d +from sklearn import linear_model + +from gradientDescent import gradientDescent +from computeCost import computeCost +from warmUpExercise import warmUpExercise +from plotData import plotData +from show import show + +## Machine Learning Online Class - Exercise 1: Linear Regression + +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# linear exercise. You will need to complete the following modules +# in this exericse: +# +# warmUpExercise.py +# plotData.py +# gradientDescent.py +# computeCost.py +# gradientDescentMulti.py +# computeCostMulti.py +# featureNormalize.py +# normalEqn.py +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. 
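+#
+#  (Orientation note, added only as a reminder of the maths; the files listed
+#   above still expect your own implementations. Throughout ex1 the hypothesis
+#   is h(x) = theta' * x, and the cost you write in computeCost.py is
+#       J(theta) = 1/(2*m) * sum_i (h(x_i) - y_i)^2 .)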
+# +# x refers to the population size in 10,000s +# y refers to the profit in $10,000s + +# ==================== Part 1: Basic Function ==================== +# Complete warmUpExercise.py +print 'Running warmUpExercise ...' +print '5x5 Identity Matrix:' +warmup = warmUpExercise() +print warmup +raw_input("Program paused. Press Enter to continue...") + +# ======================= Part 2: Plotting ======================= +data = np.loadtxt('ex1data1.txt', delimiter=',') +m = data.shape[0] +X = np.vstack(zip(np.ones(m),data[:,0])) +y = data[:, 1] + +# Plot Data +# Note: You have to complete the code in plotData.py +print 'Plotting Data ...' +plotData(data) +show() + +raw_input("Program paused. Press Enter to continue...") + +# =================== Part 3: Gradient descent =================== +print 'Running Gradient Descent ...' +theta = np.zeros(2) + +# compute and display initial cost +J = computeCost(X, y, theta) +print 'cost: %0.4f ' % J + +# Some gradient descent settings +iterations = 1500 +alpha = 0.01 + +# run gradient descent +theta, J_history = gradientDescent(X, y, theta, alpha, iterations) + +# print theta to screen +print 'Theta found by gradient descent: ' +print '%s %s \n' % (theta[0], theta[1]) + +# Plot the linear fit +plt.figure() +plotData(data) +plt.plot(X[:, 1], X.dot(theta), '-', label='Linear regression') +plt.legend(loc='upper right', shadow=True, fontsize='x-large', numpoints=1) +show() + +raw_input("Program paused. Press Enter to continue...") + +# Predict values for population sizes of 35,000 and 70,000 +predict1 = np.array([1, 3.5]).dot(theta) +predict2 = np.array([1, 7]).dot(theta) +print 'For population = 35,000, we predict a profit of {:.4f}'.format(predict1*10000) +print 'For population = 70,000, we predict a profit of {:.4f}'.format(predict2*10000) + +# ============= Part 4: Visualizing J(theta_0, theta_1) ============= +print 'Visualizing J(theta_0, theta_1) ...' + +# Grid over which we will calculate J +theta0_vals = np.linspace(-10, 10, X.shape[0]) +theta1_vals = np.linspace(-1, 4, X.shape[0]) + +# initialize J_vals to a matrix of 0's +J_vals=np.array(np.zeros(X.shape[0]).T) + +for i in range(theta0_vals.size): + col = [] + for j in range(theta1_vals.size): + t = np.array([theta0_vals[i],theta1_vals[j]]) + col.append(computeCost(X, y, t.T)) + J_vals=np.column_stack((J_vals,col)) + +# Because of the way meshgrids work in the surf command, we need to +# transpose J_vals before calling surf, or else the axes will be flipped +J_vals = J_vals[:,1:].T +theta0_vals, theta1_vals = np.meshgrid(theta0_vals, theta1_vals) + +# Surface plot +fig = plt.figure() +ax = fig.gca(projection='3d') +ax.plot_surface(theta0_vals, theta1_vals, J_vals, rstride=8, cstride=8, alpha=0.3, + cmap=cm.coolwarm, linewidth=0, antialiased=False) +ax.set_xlabel(r'$\theta_0$') +ax.set_ylabel(r'$\theta_1$') +ax.set_zlabel(r'J($\theta$)') +show() + +raw_input("Program paused. Press Enter to continue...") + +# Contour plot +plt.figure() + +# Plot J_vals as 15 contours spaced logarithmically between 0.01 and 100 +ax = plt.contour(theta0_vals, theta1_vals, J_vals, np.logspace(-2, 3, 20)) +plt.clabel(ax, inline=1, fontsize=10) +plt.xlabel(r'$\theta_0$') +plt.ylabel(r'$\theta_1$') +plt.plot(0.0, 0.0, 'rx', linewidth=2, markersize=10) +show() + +raw_input("Program paused. 
Press Enter to continue...") + +# =============Use Scikit-learn ============= +regr = linear_model.LinearRegression(fit_intercept=False, normalize=True) +regr.fit(X, y) + +print 'Theta found by scikit: ' +print '%s %s \n' % (regr.coef_[0], regr.coef_[1]) + +predict1 = np.array([1, 3.5]).dot(regr.coef_) +predict2 = np.array([1, 7]).dot(regr.coef_) +print 'For population = 35,000, we predict a profit of {:.4f}'.format(predict1*10000) +print 'For population = 70,000, we predict a profit of {:.4f}'.format(predict2*10000) + +plt.figure() +plotData(data) +plt.plot(X[:, 1], X.dot(regr.coef_), '-', color='black', label='Linear regression wit scikit') +plt.legend(loc='upper right', shadow=True, fontsize='x-large', numpoints=1) +show() + +raw_input("Program paused. Press Enter to continue...") + + diff --git a/ex1/ex1_multi.py b/ex1/ex1_multi.py new file mode 100644 index 0000000..e635b09 --- /dev/null +++ b/ex1/ex1_multi.py @@ -0,0 +1,129 @@ +from matplotlib import use +use('TkAgg') +import numpy as np +import matplotlib.pyplot as plt + +from gradientDescentMulti import gradientDescentMulti +from normalEqn import normalEqn +from featureNormalize import featureNormalize +from show import show +# ================ Part 1: Feature Normalization ================ + +print 'Loading data ...' + +# Load Data +data = np.loadtxt('ex1data2.txt', delimiter=',') +X = data[:, :2] +y = data[:, 2] +m = y.T.size + + +# Print out some data points +print 'First 10 examples from the dataset:' +print np.column_stack( (X[:10], y[:10]) ) +raw_input("Program paused. Press Enter to continue...") + +# Scale features and set them to zero mean +print 'Normalizing Features ...' + +X, mu, sigma = featureNormalize(X) +print '[mu] [sigma]' +print mu, sigma + +# Add intercept term to X +X = np.concatenate((np.ones((m, 1)), X), axis=1) + + +# ================ Part 2: Gradient Descent ================ +# +# ====================== YOUR CODE HERE ====================== +# Instructions: We have provided you with the following starter +# code that runs gradient descent with a particular +# learning rate (alpha). +# +# Your task is to first make sure that your functions - +# computeCost and gradientDescent already work with +# this starter code and support multiple variables. +# +# After that, try running gradient descent with +# different values of alpha and see which one gives +# you the best result. +# +# Finally, you should complete the code at the end +# to predict the price of a 1650 sq-ft, 3 br house. +# +# Hint: By using the 'hold on' command, you can plot multiple +# graphs on the same figure. +# +# Hint: At prediction, make sure you do the same feature normalization. +# + +print 'Running gradient descent ...' + +# Choose some alpha value +alpha = 0.01 +num_iters = 400 + +# Init Theta and Run Gradient Descent +theta = np.zeros(3) +theta, J_history = gradientDescentMulti(X, y, theta, alpha, num_iters) + +# Plot the convergence graph +plt.plot(J_history, '-b') +plt.xlabel('Number of iterations') +plt.ylabel('Cost J') +show() +raw_input("Program paused. Press Enter to continue...") + +# Display gradient descent's result +print 'Theta computed from gradient descent: ' +print theta + +# Estimate the price of a 1650 sq-ft, 3 br house +price = np.array([1,3,1650]).dot(theta) + +print 'Predicted price of a 1650 sq-ft, 3 br house' +print '(using gradient descent): ' +print price + +raw_input("Program paused. 
Press Enter to continue...") + +# ================ Part 3: Normal Equations ================ + +# ====================== YOUR CODE HERE ====================== +# Instructions: The following code computes the closed form +# solution for linear regression using the normal +# equations. You should complete the code in +# normalEqn.m +# +# After doing so, you should complete this code +# to predict the price of a 1650 sq-ft, 3 br house. +# + +print 'Solving with normal equations...' + +# Load Data +data = np.loadtxt('ex1data2.txt', delimiter=',') +X = data[:, :2] +y = data[:, 2] +m = y.T.size + +# Add intercept term to X +X = np.concatenate((np.ones((m,1)), X), axis=1) + +# Calculate the parameters from the normal equation +theta = normalEqn(X, y) + +# Display normal equation's result +print 'Theta computed from the normal equations:' +print ' %s \n' % theta + +# Estimate the price of a 1650 sq-ft, 3 br house +price = np.array([1, 3, 1650]).dot(theta) + +# ============================================================ + +print "Predicted price of a 1650 sq-ft, 3 br house " +print '(using normal equations):\n $%f\n' % price + +raw_input("Program paused. Press Enter to continue...") diff --git a/ex1/ex1data1.txt b/ex1/ex1data1.txt new file mode 100644 index 0000000..0f88ccb --- /dev/null +++ b/ex1/ex1data1.txt @@ -0,0 +1,97 @@ +6.1101,17.592 +5.5277,9.1302 +8.5186,13.662 +7.0032,11.854 +5.8598,6.8233 +8.3829,11.886 +7.4764,4.3483 +8.5781,12 +6.4862,6.5987 +5.0546,3.8166 +5.7107,3.2522 +14.164,15.505 +5.734,3.1551 +8.4084,7.2258 +5.6407,0.71618 +5.3794,3.5129 +6.3654,5.3048 +5.1301,0.56077 +6.4296,3.6518 +7.0708,5.3893 +6.1891,3.1386 +20.27,21.767 +5.4901,4.263 +6.3261,5.1875 +5.5649,3.0825 +18.945,22.638 +12.828,13.501 +10.957,7.0467 +13.176,14.692 +22.203,24.147 +5.2524,-1.22 +6.5894,5.9966 +9.2482,12.134 +5.8918,1.8495 +8.2111,6.5426 +7.9334,4.5623 +8.0959,4.1164 +5.6063,3.3928 +12.836,10.117 +6.3534,5.4974 +5.4069,0.55657 +6.8825,3.9115 +11.708,5.3854 +5.7737,2.4406 +7.8247,6.7318 +7.0931,1.0463 +5.0702,5.1337 +5.8014,1.844 +11.7,8.0043 +5.5416,1.0179 +7.5402,6.7504 +5.3077,1.8396 +7.4239,4.2885 +7.6031,4.9981 +6.3328,1.4233 +6.3589,-1.4211 +6.2742,2.4756 +5.6397,4.6042 +9.3102,3.9624 +9.4536,5.4141 +8.8254,5.1694 +5.1793,-0.74279 +21.279,17.929 +14.908,12.054 +18.959,17.054 +7.2182,4.8852 +8.2951,5.7442 +10.236,7.7754 +5.4994,1.0173 +20.341,20.992 +10.136,6.6799 +7.3345,4.0259 +6.0062,1.2784 +7.2259,3.3411 +5.0269,-2.6807 +6.5479,0.29678 +7.5386,3.8845 +5.0365,5.7014 +10.274,6.7526 +5.1077,2.0576 +5.7292,0.47953 +5.1884,0.20421 +6.3557,0.67861 +9.7687,7.5435 +6.5159,5.3436 +8.5172,4.2415 +9.1802,6.7981 +6.002,0.92695 +5.5204,0.152 +5.0594,2.8214 +5.7077,1.8451 +7.6366,4.2959 +5.8707,7.2029 +5.3054,1.9869 +8.2934,0.14454 +13.394,9.0551 +5.4369,0.61705 diff --git a/ex1/ex1data2.txt b/ex1/ex1data2.txt new file mode 100644 index 0000000..79e9a80 --- /dev/null +++ b/ex1/ex1data2.txt @@ -0,0 +1,47 @@ +2104,3,399900 +1600,3,329900 +2400,3,369000 +1416,2,232000 +3000,4,539900 +1985,4,299900 +1534,3,314900 +1427,3,198999 +1380,3,212000 +1494,3,242500 +1940,4,239999 +2000,3,347000 +1890,3,329999 +4478,5,699900 +1268,3,259900 +2300,4,449900 +1320,2,299900 +1236,3,199900 +2609,4,499998 +3031,4,599000 +1767,3,252900 +1888,2,255000 +1604,3,242900 +1962,4,259900 +3890,3,573900 +1100,3,249900 +1458,3,464500 +2526,3,469000 +2200,3,475000 +2637,3,299900 +1839,2,349900 +1000,1,169900 +2040,4,314900 +3137,3,579900 +1811,4,285900 +1437,3,249900 +1239,3,229900 +2132,4,345000 +4215,4,549000 +2162,4,287000 
+1664,2,368500 +2238,3,329900 +2567,4,314000 +1200,3,299000 +852,2,179900 +1852,4,299900 +1203,3,239500 diff --git a/ex1/featureNormalize.py b/ex1/featureNormalize.py new file mode 100644 index 0000000..6c9fe6b --- /dev/null +++ b/ex1/featureNormalize.py @@ -0,0 +1,31 @@ +import numpy as np + + +def featureNormalize(X): + """ + returns a normalized version of X where + the mean value of each feature is 0 and the standard deviation + is 1. This is often a good preprocessing step to do when + working with learning algorithms. + """ + X_norm, mu, sigma = 0,0,0 + # ====================== YOUR CODE HERE ====================== + # Instructions: First, for each feature dimension, compute the mean + # of the feature and subtract it from the dataset, + # storing the mean value in mu. Next, compute the + # standard deviation of each feature and divide + # each feature by it's standard deviation, storing + # the standard deviation in sigma. + # + # Note that X is a matrix where each column is a + # feature and each row is an example. You need + # to perform the normalization separately for + # each feature. + # + # Hint: You might find the 'mean' and 'std' functions useful. + # + + +# ============================================================ + + return X_norm, mu, sigma diff --git a/ex1/gradientDescent.py b/ex1/gradientDescent.py new file mode 100644 index 0000000..9868a19 --- /dev/null +++ b/ex1/gradientDescent.py @@ -0,0 +1,30 @@ +from computeCost import computeCost + + +def gradientDescent(X, y, theta, alpha, num_iters): + """ + Performs gradient descent to learn theta + theta = gradientDescent(x, y, theta, alpha, num_iters) updates theta by + taking num_iters gradient steps with learning rate alpha + """ + + # Initialize some useful values + J_history = [] + m = y.size # number of training examples + + for i in range(num_iters): + # ====================== YOUR CODE HERE ====================== + # Instructions: Perform a single gradient step on the parameter vector + # theta. + # + # Hint: While debugging, it can be useful to print out the values + # of the cost function (computeCost) and gradient here. + # + + + # ============================================================ + + # Save the cost J in every iteration + J_history.append(computeCost(X, y, theta)) + + return theta, J_history diff --git a/ex1/gradientDescentMulti.py b/ex1/gradientDescentMulti.py new file mode 100644 index 0000000..36301ce --- /dev/null +++ b/ex1/gradientDescentMulti.py @@ -0,0 +1,31 @@ +from computeCostMulti import computeCostMulti + + +def gradientDescentMulti(X, y, theta, alpha, num_iters): + """ + Performs gradient descent to learn theta + theta = gradientDescent(x, y, theta, alpha, num_iters) updates theta by + taking num_iters gradient steps with learning rate alpha + """ + + # Initialize some useful values + J_history = [] + m = y.size # number of training examples + + for i in range(num_iters): + # ====================== YOUR CODE HERE ====================== + # Instructions: Perform a single gradient step on the parameter vector + # theta. + # + # Hint: While debugging, it can be useful to print out the values + # of the cost function (computeCost) and gradient here. 
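+        #
+        # (Illustration only, not the required solution: with X already
+        #  carrying the intercept column of ones, a single vectorized
+        #  gradient step could be written as
+        #      theta = theta - (alpha / m) * X.T.dot(X.dot(theta) - y)
+        #  which updates every component of theta at once.)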
+ # + + + + # ============================================================ + + # Save the cost J in every iteration + J_history.append(computeCostMulti(X, y, theta)) + + return theta, J_history \ No newline at end of file diff --git a/ex1/normalEqn.py b/ex1/normalEqn.py new file mode 100644 index 0000000..dd45fc5 --- /dev/null +++ b/ex1/normalEqn.py @@ -0,0 +1,23 @@ +import numpy as np + + +def normalEqn(X,y): + """ Computes the closed-form solution to linear regression + normalEqn(X,y) computes the closed-form solution to linear + regression using the normal equations. + """ + theta = 0 +# ====================== YOUR CODE HERE ====================== +# Instructions: Complete the code to compute the closed form solution +# to linear regression and put the result in theta. +# + +# ---------------------- Sample Solution ---------------------- + + +# ------------------------------------------------------------- + + return theta + +# ============================================================ + diff --git a/ex1/plotData.py b/ex1/plotData.py new file mode 100644 index 0000000..19fdc74 --- /dev/null +++ b/ex1/plotData.py @@ -0,0 +1,23 @@ +import matplotlib.pyplot as plt +import numpy as np + +def plotData(data): + """ + plots the data points and gives the figure axes labels of + population and profit. + """ + +# ====================== YOUR CODE HERE ====================== +# Instructions: Plot the training data into a figure using the +# "figure" and "plot" commands. Set the axes labels using +# the "xlabel" and "ylabel" commands. Assume the +# population and revenue data have been passed in +# as the x and y arguments of this function. +# +# Hint: You can use the 'rx' option with plot to have the markers +# appear as red crosses. Furthermore, you can make the +# markers larger by using plot(..., 'rx', 'MarkerSize', 10); + + plt.figure() # open a new figure window + +# ============================================================ diff --git a/ex1/submit.py b/ex1/submit.py new file mode 100644 index 0000000..cd58a80 --- /dev/null +++ b/ex1/submit.py @@ -0,0 +1,57 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +__all__ = ['submit'] + +homework = 'linear-regression' + +part_names = [ + 'Warm up exercise', + 'Computing Cost (for one variable)', + 'Gradient Descent (for one variable)', + 'Feature Normalization', + 'Computing Cost (for multiple variables)', + 'Gradient Descent (for multiple variables)', + 'Normal Equations', + ] + +srcs = [ + 'warmUpExercise.py', + 'computeCost.py', + 'gradientDescent.py', + 'featureNormalize.py', + 'computeCostMulti.py', + 'gradientDescentMulti.py', + 'normalEqn.py', + ] + + +def output(part_id): + X1 = np.column_stack((np.ones(20), np.exp(1) + np.exp(2) * np.linspace(0.1, 2, 20))) + Y1 = X1[:,1] + np.sin(X1[:,0]) + np.cos(X1[:,1]) + X2 = np.column_stack((X1, X1[:,1]**0.5, X1[:,1]**0.25)) + Y2 = np.power(Y1, 0.5) + Y1 + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + return sprintf('%0.5f ', func()) + elif part_id == 2: + return sprintf('%0.5f ', func(X1, Y1, np.array([0.5, -0.5]))) + elif part_id == 3: + return sprintf('%0.5f ', func(X1, Y1, np.array([0.5, -0.5]), 0.01, 10)) + elif part_id == 4: + return sprintf('%0.5f ', func(X2[:,1:4])) + elif part_id == 5: + return sprintf('%0.5f ', func(X2, Y2, np.array([0.1, 0.2, 0.3, 0.4]))) + elif part_id == 6: + return sprintf('%0.5f ', func(X2, Y2, np.array([-0.1, -0.2, -0.3, 
-0.4]), 0.01, 10)) + elif part_id == 7: + return sprintf('%0.5f ', func(X2, Y2)) + +s = Submission(homework, part_names, srcs, output) +s.submit() diff --git a/ex1/warmUpExercise.py b/ex1/warmUpExercise.py new file mode 100644 index 0000000..119f9fa --- /dev/null +++ b/ex1/warmUpExercise.py @@ -0,0 +1,15 @@ +from numpy import eye + +def warmUpExercise(): + """ an example function that returns the 5x5 identity matrix + """ + +# ============= YOUR CODE HERE ============== +# Instructions: Return the 5x5 identity matrix +# In octave, we return values by defining which variables +# represent the return values (at the top of the file) +# and then set them accordingly. + + return eye(5) + +# =========================================== diff --git a/ex2/__init__.py b/ex2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ex2/costFunction.py b/ex2/costFunction.py new file mode 100644 index 0000000..efb0ad7 --- /dev/null +++ b/ex2/costFunction.py @@ -0,0 +1,21 @@ +from numpy import log +from sigmoid import sigmoid + +def costFunction(theta, X,y): + """ computes the cost of using theta as the + parameter for logistic regression and the + gradient of the cost w.r.t. to the parameters.""" + +# Initialize some useful values + m = y.size # number of training examples + + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost of a particular choice of theta. +# You should set J to the cost. +# Compute the partial derivatives and set grad to the partial +# derivatives of the cost w.r.t. each parameter in theta +# +# Note: grad should have the same dimensions as theta +# + return J diff --git a/ex2/costFunctionReg.py b/ex2/costFunctionReg.py new file mode 100644 index 0000000..830cd40 --- /dev/null +++ b/ex2/costFunctionReg.py @@ -0,0 +1,22 @@ +from costFunction import costFunction + + +def costFunctionReg(theta, X, y, Lambda): + """ + Compute cost and gradient for logistic regression with regularization + + computes the cost of using theta as the parameter for regularized logistic regression and the + gradient of the cost w.r.t. to the parameters. + """ + # Initialize some useful values + m = len(y) # number of training examples + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost of a particular choice of theta. +# You should set J to the cost. +# Compute the partial derivatives and set grad to the partial +# derivatives of the cost w.r.t. each parameter in theta + +# ============================================================= + + return J diff --git a/ex2/ex2.py b/ex2/ex2.py new file mode 100644 index 0000000..5f9ea2e --- /dev/null +++ b/ex2/ex2.py @@ -0,0 +1,113 @@ +# Logistic Regression +from matplotlib import use + +use('TkAgg') +import matplotlib.pyplot as plt +import numpy as np +from scipy.optimize import minimize + +from costFunction import costFunction +from gradientFunction import gradientFunction +from sigmoid import sigmoid +from predict import predict +from show import show + +## Machine Learning Online Class - Exercise 2: Logistic Regression +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the second part +# of the exercise which covers regularization with logistic regression. 
+# +# You will need to complete the following functions in this exericse: +# +# sigmoid.py +# costFunction.py +# gradientFunction.py +# predict.py +# costFunctionReg.py +# gradientFunctionReg.py +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# + +from ml import plotData, plotDecisionBoundary +# Load Data +# The first two columns contains the exam scores and the third column +# contains the label. + +data = np.loadtxt('ex2data1.txt', delimiter=',') +X = data[:, 0:2] +y = data[:, 2] + +# ==================== Part 1: Plotting ==================== + +print 'Plotting data with + indicating (y = 1) examples and o indicating (y = 0) examples.' + +plotData(X, y) +plt.legend(['Admitted', 'Not admitted'], loc='upper right', shadow=True, fontsize='x-large', numpoints=1) + +plt.xlabel('Exam 1 score') +plt.ylabel('Exam 2 score') +show() +raw_input("Program paused. Press Enter to continue...") + + +# # ============ Part 2: Compute Cost and Gradient ============ +# # Setup the data matrix appropriately, and add ones for the intercept term +m, n = X.shape + +# Add intercept term to x and X_test +X = np.concatenate((np.ones((m, 1)), X), axis=1) + +# Initialize fitting parameters +initial_theta = np.zeros(n + 1) + +# Compute and display initial cost and gradient +cost = costFunction(initial_theta, X, y) +print 'Cost at initial theta (zeros): %f' % cost + +grad = gradientFunction(initial_theta, X, y) +print 'Gradient at initial theta (zeros): ' + str(grad) + +raw_input("Program paused. Press Enter to continue...") + +# ============= Part 3: Optimizing using scipy ============= +res = minimize(costFunction, initial_theta, method='TNC', + jac=False, args=(X, y), options={'gtol': 1e-3, 'disp': True, 'maxiter': 1000}) + +theta = res.x +cost = res.fun + +# Print theta to screen +print 'Cost at theta found by scipy: %f' % cost +print 'theta:', ["%0.4f" % i for i in theta] + +# Plot Boundary +plotDecisionBoundary(theta, X, y) + +# Labels and Legend +plt.legend(['Admitted', 'Not admitted'], loc='upper right', shadow=True, fontsize='x-large', numpoints=1) +plt.xlabel('Exam 1 score') +plt.ylabel('Exam 2 score') +show() + +raw_input("Program paused. Press Enter to continue...") + +# ============== Part 4: Predict and Accuracies ============== + +# Predict probability for a student with score 45 on exam 1 +# and score 85 on exam 2 + +prob = sigmoid(np.array([1, 45, 85]).dot(theta)) +print 'For a student with scores 45 and 85, we predict an admission probability of %f' % prob + +# Compute accuracy on our training set +p = predict(theta, X) +acc = 1.0*np.where(p == y)[0].size/len(p) * 100 +print 'Train Accuracy: %f' % acc + +raw_input("Program paused. 
Press Enter to continue...") + diff --git a/ex2/ex2_reg.py b/ex2/ex2_reg.py new file mode 100644 index 0000000..4b8441a --- /dev/null +++ b/ex2/ex2_reg.py @@ -0,0 +1,112 @@ +# Logistic Regression +from matplotlib import use + +use('TkAgg') +import numpy as np +import matplotlib.pyplot as plt +from scipy.optimize import minimize + +import pandas as pd + +from ml import mapFeature, plotData, plotDecisionBoundary +from show import show +from costFunctionReg import costFunctionReg +from gradientFunctionReg import gradientFunctionReg +from sigmoid import sigmoid + + +def optimize(Lambda): + + result = minimize(costFunctionReg, initial_theta, method='L-BFGS-B', + jac=gradientFunctionReg, args=(X.as_matrix(), y, Lambda), + options={'gtol': 1e-4, 'disp': False, 'maxiter': 1000}) + + return result + + +# Plot Boundary +def plotBoundary(theta, X, y): + plotDecisionBoundary(theta, X.values, y.values) + plt.title(r'$\lambda$ = ' + str(Lambda)) + + # Labels and Legend + plt.xlabel('Microchip Test 1') + plt.ylabel('Microchip Test 2') + show() + + + +# Initialization + +# Load Data +# The first two columns contains the X values and the third column +# contains the label (y). + +data = pd.read_csv('ex2data2.txt', header=None, names=[1,2,3]) +X = data[[1, 2]] +y = data[[3]] + +plotData(X.values, y.values) + +# Labels and Legend +plt.xlabel('Microchip Test 1') +plt.ylabel('Microchip Test 2') +show() +raw_input("Program paused. Press Enter to continue...") + + +# =========== Part 1: Regularized Logistic Regression ============ + +# Add Polynomial Features + +# Note that mapFeature also adds a column of ones for us, so the intercept +# term is handled +X = X.apply(mapFeature, axis=1) + +# Initialize fitting parameters +initial_theta = np.zeros(X.shape[1]) + +# Set regularization parameter lambda to 1 +Lambda = 0.0 + +# Compute and display initial cost and gradient for regularized logistic +# regression +cost = costFunctionReg(initial_theta, X, y, Lambda) + +print 'Cost at initial theta (zeros): %f' % cost + +# ============= Part 2: Regularization and Accuracies ============= + +# Optimize and plot boundary + +Lambda = 1.0 +result = optimize(Lambda) +theta = result.x +cost = result.fun + +# Print to screen +print 'lambda = ' + str(Lambda) +print 'Cost at theta found by scipy: %f' % cost +print 'theta:', ["%0.4f" % i for i in theta] + +raw_input("Program paused. Press Enter to continue...") + +plotBoundary(theta, X, y) + +# Compute accuracy on our training set +p = np.round(sigmoid(X.dot(theta))) +acc = np.mean(np.where(p == y.T,1,0)) * 100 +print 'Train Accuracy: %f' % acc + +raw_input("Program paused. Press Enter to continue...") + +# ============= Part 3: Optional Exercises ============= + + +for Lambda in np.arange(0.0,10.1,1.0): + result = optimize(Lambda) + theta = result.x + print 'lambda = ' + str(Lambda) + print 'theta:', ["%0.4f" % i for i in theta] + plotBoundary(theta, X, y) +raw_input("Program paused. 
Press Enter to continue...") diff --git a/ex2/ex2data1.txt b/ex2/ex2data1.txt new file mode 100644 index 0000000..3a5f952 --- /dev/null +++ b/ex2/ex2data1.txt @@ -0,0 +1,100 @@ +34.62365962451697,78.0246928153624,0 +30.28671076822607,43.89499752400101,0 +35.84740876993872,72.90219802708364,0 +60.18259938620976,86.30855209546826,1 +79.0327360507101,75.3443764369103,1 +45.08327747668339,56.3163717815305,0 +61.10666453684766,96.51142588489624,1 +75.02474556738889,46.55401354116538,1 +76.09878670226257,87.42056971926803,1 +84.43281996120035,43.53339331072109,1 +95.86155507093572,38.22527805795094,0 +75.01365838958247,30.60326323428011,0 +82.30705337399482,76.48196330235604,1 +69.36458875970939,97.71869196188608,1 +39.53833914367223,76.03681085115882,0 +53.9710521485623,89.20735013750205,1 +69.07014406283025,52.74046973016765,1 +67.94685547711617,46.67857410673128,0 +70.66150955499435,92.92713789364831,1 +76.97878372747498,47.57596364975532,1 +67.37202754570876,42.83843832029179,0 +89.67677575072079,65.79936592745237,1 +50.534788289883,48.85581152764205,0 +34.21206097786789,44.20952859866288,0 +77.9240914545704,68.9723599933059,1 +62.27101367004632,69.95445795447587,1 +80.1901807509566,44.82162893218353,1 +93.114388797442,38.80067033713209,0 +61.83020602312595,50.25610789244621,0 +38.78580379679423,64.99568095539578,0 +61.379289447425,72.80788731317097,1 +85.40451939411645,57.05198397627122,1 +52.10797973193984,63.12762376881715,0 +52.04540476831827,69.43286012045222,1 +40.23689373545111,71.16774802184875,0 +54.63510555424817,52.21388588061123,0 +33.91550010906887,98.86943574220611,0 +64.17698887494485,80.90806058670817,1 +74.78925295941542,41.57341522824434,0 +34.1836400264419,75.2377203360134,0 +83.90239366249155,56.30804621605327,1 +51.54772026906181,46.85629026349976,0 +94.44336776917852,65.56892160559052,1 +82.36875375713919,40.61825515970618,0 +51.04775177128865,45.82270145776001,0 +62.22267576120188,52.06099194836679,0 +77.19303492601364,70.45820000180959,1 +97.77159928000232,86.7278223300282,1 +62.07306379667647,96.76882412413983,1 +91.56497449807442,88.69629254546599,1 +79.94481794066932,74.16311935043758,1 +99.2725269292572,60.99903099844988,1 +90.54671411399852,43.39060180650027,1 +34.52451385320009,60.39634245837173,0 +50.2864961189907,49.80453881323059,0 +49.58667721632031,59.80895099453265,0 +97.64563396007767,68.86157272420604,1 +32.57720016809309,95.59854761387875,0 +74.24869136721598,69.82457122657193,1 +71.79646205863379,78.45356224515052,1 +75.3956114656803,85.75993667331619,1 +35.28611281526193,47.02051394723416,0 +56.25381749711624,39.26147251058019,0 +30.05882244669796,49.59297386723685,0 +44.66826172480893,66.45008614558913,0 +66.56089447242954,41.09209807936973,0 +40.45755098375164,97.53518548909936,1 +49.07256321908844,51.88321182073966,0 +80.27957401466998,92.11606081344084,1 +66.74671856944039,60.99139402740988,1 +32.72283304060323,43.30717306430063,0 +64.0393204150601,78.03168802018232,1 +72.34649422579923,96.22759296761404,1 +60.45788573918959,73.09499809758037,1 +58.84095621726802,75.85844831279042,1 +99.82785779692128,72.36925193383885,1 +47.26426910848174,88.47586499559782,1 +50.45815980285988,75.80985952982456,1 +60.45555629271532,42.50840943572217,0 +82.22666157785568,42.71987853716458,0 +88.9138964166533,69.80378889835472,1 +94.83450672430196,45.69430680250754,1 +67.31925746917527,66.58935317747915,1 +57.23870631569862,59.51428198012956,1 +80.36675600171273,90.96014789746954,1 +68.46852178591112,85.59430710452014,1 +42.0754545384731,78.84478600148043,0 
+75.47770200533905,90.42453899753964,1 +78.63542434898018,96.64742716885644,1 +52.34800398794107,60.76950525602592,0 +94.09433112516793,77.15910509073893,1 +90.44855097096364,87.50879176484702,1 +55.48216114069585,35.57070347228866,0 +74.49269241843041,84.84513684930135,1 +89.84580670720979,45.35828361091658,1 +83.48916274498238,48.38028579728175,1 +42.2617008099817,87.10385094025457,1 +99.31500880510394,68.77540947206617,1 +55.34001756003703,64.9319380069486,1 +74.77589300092767,89.52981289513276,1 diff --git a/ex2/ex2data2.txt b/ex2/ex2data2.txt new file mode 100644 index 0000000..a888992 --- /dev/null +++ b/ex2/ex2data2.txt @@ -0,0 +1,118 @@ +0.051267,0.69956,1 +-0.092742,0.68494,1 +-0.21371,0.69225,1 +-0.375,0.50219,1 +-0.51325,0.46564,1 +-0.52477,0.2098,1 +-0.39804,0.034357,1 +-0.30588,-0.19225,1 +0.016705,-0.40424,1 +0.13191,-0.51389,1 +0.38537,-0.56506,1 +0.52938,-0.5212,1 +0.63882,-0.24342,1 +0.73675,-0.18494,1 +0.54666,0.48757,1 +0.322,0.5826,1 +0.16647,0.53874,1 +-0.046659,0.81652,1 +-0.17339,0.69956,1 +-0.47869,0.63377,1 +-0.60541,0.59722,1 +-0.62846,0.33406,1 +-0.59389,0.005117,1 +-0.42108,-0.27266,1 +-0.11578,-0.39693,1 +0.20104,-0.60161,1 +0.46601,-0.53582,1 +0.67339,-0.53582,1 +-0.13882,0.54605,1 +-0.29435,0.77997,1 +-0.26555,0.96272,1 +-0.16187,0.8019,1 +-0.17339,0.64839,1 +-0.28283,0.47295,1 +-0.36348,0.31213,1 +-0.30012,0.027047,1 +-0.23675,-0.21418,1 +-0.06394,-0.18494,1 +0.062788,-0.16301,1 +0.22984,-0.41155,1 +0.2932,-0.2288,1 +0.48329,-0.18494,1 +0.64459,-0.14108,1 +0.46025,0.012427,1 +0.6273,0.15863,1 +0.57546,0.26827,1 +0.72523,0.44371,1 +0.22408,0.52412,1 +0.44297,0.67032,1 +0.322,0.69225,1 +0.13767,0.57529,1 +-0.0063364,0.39985,1 +-0.092742,0.55336,1 +-0.20795,0.35599,1 +-0.20795,0.17325,1 +-0.43836,0.21711,1 +-0.21947,-0.016813,1 +-0.13882,-0.27266,1 +0.18376,0.93348,0 +0.22408,0.77997,0 +0.29896,0.61915,0 +0.50634,0.75804,0 +0.61578,0.7288,0 +0.60426,0.59722,0 +0.76555,0.50219,0 +0.92684,0.3633,0 +0.82316,0.27558,0 +0.96141,0.085526,0 +0.93836,0.012427,0 +0.86348,-0.082602,0 +0.89804,-0.20687,0 +0.85196,-0.36769,0 +0.82892,-0.5212,0 +0.79435,-0.55775,0 +0.59274,-0.7405,0 +0.51786,-0.5943,0 +0.46601,-0.41886,0 +0.35081,-0.57968,0 +0.28744,-0.76974,0 +0.085829,-0.75512,0 +0.14919,-0.57968,0 +-0.13306,-0.4481,0 +-0.40956,-0.41155,0 +-0.39228,-0.25804,0 +-0.74366,-0.25804,0 +-0.69758,0.041667,0 +-0.75518,0.2902,0 +-0.69758,0.68494,0 +-0.4038,0.70687,0 +-0.38076,0.91886,0 +-0.50749,0.90424,0 +-0.54781,0.70687,0 +0.10311,0.77997,0 +0.057028,0.91886,0 +-0.10426,0.99196,0 +-0.081221,1.1089,0 +0.28744,1.087,0 +0.39689,0.82383,0 +0.63882,0.88962,0 +0.82316,0.66301,0 +0.67339,0.64108,0 +1.0709,0.10015,0 +-0.046659,-0.57968,0 +-0.23675,-0.63816,0 +-0.15035,-0.36769,0 +-0.49021,-0.3019,0 +-0.46717,-0.13377,0 +-0.28859,-0.060673,0 +-0.61118,-0.067982,0 +-0.66302,-0.21418,0 +-0.59965,-0.41886,0 +-0.72638,-0.082602,0 +-0.83007,0.31213,0 +-0.72062,0.53874,0 +-0.59389,0.49488,0 +-0.48445,0.99927,0 +-0.0063364,0.99927,0 +0.63265,-0.030612,0 diff --git a/ex2/gradientFunction.py b/ex2/gradientFunction.py new file mode 100644 index 0000000..d3e1bdb --- /dev/null +++ b/ex2/gradientFunction.py @@ -0,0 +1,23 @@ +from sigmoid import sigmoid +from numpy import squeeze, asarray + + +def gradientFunction(theta, X, y): + """ + Compute cost and gradient for logistic regression with regularization + + computes the cost of using theta as the parameter for regularized logistic regression and the + gradient of the cost w.r.t. to the parameters. 
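+
+    (Reference sketch only; the graded code still goes in the marked section
+    below. Without the regularization term, one vectorized form of the
+    logistic-regression gradient is
+        grad = X.T.dot(sigmoid(X.dot(theta)) - y) / m .)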
+ """ + + m = len(y) # number of training examples + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the gradient of a particular choice of theta. +# Compute the partial derivatives and set grad to the partial +# derivatives of the cost w.r.t. each parameter in theta + + +# ============================================================= + + return grad diff --git a/ex2/gradientFunctionReg.py b/ex2/gradientFunctionReg.py new file mode 100644 index 0000000..e9b63a6 --- /dev/null +++ b/ex2/gradientFunctionReg.py @@ -0,0 +1,23 @@ +from numpy import asfortranarray, squeeze, asarray + +from gradientFunction import gradientFunction + + +def gradientFunctionReg(theta, X, y, Lambda): + """ + Compute cost and gradient for logistic regression with regularization + + computes the cost of using theta as the parameter for regularized logistic regression and the + gradient of the cost w.r.t. to the parameters. + """ + m = len(y) # number of training examples + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the gradient of a particular choice of theta. +# Compute the partial derivatives and set grad to the partial +# derivatives of the cost w.r.t. each parameter in theta + + +# ============================================================= + + return grad \ No newline at end of file diff --git a/ex2/ml.py b/ex2/ml.py new file mode 100644 index 0000000..15984e7 --- /dev/null +++ b/ex2/ml.py @@ -0,0 +1,75 @@ +import numpy as np +from matplotlib import pyplot as plt +from pandas import Series +from mpl_toolkits.mplot3d import axes3d + + +def plotData(X,y): + pos = X[np.where(y==1,True,False).flatten()] + neg = X[np.where(y==0,True,False).flatten()] + plt.plot(pos[:,0], pos[:,1], '+', markersize=7, markeredgecolor='black', markeredgewidth=2) + plt.plot(neg[:,0], neg[:,1], 'o', markersize=7, markeredgecolor='black', markerfacecolor='yellow') + +def plotDecisionBoundary(theta, X, y): + """ + Plots the data points X and y into a new figure with the decision boundary defined by theta + PLOTDECISIONBOUNDARY(theta, X,y) plots the data points with + for the + positive examples and o for the negative examples. X is assumed to be + a either + 1) Mx3 matrix, where the first column is an all-ones column for the + intercept. + 2) MxN, N>3 matrix, where the first column is all-ones + """ + + # Plot Data + plt.figure() + plotData(X[:,1:], y) + + if X.shape[1] <= 3: + # Only need 2 points to define a line, so choose two endpoints + plot_x = np.array([min(X[:, 2]), max(X[:, 2])]) + + # Calculate the decision boundary line + plot_y = (-1./theta[2])*(theta[1]*plot_x + theta[0]) + + # Plot, and adjust axes for better viewing + plt.plot(plot_x, plot_y) + + else: + # Here is the grid range + u = np.linspace(-1, 1.5, 50) + v = np.linspace(-1, 1.5, 50) + z = [ + np.array([mapFeature2(u[i], v[j]).dot(theta) for i in range(len(u))]) + for j in range(len(v)) + ] + plt.contour(u,v,z, levels=[0.0]) + + # Legend, specific for the exercise + # axis([30, 100, 30, 100]) + +def mapFeature(X, degree=6): + """ + Feature mapping function to polynomial features + + MAPFEATURE(X, degree) maps the two input features + to quadratic features used in the regularization exercise. + + Returns a new feature array with more features, comprising of + X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc.. 
+ """ + quads = Series([X.iloc[0]**(i-j) * X.iloc[1]**j for i in range(1,degree+1) for j in range(i+1)]) + return Series([1]).append([X,quads]) + +def mapFeature2(X1, X2, degree=6): + """ + Feature mapping function to polynomial features + + MAPFEATURE(X, degree) maps the two input features + to quadratic features used in the regularization exercise. + + Returns a new feature array with more features, comprising of + X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc.. + """ + quads = Series([X1**(i-j) * X2**j for i in range(1,degree+1) for j in range(i+1)]) + return Series([1]).append([Series(X1), Series(X2), quads]) diff --git a/ex2/predict.py b/ex2/predict.py new file mode 100644 index 0000000..4fd3406 --- /dev/null +++ b/ex2/predict.py @@ -0,0 +1,21 @@ +from numpy import round + +from sigmoid import sigmoid + + +def predict(theta, X): + + """ computes the predictions for X using a threshold at 0.5 + (i.e., if sigmoid(theta'*x) >= 0.5, predict 1) + """ + +# ====================== YOUR CODE HERE ====================== +# Instructions: Complete the following code to make predictions using +# your learned logistic regression parameters. +# You should set p to a vector of 0's and 1's +# + + +# ========================================================================= + + return p \ No newline at end of file diff --git a/ex2/sigmoid.py b/ex2/sigmoid.py new file mode 100644 index 0000000..feaad0b --- /dev/null +++ b/ex2/sigmoid.py @@ -0,0 +1,11 @@ +from numpy import e + +def sigmoid(z): + """computes the sigmoid of z.""" + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the sigmoid of each value of z (z can be a matrix, +# vector or scalar). + +# ============================================================= + return g \ No newline at end of file diff --git a/ex2/submit.py b/ex2/submit.py new file mode 100644 index 0000000..dfcc56a --- /dev/null +++ b/ex2/submit.py @@ -0,0 +1,53 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +__all__ = ['submit'] + +homework = 'logistic-regression' + +part_names = [ + 'Sigmoid Function', + 'Logistic Regression Cost', + 'Logistic Regression Gradient', + 'Predict', + 'Regularized Logistic Regression Cost', + 'Regularized Logistic Regression Gradient', + ] + +srcs = [ + 'sigmoid.py', + 'costFunction.py', + 'gradientFunction.py', + 'predict.py', + 'costFunctionReg.py', + 'gradientFunctionReg.py', + ] + + +def output(part_id): + X = np.column_stack((np.ones(20), + (np.exp(1) * np.sin(np.linspace(1, 20, 20))), + (np.exp(0.5) * np.cos(np.linspace(1, 20, 20))))) + Y = np.sin(X[:,0] + X[:,1]) > 0 + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + return sprintf('%0.5f ', func(X)) + elif part_id == 2: + return sprintf('%0.5f ', func(np.array([0.25, 0.5, -0.5]), X, Y)) + elif part_id == 3: + return sprintf('%0.5f ', func(np.array([0.25, 0.5, -0.5]), X, Y)) + elif part_id == 4: + return sprintf('%0.5f ', func(np.array([0.25, 0.5, -0.5]), X)) + elif part_id == 5: + return sprintf('%0.5f ', func(np.array([0.25, 0.5, -0.5]), X, Y, 0.1)) + elif part_id == 6: + return sprintf('%0.5f ', func(np.array([0.25, 0.5, -0.5]), X, Y, 0.1)) + +s = Submission(homework, part_names, srcs, output) +s.submit() diff --git a/ex3/__init__.py b/ex3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ex3/displayData.py b/ex3/displayData.py new file mode 100644 index 0000000..21dbe57 --- /dev/null +++ 
b/ex3/displayData.py @@ -0,0 +1,51 @@ +import numpy as np +from matplotlib import use +use('TkAgg') +import matplotlib.pyplot as plt + +from show import show + +def displayData(X): + """displays 2D data + stored in X in a nice grid. It returns the figure handle h and the + displayed array if requested.""" + +# Compute rows, cols + m, n = X.shape + example_width = round(np.sqrt(n)) + example_height = (n / example_width) + +# Compute number of items to display + display_rows = np.floor(np.sqrt(m)) + display_cols = np.ceil(m / display_rows) + +# Between images padding + pad = 1 + +# Setup blank display + display_array = - np.ones((pad + display_rows * (example_height + pad), + pad + display_cols * (example_width + pad))) + +# Copy each example into a patch on the display array + curr_ex = 0 + for j in np.arange(display_rows): + for i in np.arange(display_cols): + if curr_ex > m: + break + # Get the max value of the patch + max_val = np.max(np.abs(X[curr_ex, : ])) + rows = [pad + j * (example_height + pad) + x for x in np.arange(example_height+1)] + cols = [pad + i * (example_width + pad) + x for x in np.arange(example_width+1)] + display_array[min(rows):max(rows), min(cols):max(cols)] = X[curr_ex, :].reshape(example_height, example_width) / max_val + curr_ex = curr_ex + 1 + if curr_ex > m: + break + +# Display Image + display_array = display_array.astype('float32') + plt.imshow(display_array.T) + plt.set_cmap('gray') +# Do not show axis + plt.axis('off') + show() + diff --git a/ex3/ex3.py b/ex3/ex3.py new file mode 100644 index 0000000..8f422e2 --- /dev/null +++ b/ex3/ex3.py @@ -0,0 +1,75 @@ +## Machine Learning Online Class - Exercise 3 | Part 1: One-vs-all +import scipy.io +import numpy as np +from matplotlib import use +use('TkAgg') + +from oneVsAll import oneVsAll +from predictOneVsAll import predictOneVsAll +from displayData import displayData + +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# linear exercise. You will need to complete the following functions +# in this exericse: +# +# lrCostFunction.m (logistic regression cost function) +# oneVsAll.m +# predictOneVsAll.m +# predict.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# + +## Setup the parameters you will use for this part of the exercise +input_layer_size = 400 # 20x20 Input Images of Digits +num_labels = 10 # 10 labels, from 1 to 10 + # (note that we have mapped "0" to label 10) + +## =========== Part 1: Loading and Visualizing Data ============= +# We start the exercise by first loading and visualizing the dataset. +# You will be working with a dataset that contains handwritten digits. +# + +# Load Training Data +print 'Loading and Visualizing Data ...' + +data = scipy.io.loadmat('ex3data1.mat') # training data stored in arrays X, y +X = data['X'] +y = data['y'] +m, _ = X.shape + +# Randomly select 100 data points to display +rand_indices = np.random.permutation(range(m)) +sel = X[rand_indices[0:100], :] + +displayData(sel) + +raw_input("Program paused. Press Enter to continue...") + +## ============ Part 2: Vectorize Logistic Regression ============ +# In this part of the exercise, you will reuse your logistic regression +# code from the last exercise. You task here is to make sure that your +# regularized logistic regression implementation is vectorized. After +# that, you will implement one-vs-all classification for the handwritten +# digit dataset. 
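+#
+#  (Illustration of what "vectorized" means here, not the required code: with
+#   h = sigmoid(X.dot(theta)), the regularized cost can be expressed as
+#       J = (-y.dot(np.log(h)) - (1 - y).dot(np.log(1 - h))) / m \
+#           + Lambda / (2.0 * m) * np.sum(theta[1:] ** 2)
+#   i.e. with no explicit loop over the m training examples.)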
+# + +print 'Training One-vs-All Logistic Regression...' + +Lambda = 0.1 +all_theta = oneVsAll(X, y, num_labels, Lambda) + +raw_input("Program paused. Press Enter to continue...") + + +## ================ Part 3: Predict for One-Vs-All ================ +# After ... +pred = predictOneVsAll(all_theta, X) + +accuracy = np.mean(np.double(pred == np.squeeze(y))) * 100 +print '\nTraining Set Accuracy: %f\n' % accuracy + diff --git a/ex3/ex3_nn.py b/ex3/ex3_nn.py new file mode 100644 index 0000000..2a562ff --- /dev/null +++ b/ex3/ex3_nn.py @@ -0,0 +1,98 @@ +## Machine Learning Online Class - Exercise 3 | Part 2: Neural Networks + +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# linear exercise. You will need to complete the following functions +# in this exericse: +# +# lrCostFunction.m (logistic regression cost function) +# oneVsAll.m +# predictOneVsAll.m +# predict.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# +from matplotlib import use +use('TkAgg') +import scipy.io +import numpy as np +import matplotlib.pyplot as plt + +from displayData import displayData +from predict import predict + +## Setup the parameters you will use for this exercise +input_layer_size = 400 # 20x20 Input Images of Digits +hidden_layer_size = 25 # 25 hidden units +num_labels = 10 # 10 labels, from 1 to 10 + # (note that we have mapped "0" to label 10) + +## =========== Part 1: Loading and Visualizing Data ============= +# We start the exercise by first loading and visualizing the dataset. +# You will be working with a dataset that contains handwritten digits. +# + +# Load Training Data +print 'Loading and Visualizing Data ...' + +data = scipy.io.loadmat('ex3data1.mat') +X = data['X'] +y = data['y'] +m, _ = X.shape + +# Randomly select 100 data points to display +sel = np.random.permutation(range(m)) +sel = sel[0:100] + +displayData(X[sel,:]) + +raw_input("Program paused. Press Enter to continue...") + +## ================ Part 2: Loading Pameters ================ +# In this part of the exercise, we load some pre-initialized +# neural network parameters. + +print 'Loading Saved Neural Network Parameters ...' + +# Load the weights into variables Theta1 and Theta2 +data = scipy.io.loadmat('ex3weights.mat') +Theta1 = data['Theta1'] +Theta2 = data['Theta2'] + +## ================= Part 3: Implement Predict ================= +# After training the neural network, we would like to use it to predict +# the labels. You will now implement the "predict" function to use the +# neural network to predict the labels of the training set. This lets +# you compute the training set accuracy. + +pred = predict(Theta1, Theta2, X) + +print 'Training Set Accuracy: %f\n', np.mean(np.double(pred == np.squeeze(y))) * 100 + +raw_input("Program paused. Press Enter to continue...") + +# To give you an idea of the network's output, you can also run +# through the examples one at the a time to see what it is predicting. + +# Randomly permute examples +rp = np.random.permutation(range(m)) + +plt.figure() +for i in range(m): + # Display + X2 = X[rp[i],:] + print 'Displaying Example Image' + X2 = np.matrix(X[rp[i]]) + displayData(X2) + + pred = predict(Theta1, Theta2, X2.getA()) + pred = np.squeeze(pred) + print 'Neural Network Prediction: %d (digit %d)\n' % (pred, np.mod(pred, 10)) + + raw_input("Program paused. 
Press Enter to continue...") + plt.close() + + diff --git a/ex3/ex3data1.mat b/ex3/ex3data1.mat new file mode 100644 index 0000000..371bd0c Binary files /dev/null and b/ex3/ex3data1.mat differ diff --git a/ex3/ex3weights.mat b/ex3/ex3weights.mat new file mode 100644 index 0000000..ace2a09 Binary files /dev/null and b/ex3/ex3weights.mat differ diff --git a/ex3/lrCostFunction.py b/ex3/lrCostFunction.py new file mode 100644 index 0000000..f286833 --- /dev/null +++ b/ex3/lrCostFunction.py @@ -0,0 +1,27 @@ +from ex2.costFunctionReg import costFunctionReg + +def lrCostFunction(theta, X, y, Lambda): + """computes the cost of using + theta as the parameter for regularized logistic regression and the + gradient of the cost w.r.t. to the parameters. + """ + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost of a particular choice of theta. +# You should set J to the cost. +# +# Hint: The computation of the cost function and gradients can be +# efficiently vectorized. For example, consider the computation +# +# sigmoid(X * theta) +# +# Each row of the resulting matrix will contain the value of the +# prediction for that example. You can make use of this to vectorize +# the cost function and gradient computations. +# + + + + # ============================================================= + + return J diff --git a/ex3/oneVsAll.py b/ex3/oneVsAll.py new file mode 100644 index 0000000..0e08123 --- /dev/null +++ b/ex3/oneVsAll.py @@ -0,0 +1,47 @@ +import numpy as np +from scipy.optimize import minimize + +from lrCostFunction import lrCostFunction +from ex2.gradientFunctionReg import gradientFunctionReg + + +def oneVsAll(X, y, num_labels, Lambda): + """trains multiple logistic regression classifiers and returns all + the classifiers in a matrix all_theta, where the i-th row of all_theta + corresponds to the classifier for label i + """ + +# Some useful variables + m, n = X.shape + +# You need to return the following variables correctly + all_theta = np.zeros((num_labels, n + 1)) + +# Add ones to the X data matrix + X = np.column_stack((np.ones((m, 1)), X)) + +# ====================== YOUR CODE HERE ====================== +# Instructions: You should complete the following code to train num_labels +# logistic regression classifiers with regularization +# parameter lambda. +# +# Hint: theta(:) will return a column vector. +# +# Hint: You can use y == c to obtain a vector of 1's and 0's that tell use +# whether the ground truth is true/false for this class. +# +# Note: For this assignment, we recommend using fmincg to optimize the cost +# function. It is okay to use a for-loop (for c = 1:num_labels) to +# loop over the different classes. + + # Set Initial theta + initial_theta = np.zeros((n + 1, 1)) + + # This function will return theta and the cost + + + +# ========================================================================= + + return all_theta + diff --git a/ex3/predict.py b/ex3/predict.py new file mode 100644 index 0000000..0af32b6 --- /dev/null +++ b/ex3/predict.py @@ -0,0 +1,28 @@ +import numpy as np + +from ex2.sigmoid import sigmoid + +def predict(Theta1, Theta2, X): + """ outputs the predicted label of X given the + trained weights of a neural network (Theta1, Theta2) + """ + +# Useful values + m, _ = X.shape + num_labels, _ = Theta2.shape + +# ====================== YOUR CODE HERE ====================== +# Instructions: Complete the following code to make predictions using +# your learned neural network. 
You should set p to a +# vector containing labels between 1 to num_labels. +# +# Hint: The max function might come in useful. In particular, the max +# function can also return the index of the max element, for more +# information see 'help max'. If your examples are in rows, then, you +# can use max(A, [], 2) to obtain the max for each row. +# + +# ========================================================================= + + return p + 1 # add 1 to offset index of maximum in A row + diff --git a/ex3/predictOneVsAll.py b/ex3/predictOneVsAll.py new file mode 100644 index 0000000..b1e406c --- /dev/null +++ b/ex3/predictOneVsAll.py @@ -0,0 +1,37 @@ +import numpy as np + +from ex2.sigmoid import sigmoid + +def predictOneVsAll(all_theta, X): + """will return a vector of predictions + for each example in the matrix X. Note that X contains the examples in + rows. all_theta is a matrix where the i-th row is a trained logistic + regression theta vector for the i-th class. You should set p to a vector + of values from 1..K (e.g., p = [1 3 1 2] predicts classes 1, 3, 1, 2 + for 4 examples) """ + + m = X.shape[0] + + # You need to return the following variables correctly + p = np.zeros((m, 1)) + + # Add ones to the X data matrix + X = np.column_stack((np.ones((m, 1)), X)) + +# ====================== YOUR CODE HERE ====================== +# Instructions: Complete the following code to make predictions using +# your learned logistic regression parameters (one-vs-all). +# You should set p to a vector of predictions (from 1 to +# num_labels). +# +# Hint: This code can be done all vectorized using the max function. +# In particular, the max function can also return the index of the +# max element, for more information see 'help max'. If your examples +# are in rows, then, you can use max(A, [], 2) to obtain the max +# for each row. 
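+#
+# (One illustrative vectorized route, not prescribed by the assignment:
+#      p = np.argmax(X.dot(all_theta.T), axis=1)
+#  np.argmax plays the role of Octave's max-index output; since this function
+#  returns p + 1, the 0-based argmax index maps onto the labels 1..num_labels.)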
+# + + +# ========================================================================= + + return p + 1 # add 1 to offset index of maximum in A row diff --git a/ex3/submit.py b/ex3/submit.py new file mode 100644 index 0000000..8f568de --- /dev/null +++ b/ex3/submit.py @@ -0,0 +1,58 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf +from lrCostFunction import lrCostFunction +from oneVsAll import oneVsAll +from predictOneVsAll import predictOneVsAll +from predict import predict +from ex2.gradientFunctionReg import gradientFunctionReg + +homework = 'multi-class-classification-and-neural-networks' + +part_names = [ + 'Regularized Logistic Regression', + 'One-vs-All Classifier Training', + 'One-vs-All Classifier Prediction', + 'Neural Network Prediction Function', + ] + +srcs = [ + 'lrCostFunction.py', + 'oneVsAll.py', + 'predictOneVsAll.py', + 'predict.py', + ] + + +def output(part_id): + # Random Test Cases + X = np.column_stack((np.ones(20), + (np.exp(1) * np.sin(np.linspace(1, 20, 20))), + (np.exp(0.5) * np.cos(np.linspace(1, 20, 20))))) + y = np.sin(X[:,0] + X[:,1]) > 0 + + Xm = np.array([[-1,-1],[-1,-2],[-2,-1],[-2,-2],[1,1],[1,2],[2,1],[2,2],[-1,1], + [-1,2],[-2,1],[-2,2],[1,-1],[1,-2],[-2,-1],[-2,-2]]).reshape((16,2)) + ym = np.array([1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4]).reshape(16,1) + t1 = np.sin(np.array(range(1,24,2)).reshape(3,4).T) + t2 = np.cos(np.array(range(1,40,2)).reshape(5,4).T) + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + J = lrCostFunction(np.array([0.25, 0.5, -0.5]), X, y, 0.1) + grad = gradientFunctionReg(np.array([0.25, 0.5, -0.5]), X, y, 0.1) + return sprintf('%0.5f ', np.hstack((J, grad)).tolist()) + elif part_id == 2: + return sprintf('%0.5f ', oneVsAll(Xm, ym, 4, 0.1)) + elif part_id == 3: + return sprintf('%0.5f ', predictOneVsAll(t1, Xm)) + elif part_id == 4: + return sprintf('%0.5f ', predict(t1, t2, Xm)) + +s = Submission(homework, part_names, srcs, output) +s.submit() + diff --git a/ex4/__init__.py b/ex4/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ex4/checkNNGradients.py b/ex4/checkNNGradients.py new file mode 100644 index 0000000..f2aa82a --- /dev/null +++ b/ex4/checkNNGradients.py @@ -0,0 +1,54 @@ +import numpy as np + +from debugInitializeWeights import debugInitializeWeights +from computeNumericalGradient import computeNumericalGradient +from nnCostFunction import nnCostFunction + +def checkNNGradients(Lambda = 0): + + """Creates a small neural network to check the + backpropagation gradients, it will output the analytical gradients + produced by your backprop code and the numerical gradients (computed + using computeNumericalGradient). These two gradient computations should + result in very similar values. 
+ """ + + input_layer_size = 3 + hidden_layer_size = 5 + num_labels = 3 + m = 5 + + # We generate some 'random' test data + Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size) + Theta2 = debugInitializeWeights(num_labels, hidden_layer_size) + + # Reusing debugInitializeWeights to generate X + X = debugInitializeWeights(m, input_layer_size - 1) + y = np.mod(range(1, m+1), num_labels) + + # Unroll parameters + nn_params = np.hstack((Theta1.T.ravel(), Theta2.T.ravel())) + + # Short hand for cost function + + costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda) + + numgrad = computeNumericalGradient(costFunc, nn_params) + grad = costFunc(nn_params)[1] + + # Visually examine the two gradient computations. The two columns + # you get should be very similar. + print np.column_stack((numgrad, grad)) + + print 'The above two columns you get should be very similar.\n' \ + '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n' + + # Evaluate the norm of the difference between two solutions. + # If you have a correct implementation, and assuming you used EPSILON = 0.0001 + # in computeNumericalGradient.m, then diff below should be less than 1e-9 + diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad) + + print 'If your backpropagation implementation is correct, then\n ' \ + 'the relative difference will be small (less than 1e-9). \n' \ + '\nRelative Difference: %g\n' % diff + diff --git a/ex4/computeNumericalGradient.py b/ex4/computeNumericalGradient.py new file mode 100644 index 0000000..f46dabe --- /dev/null +++ b/ex4/computeNumericalGradient.py @@ -0,0 +1,29 @@ +import numpy as np + +def computeNumericalGradient(J, theta): + """computes the numerical gradient of the function J around theta. + Calling y = J(theta) should return the function value at theta. + """ +# Notes: The following code implements numerical gradient checking, and +# returns the numerical gradient.It sets numgrad(i) to (a numerical +# approximation of) the partial derivative of J with respect to the +# i-th input argument, evaluated at theta. (i.e., numgrad(i) should +# be the (approximately) the partial derivative of J with respect +# to theta(i).) 
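+#
+# Note: the callable J is expected to return an indexable result (for
+# example the (cost, grad) tuple produced by nnCostFunction); only element
+# [0], the cost, is used in the finite-difference formula below.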
+ + numgrad = np.zeros(theta.shape[0]) + perturb = np.zeros(theta.shape[0]) + e = 1e-4 + for p in range(theta.size): + + # Set perturbation vector + perturb[p] = e + loss1 = J(theta - perturb) + loss2 = J(theta + perturb) + + # Compute Numerical Gradient + numgrad[p] = (loss2[0] - loss1[0]) / (2*e) + perturb[p] = 0 + + return numgrad + diff --git a/ex4/debugInitializeWeights.py b/ex4/debugInitializeWeights.py new file mode 100644 index 0000000..32ca154 --- /dev/null +++ b/ex4/debugInitializeWeights.py @@ -0,0 +1,20 @@ +import numpy as np + +def debugInitializeWeights(fan_out, fan_in): + """initializes the weights of a layer with fan_in incoming connections + and fan_out outgoing connections using a fix set of values + + Note that W should be set to a matrix of size(1 + fan_in, fan_out) as + the first row of W handles the "bias" terms + """ + +# Set W to zeros + W = np.zeros((fan_out, 1 + fan_in)) + +# Initialize W using "sin", this ensures that W is always of the same +# values and will be useful for debugging + W = np.reshape(np.sin(range(1, W.size+1)), W.T.shape).T / 10.0 + return W + +# ========================================================================= + diff --git a/ex4/ex4.py b/ex4/ex4.py new file mode 100644 index 0000000..1fc65a8 --- /dev/null +++ b/ex4/ex4.py @@ -0,0 +1,231 @@ +## Machine Learning Online Class - Exercise 4 Neural Network Learning + +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# linear exercise. You will need to complete the following functions +# in this exericse: +# +# sigmoidGradient.m +# randInitializeWeights.m +# nnCostFunction.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# + +import numpy as np +import scipy.io +from scipy.optimize import minimize + +from ex3.displayData import displayData +from ex3.predict import predict +from nnCostFunction import nnCostFunction +from sigmoidGradient import sigmoidGradient +from randInitializeWeights import randInitializeWeights +from checkNNGradients import checkNNGradients + +## Setup the parameters you will use for this exercise +input_layer_size = 400 # 20x20 Input Images of Digits +hidden_layer_size = 25 # 25 hidden units +num_labels = 10 # 10 labels, from 1 to 10 + # (note that we have mapped "0" to label 10) + +## =========== Part 1: Loading and Visualizing Data ============= +# We start the exercise by first loading and visualizing the dataset. +# You will be working with a dataset that contains handwritten digits. +# + +# Load Training Data +print 'Loading and Visualizing Data ...' + +data = scipy.io.loadmat('ex4data1.mat') +X = data['X'] +y = data['y'] +m, _ = X.shape + +# Randomly select 100 data points to display +rand_indices = np.random.permutation(range(m)) +sel = X[rand_indices[0:100], :] + +displayData(sel) + +raw_input("Program paused. Press Enter to continue...") + + +## ================ Part 2: Loading Parameters ================ +# In this part of the exercise, we load some pre-initialized +# neural network parameters. + +print 'Loading Saved Neural Network Parameters ...' 
+ +# Load the weights into variables Theta1 and Theta2 +data = scipy.io.loadmat('ex4weights.mat') +Theta1 = data['Theta1'] +Theta2 = data['Theta2'] +y = np.squeeze(y) + +# Unroll parameters +nn_params = np.hstack((Theta1.T.ravel(), Theta2.T.ravel())) + +## ================ Part 3: Compute Cost (Feedforward) ================ +# To the neural network, you should first start by implementing the +# feedforward part of the neural network that returns the cost only. You +# should complete the code in nnCostFunction.m to return cost. After +# implementing the feedforward to compute the cost, you can verify that +# your implementation is correct by verifying that you get the same cost +# as us for the fixed debugging parameters. +# +# We suggest implementing the feedforward cost *without* regularization +# first so that it will be easier for you to debug. Later, in part 4, you +# will get to implement the regularized cost. +# +print 'Feedforward Using Neural Network ...' + +# Weight regularization parameter (we set this to 0 here). +Lambda = 0 + +J, _ = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, + num_labels, X, y, Lambda) + +print 'Cost at parameters (loaded from ex4weights): %f \n(this value should be about 0.287629)\n' % J + +raw_input("Program paused. Press Enter to continue...") + +## =============== Part 4: Implement Regularization =============== +# Once your cost function implementation is correct, you should now +# continue to implement the regularization with the cost. +# + +print 'Checking Cost Function (w/ Regularization) ...' + +# Weight regularization parameter (we set this to 1 here). +Lambda = 1 + +J, _ = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda) + +print 'Cost at parameters (loaded from ex4weights): %f \n(this value should be about 0.383770)' % J + +raw_input("Program paused. Press Enter to continue...") + + +## ================ Part 5: Sigmoid Gradient ================ +# Before you start implementing the neural network, you will first +# implement the gradient for the sigmoid function. You should complete the +# code in the sigmoidGradient.m file. +# + +print 'Evaluating sigmoid gradient...' + +g = sigmoidGradient(np.array([1, -0.5, 0, 0.5, 1])) +print 'Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]: ' +print g + +raw_input("Program paused. Press Enter to continue...") + + +## ================ Part 6: Initializing Pameters ================ +# In this part of the exercise, you will be starting to implment a two +# layer neural network that classifies digits. You will start by +# implementing a function to initialize the weights of the neural network +# (randInitializeWeights.m) + +print 'Initializing Neural Network Parameters ...' + +initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size) +initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels) + +# Unroll parameters +initial_nn_params = np.hstack((initial_Theta1.T.ravel(), initial_Theta2.T.ravel())) + + +## =============== Part 7: Implement Backpropagation =============== +# Once your cost matches up with ours, you should proceed to implement the +# backpropagation algorithm for the neural network. You should add to the +# code you've written in nnCostFunction.m to return the partial +# derivatives of the parameters. +# +print 'Checking Backpropagation... ' + +# Check gradients by running checkNNGradients +checkNNGradients() + +raw_input("Program paused. 
Press Enter to continue...") + + +## =============== Part 8: Implement Regularization =============== +# Once your backpropagation implementation is correct, you should now +# continue to implement the regularization with the cost and gradient. +# + +print 'Checking Backpropagation (w/ Regularization) ... ' + +# Check gradients by running checkNNGradients +Lambda = 3.0 +checkNNGradients(Lambda) + +# Also output the costFunction debugging values +debug_J, _ = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda) + +print 'Cost at (fixed) debugging parameters (w/ lambda = 10): %f (this value should be about 0.576051)\n\n' % debug_J + +raw_input("Program paused. Press Enter to continue...") + + +## =================== Part 8: Training NN =================== +# You have now implemented all the code necessary to train a neural +# network. To train your neural network, we will now use "fmincg", which +# is a function which works similarly to "fminunc". Recall that these +# advanced optimizers are able to train our cost functions efficiently as +# long as we provide them with the gradient computations. +# +print 'Training Neural Network... ' + +# After you have completed the assignment, change the MaxIter to a larger +# value to see how more training helps. +# options = optimset('MaxIter', 50) + +# You should also try different values of lambda +Lambda = 1 + +costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda)[0] +gradFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda)[1] + +result = minimize(costFunc, initial_nn_params, method='CG', jac=gradFunc, options={'disp': True, 'maxiter': 50.0}) +nn_params = result.x +cost = result.fun + +# Obtain Theta1 and Theta2 back from nn_params +Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)], + (hidden_layer_size, input_layer_size + 1), order='F').copy() +Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):], + (num_labels, (hidden_layer_size + 1)), order='F').copy() + +raw_input("Program paused. Press Enter to continue...") + + +## ================= Part 9: Visualize Weights ================= +# You can now "visualize" what the neural network is learning by +# displaying the hidden units to see what features they are capturing in +# the data. + +print 'Visualizing Neural Network... ' + +displayData(Theta1[:, 1:]) + +raw_input("Program paused. Press Enter to continue...") + +## ================= Part 10: Implement Predict ================= +# After training the neural network, we would like to use it to predict +# the labels. You will now implement the "predict" function to use the +# neural network to predict the labels of the training set. This lets +# you compute the training set accuracy. + +pred = predict(Theta1, Theta2, X) + +accuracy = np.mean(np.double(pred == y)) * 100 +print 'Training Set Accuracy: %f\n'% accuracy + + +raw_input("Program paused. 
Press Enter to exit...") diff --git a/ex4/ex4data1.mat b/ex4/ex4data1.mat new file mode 100644 index 0000000..371bd0c Binary files /dev/null and b/ex4/ex4data1.mat differ diff --git a/ex4/ex4weights.mat b/ex4/ex4weights.mat new file mode 100644 index 0000000..ace2a09 Binary files /dev/null and b/ex4/ex4weights.mat differ diff --git a/ex4/nnCostFunction.py b/ex4/nnCostFunction.py new file mode 100644 index 0000000..91a60f4 --- /dev/null +++ b/ex4/nnCostFunction.py @@ -0,0 +1,74 @@ +import numpy as np + +from ex2.sigmoid import sigmoid +from sigmoidGradient import sigmoidGradient + + +def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda): + + """computes the cost and gradient of the neural network. The + parameters for the neural network are "unrolled" into the vector + nn_params and need to be converted back into the weight matrices. + + The returned parameter grad should be a "unrolled" vector of the + partial derivatives of the neural network. + """ + +# Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices +# for our 2 layer neural network +# Obtain Theta1 and Theta2 back from nn_params + Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)], + (hidden_layer_size, input_layer_size + 1), order='F').copy() + + Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):], + (num_labels, (hidden_layer_size + 1)), order='F').copy() + + + +# Setup some useful variables + m, _ = X.shape + + +# ====================== YOUR CODE HERE ====================== +# Instructions: You should complete the code by working through the +# following parts. +# +# Part 1: Feedforward the neural network and return the cost in the +# variable J. After implementing Part 1, you can verify that your +# cost function computation is correct by verifying the cost +# computed in ex4.m +# +# Part 2: Implement the backpropagation algorithm to compute the gradients +# Theta1_grad and Theta2_grad. You should return the partial derivatives of +# the cost function with respect to Theta1 and Theta2 in Theta1_grad and +# Theta2_grad, respectively. After implementing Part 2, you can check +# that your implementation is correct by running checkNNGradients +# +# Note: The vector y passed into the function is a vector of labels +# containing values from 1..K. You need to map this vector into a +# binary vector of 1's and 0's to be used with the neural network +# cost function. +# +# Hint: We recommend implementing backpropagation using a for-loop +# over the training examples if you are implementing it for the +# first time. +# +# Part 3: Implement regularization with the cost function and gradients. +# +# Hint: You can implement this around the code for +# backpropagation. That is, you can compute the gradients for +# the regularization separately and then add them to Theta1_grad +# and Theta2_grad from Part 2. 
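+#
+# Illustrative sketch (one possible vectorized implementation, not
+# necessarily the course's reference solution). It assumes y holds integer
+# labels in 1..num_labels and that sigmoidGradient is implemented; a1, z2,
+# a2, a3, Y, delta2 and delta3 are local helper names introduced here.
+#
+    # Feedforward pass
+    a1 = np.column_stack((np.ones(m), X))                # m x (input + 1)
+    z2 = a1.dot(Theta1.T)                                # m x hidden
+    a2 = np.column_stack((np.ones(m), sigmoid(z2)))      # m x (hidden + 1)
+    a3 = sigmoid(a2.dot(Theta2.T))                       # m x num_labels
+
+    # Map the 1..K labels in y onto one-hot rows
+    Y = np.eye(int(num_labels))[np.asarray(y, dtype=int) - 1]
+
+    # Cost, plus regularization over the non-bias weights
+    J = -np.sum(Y * np.log(a3) + (1 - Y) * np.log(1 - a3)) / m
+    J += Lambda / (2.0 * m) * (np.sum(Theta1[:, 1:] ** 2) +
+                               np.sum(Theta2[:, 1:] ** 2))
+
+    # Backpropagation, vectorized over all examples
+    delta3 = a3 - Y
+    delta2 = delta3.dot(Theta2[:, 1:]) * sigmoidGradient(z2)
+
+    Theta1_grad = delta2.T.dot(a1) / m
+    Theta2_grad = delta3.T.dot(a2) / m
+
+    # Regularize the gradients, leaving the bias column untouched
+    Theta1_grad[:, 1:] += (Lambda / float(m)) * Theta1[:, 1:]
+    Theta2_grad[:, 1:] += (Lambda / float(m)) * Theta2[:, 1:]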
+# + + + + # ------------------------------------------------------------- + + # ========================================================================= + + # Unroll gradient + grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel())) + + + return J, grad \ No newline at end of file diff --git a/ex4/randInitializeWeights.py b/ex4/randInitializeWeights.py new file mode 100644 index 0000000..e3a8c40 --- /dev/null +++ b/ex4/randInitializeWeights.py @@ -0,0 +1,21 @@ +import numpy as np + +def randInitializeWeights(L_in, L_out): + """randomly initializes the weights of a layer with L_in incoming connections and L_out outgoing + connections. + + Note that W should be set to a matrix of size(L_out, 1 + L_in) as the column row of W handles the "bias" terms + """ + + # ====================== YOUR CODE HERE ====================== + # Instructions: Initialize W randomly so that we break the symmetry while + # training the neural network. + # + # Note: The first row of W corresponds to the parameters for the bias units + # + + + +# ========================================================================= + + return W diff --git a/ex4/sigmoidGradient.py b/ex4/sigmoidGradient.py new file mode 100644 index 0000000..f3dd1c9 --- /dev/null +++ b/ex4/sigmoidGradient.py @@ -0,0 +1,16 @@ +from ex2.sigmoid import sigmoid + +def sigmoidGradient(z): + """computes the gradient of the sigmoid function + evaluated at z. This should work regardless if z is a matrix or a + vector. In particular, if z is a vector or matrix, you should return + the gradient for each element.""" + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the gradient of the sigmoid function evaluated at +# each value of z (z can be a matrix, vector or scalar). 
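+#
+# Illustrative sketch: for the sigmoid s(z), the gradient is s(z) * (1 - s(z)).
+# This assumes ex2.sigmoid operates elementwise on numpy arrays.
+    g = sigmoid(z) * (1.0 - sigmoid(z))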
+ + +# ============================================================= + + return g diff --git a/ex4/submit.py b/ex4/submit.py new file mode 100644 index 0000000..91b4f2b --- /dev/null +++ b/ex4/submit.py @@ -0,0 +1,54 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +homework = 'neural-network-learning' + +part_names = [ + 'Feedforward and Cost Function', + 'Regularized Cost Function', + 'Sigmoid Gradient', + 'Neural Network Gradient (Backpropagation)', + 'Regularized Gradient', + ] + +srcs = [ + 'nnCostFunction.py', + 'nnCostFunction.py', + 'sigmoidGradient.py', + 'nnCostFunction.py', + 'nnCostFunction.py', + ] + + +def output(part_id): + # Random Test Cases + X = np.reshape(3.0*np.sin(np.linspace(1, 30, 30)), (3, 10), order='F') + Xm = np.reshape(np.sin(np.linspace(1, 32, 32)), (16, 2), order='F')/5.0 + ym = np.array(1 + np.mod(range(1,17),4)) + t1 = np.sin(np.reshape(range(1,24,2), (4,3), order='F')) + t2 = np.cos(np.reshape(range(1,40,2), (4,5), order='F')) + t = np.hstack((t1.T.ravel(), t2.T.ravel())) + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + J, grad = func(t, 2.0, 4.0, 4.0, Xm, ym, 0.0) + return sprintf('%0.5f ', J) + elif part_id == 2: + J, grad = func(t, 2.0, 4.0, 4.0, Xm, ym, 1.5) + return sprintf('%0.5f ', J) + elif part_id == 3: + return sprintf('%0.5f ', func(X)) + elif part_id == 4: + J, grad = func(t, 2, 4, 4, Xm, ym, 0) + return sprintf('%0.5f ', np.hstack((J, grad)).tolist()) + elif part_id == 5: + J, grad = func(t, 2, 4, 4, Xm, ym, 1.5) + return sprintf('%0.5f ', np.hstack((J, grad)).tolist()) + +s = Submission(homework, part_names, srcs, output) +s.submit() diff --git a/ex5/ex5.py b/ex5/ex5.py new file mode 100644 index 0000000..6eb9789 --- /dev/null +++ b/ex5/ex5.py @@ -0,0 +1,221 @@ + +import scipy.io +import matplotlib.pyplot as plt +import numpy as np + +from linearRegCostFunction import linearRegCostFunction +from trainLinearReg import trainLinearReg +from learningCurve import learningCurve +from polyFeatures import polyFeatures +from featureNormalize import featureNormalize +from plotFit import plotFit +from validationCurve import validationCurve + +## Machine Learning Online Class +# Exercise 5 | Regularized Linear Regression and Bias-Variance +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# linearRegCostFunction.m +# learningCurve.m +# validationCurve.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# + +## =========== Part 1: Loading and Visualizing Data ============= +# We start the exercise by first loading and visualizing the dataset. +# The following code will load the dataset into your environment and plot +# the data. +# + +# Load Training Data +print 'Loading and Visualizing Data ...' 
+ +# Load from ex5data1: +# You will have X, y, Xval, yval, Xtest, ytest in your environment +data = scipy.io.loadmat('ex5data1.mat') + +# m = Number of examples +X = data['X'][:, 0] +y = data['y'][:, 0] +Xval = data['Xval'][:, 0] +yval = data['yval'][:, 0] +Xtest = data['Xtest'][:, 0] + +m = X.size + +# Plot training data +plt.scatter(X, y, marker='x', s=60, edgecolor='r', lw=1.5) +plt.ylabel('Water flowing out of the dam (y)') # Set the y-axis label +plt.xlabel('Change in water level (x)') # Set the x-axis label + +raw_input("Program paused. Press Enter to continue...") + +## =========== Part 2: Regularized Linear Regression Cost ============= +# You should now implement the cost function for regularized linear +# regression. +# + +theta = np.array([1, 1]) +J = linearRegCostFunction(np.column_stack((np.ones(m), X)), y, theta, 1)[0] + +print 'Cost at theta = [1 1]: %f \n(this value should be about 303.993192)\n' % J + +raw_input("Program paused. Press Enter to continue...") + +## =========== Part 3: Regularized Linear Regression Gradient ============= +# You should now implement the gradient for regularized linear +# regression. +# + +theta = np.array([1, 1]) +J, grad = linearRegCostFunction(np.column_stack((np.ones(m), X)), y, theta, 1) + +print 'Gradient at theta = [1 1]: [%f %f] \n(this value should be about [-15.303016 598.250744])\n' %(grad[0], grad[1]) + +raw_input("Program paused. Press Enter to continue...") + + +## =========== Part 4: Train Linear Regression ============= +# Once you have implemented the cost and gradient correctly, the +# trainLinearReg function will use your cost function to train +# regularized linear regression. +# +# Write Up Note: The data is non-linear, so this will not give a great +# fit. +# + +# Train linear regression with Lambda = 0 +Lambda = 0 +theta = trainLinearReg(np.column_stack((np.ones(m), X)), y, 1) + +# Plot fit over the data +plt.scatter(X, y, marker='x', s=20, edgecolor='r', lw=1.5) +plt.ylabel('Water flowing out of the dam (y)') # Set the y-axis label +plt.xlabel('Change in water level (x)') # Set the x-axis label +plt.plot(X, np.column_stack((np.ones(m), X)).dot(theta), '--', lw=2.0) + +raw_input("Program paused. Press Enter to continue...") + + +## =========== Part 5: Learning Curve for Linear Regression ============= +# Next, you should implement the learningCurve function. +# +# Write Up Note: Since the model is underfitting the data, we expect to +# see a graph with "high bias" -- slide 8 in ML-advice.pdf +# + +Lambda = 0 +error_train, error_val = learningCurve(np.column_stack((np.ones(m), X)), y, + np.column_stack((np.ones(Xval.shape[0]), Xval)), yval, Lambda) +plt.figure() +plt.plot(range(m), error_train, color='b', lw=0.5, label='Train') +plt.plot(range(m), error_val, color='r', lw=0.5, label='Cross Validation') +plt.title('Learning curve for linear regression') +plt.legend() +plt.xlabel('Number of training examples') +plt.ylabel('Error') + +plt.xlim(0, 13) +plt.ylim(0, 150) +plt.legend(loc='upper right', shadow=True, fontsize='x-large', numpoints=1) + +print 'Training Examples\tTrain Error\tCross Validation Error' +for i in range(m): + print ' \t%d\t\t%f\t%f' % (i, error_train[i], error_val[i]) + +raw_input("Program paused. Press Enter to continue...") + +## =========== Part 6: Feature Mapping for Polynomial Regression ============= +# One solution to this is to use polynomial regression. 
You should now +# complete polyFeatures to map each example into its powers +# + +p = 8 + +# Map X onto Polynomial Features and Normalize +X_poly = polyFeatures(X, p) +X_poly, mu, sigma = featureNormalize(X_poly) # Normalize +X_poly = np.column_stack((np.ones(m), X_poly)) # Add Ones + +# Map X_poly_test and normalize (using mu and sigma) +X_poly_test = polyFeatures(Xtest, p) +X_poly_test = X_poly_test - mu +X_poly_test = X_poly_test / sigma +X_poly_test = np.column_stack((np.ones(X_poly_test.shape[0]), X_poly_test)) # Add Ones + +# Map X_poly_val and normalize (using mu and sigma) +X_poly_val = polyFeatures(Xval, p) +X_poly_val = X_poly_val - mu +X_poly_val = X_poly_val / sigma +X_poly_val = np.column_stack((np.ones(X_poly_test.shape[0]), X_poly_val)) # Add Ones + +print 'Normalized Training Example 1:' +print X_poly[0, :] + +print '\nProgram paused. Press enter to continue.' + + + +## =========== Part 7: Learning Curve for Polynomial Regression ============= +# Now, you will get to experiment with polynomial regression with multiple +# values of Lambda. The code below runs polynomial regression with +# Lambda = 0. You should try running the code with different values of +# Lambda to see how the fit and learning curve change. +# + +Lambda = 0 +theta = trainLinearReg(X_poly, y, Lambda, method='BFGS', maxiter=10) + +# Plot training data and fit +plt.figure() +plt.scatter(X, y, marker='x', s=10, edgecolor='r', lw=1.5) + +plotFit(min(X), max(X), mu, sigma, theta, p) + +plt.xlabel('Change in water level (x)') # Set the y-axis label +plt.ylabel('Water flowing out of the dam (y)') # Set the x-axis label +# plt.plot(X, np.column_stack((np.ones(m), X)).dot(theta), marker='_', lw=2.0) +plt.title('Polynomial Regression Fit (Lambda = %f)' % Lambda) + +error_train, error_val = learningCurve(X_poly, y, X_poly_val, yval, Lambda) +plt.plot(range(m), error_train, label='Train') +plt.plot(range(m), error_val, label='Cross Validation') +plt.title('Polynomial Regression Learning Curve (Lambda = %f)' % Lambda) +plt.xlabel('Number of training examples') +plt.ylabel('Error') +plt.xlim(0, 13) +plt.ylim(0, 150) +plt.legend() + +print 'Polynomial Regression (Lambda = %f)\n\n' % Lambda +print '# Training Examples\tTrain Error\tCross Validation Error' +for i in range(m): + print ' \t%d\t\t%f\t%f' % (i, error_train[i], error_val[i]) + +raw_input("Program paused. Press Enter to continue...") + +## =========== Part 8: Validation for Selecting Lambda ============= +# You will now implement validationCurve to test various values of +# Lambda on a validation set. You will then use this to select the +# "best" Lambda value. +# + +Lambda_vec, error_train, error_val = validationCurve(X_poly, y, X_poly_val, yval) + +plt.plot(Lambda_vec, error_train, Lambda_vec, error_val) +plt.legend('Train', 'Cross Validation') +plt.xlabel('Lambda') +plt.ylabel('Error') + +print 'Lambda\t\tTrain Error\tValidation Error' +for i in range(Lambda_vec.size): + print ' %f\t%f\t%f' % (Lambda_vec[i], error_train[i], error_val[i]) + +raw_input("Program paused. 
Press Enter to continue...") diff --git a/ex5/ex5data1.mat b/ex5/ex5data1.mat new file mode 100644 index 0000000..5a17abd Binary files /dev/null and b/ex5/ex5data1.mat differ diff --git a/ex5/featureNormalize.py b/ex5/featureNormalize.py new file mode 100644 index 0000000..2e8c961 --- /dev/null +++ b/ex5/featureNormalize.py @@ -0,0 +1,18 @@ +import numpy as np + + +def featureNormalize(X): + """ returns a normalized version of X where + the mean value of each feature is 0 and the standard deviation + is 1. This is often a good preprocessing step to do when + working with learning algorithms. + """ + + mu = np.mean(X, axis=0) + X_norm = X - mu + + sigma = np.std(X_norm, axis=0, ddof=1) + X_norm = X_norm / sigma + +# ============================================================ + return X_norm, mu, sigma \ No newline at end of file diff --git a/ex5/learningCurve.py b/ex5/learningCurve.py new file mode 100644 index 0000000..19af125 --- /dev/null +++ b/ex5/learningCurve.py @@ -0,0 +1,63 @@ +import numpy as np + +from trainLinearReg import trainLinearReg +from linearRegCostFunction import linearRegCostFunction + +def learningCurve(X, y, Xval, yval, Lambda): + """returns the train and + cross validation set errors for a learning curve. In particular, + it returns two vectors of the same length - error_train and + error_val. Then, error_train(i) contains the training error for + i examples (and similarly for error_val(i)). + + In this function, you will compute the train and test errors for + dataset sizes from 1 up to m. In practice, when working with larger + datasets, you might want to do this in larger intervals. + """ + +# Number of training examples + m, _ = X.shape + +# You need to return these values correctly + error_train = np.zeros(m) + error_val = np.zeros(m) + +# ====================== YOUR CODE HERE ====================== +# Instructions: Fill in this function to return training errors in +# error_train and the cross validation errors in error_val. +# i.e., error_train(i) and +# error_val(i) should give you the errors +# obtained after training on i examples. +# +# Note: You should evaluate the training error on the first i training +# examples (i.e., X(1:i, :) and y(1:i)). +# +# For the cross-validation error, you should instead evaluate on +# the _entire_ cross validation set (Xval and yval). +# +# Note: If you are using your cost function (linearRegCostFunction) +# to compute the training and cross validation error, you should +# call the function with the lambda argument set to 0. +# Do note that you will still need to use lambda when running +# the training to obtain the theta parameters. +# +# Hint: You can loop over the examples with the following: +# +# for i = 1:m +# # Compute train/cross validation errors using training examples +# # X(1:i, :) and y(1:i), storing the result in +# # error_train(i) and error_val(i) +# .... 
+# +# end +# + +# ---------------------- Sample Solution ---------------------- + + + +# ------------------------------------------------------------------------- + +# ========================================================================= + + return error_train, error_val \ No newline at end of file diff --git a/ex5/linearRegCostFunction.py b/ex5/linearRegCostFunction.py new file mode 100644 index 0000000..32cd7b4 --- /dev/null +++ b/ex5/linearRegCostFunction.py @@ -0,0 +1,21 @@ +import numpy as np +def linearRegCostFunction(X, y, theta, Lambda): + """computes the + cost of using theta as the parameter for linear regression to fit the + data points in X and y. Returns the cost in J and the gradient in grad + """ +# Initialize some useful values + + m = y.size # number of training examples + +# ====================== YOUR CODE HERE ====================== +# Instructions: Compute the cost and gradient of regularized linear +# regression for a particular choice of theta. +# +# You should set J to the cost and grad to the gradient. +# + + +# ========================================================================= + + return J, grad \ No newline at end of file diff --git a/ex5/plotFit.py b/ex5/plotFit.py new file mode 100644 index 0000000..04ffc54 --- /dev/null +++ b/ex5/plotFit.py @@ -0,0 +1,25 @@ +import matplotlib.pyplot as plt +import numpy as np + +from polyFeatures import polyFeatures + +def plotFit(min_x, max_x, mu, sigma, theta, p): + """plots the learned polynomial fit with power p + and feature normalization (mu, sigma). + """ + +# We plot a range slightly bigger than the min and max values to get +# an idea of how the fit will vary outside the range of the data points + x = np.arange(min_x - 15, max_x + 25, 0.05).T + +# Map the X values + X_poly = polyFeatures(x, p) + X_poly = X_poly - mu + X_poly = X_poly / sigma + +# Add ones + X_poly = np.column_stack((np.ones(x.shape[0]), X_poly)) + +# Plot + plt.plot(x, X_poly.dot(theta), '--', lw=2) + diff --git a/ex5/polyFeatures.py b/ex5/polyFeatures.py new file mode 100644 index 0000000..35daac5 --- /dev/null +++ b/ex5/polyFeatures.py @@ -0,0 +1,21 @@ +import numpy as np + +def polyFeatures(X, p): + """takes a data matrix X (size m x 1) and + maps each example into its polynomial features where + X_poly(i, :) = [X(i) X(i).^2 X(i).^3 ... X(i).^p] + """ +# You need to return the following variables correctly. + X_poly = np.zeros((X.size, p)) + +# ====================== YOUR CODE HERE ====================== +# Instructions: Given a vector X, return a matrix X_poly where the p-th +# column of X contains the values of X to the p-th power. 
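+#
+# (A vectorized alternative to the loop below, assuming X is a 1-D numpy
+# array, would be: X_poly = X[:, None] ** np.arange(1, p + 1).)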
+# +# + for i in range(1, p+1): + X_poly[:, i-1] = X**i + +# ========================================================================= + + return X_poly \ No newline at end of file diff --git a/ex5/submit.py b/ex5/submit.py new file mode 100644 index 0000000..61e0f29 --- /dev/null +++ b/ex5/submit.py @@ -0,0 +1,58 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +homework = 'regularized-linear-regression-and-bias-variance' + +part_names = [ + 'Regularized Linear Regression Cost Function', + 'Regularized Linear Regression Gradient', + 'Learning Curve', + 'Polynomial Feature Mapping', + 'Validation Curve' + ] + +srcs = [ + 'linearRegCostFunction.py', + 'linearRegCostFunction.py', + 'learningCurve.py', + 'polyFeatures.py', + 'validationCurve.py' + ] + + +def output(part_id): + # Random Test Cases + X = np.column_stack((np.ones(10), + (np.sin(np.arange(1, 16, 1.5))), + (np.cos(np.arange(1, 16, 1.5))))) + y = np.sin(np.arange(1, 30, 3)) + + Xval = np.column_stack((np.ones(10), + (np.sin(np.arange(0, 14, 1.5))), + (np.cos(np.arange(0, 14, 1.5))))) + yval = np.sin(np.arange(1,11)) + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + J, _ = func(X, y, np.array([0.1, 0.2, 0.3]), 0.5) + return sprintf('%0.5f ', J) + elif part_id == 2: + _, grad = func(X, y, np.array([0.1, 0.2, 0.3]), 0.5) + return sprintf('%0.5f ', grad) + elif part_id == 3: + error_train, error_val = func(X, y, Xval, yval, 1) + return sprintf('%0.5f ', np.hstack((error_train, error_val))) + elif part_id == 4: + X_poly = func(X[1, :].T, 8) + return sprintf('%0.5f ', X_poly) + elif part_id == 5: + lambda_vec, error_train, error_val = func(X, y, Xval, yval) + return sprintf('%0.5f', np.hstack((lambda_vec, error_train, error_val))) + +s = Submission(homework, part_names, srcs, output) +s.submit() diff --git a/ex5/trainLinearReg.py b/ex5/trainLinearReg.py new file mode 100644 index 0000000..b9afbb0 --- /dev/null +++ b/ex5/trainLinearReg.py @@ -0,0 +1,24 @@ +from scipy.optimize import minimize + +import numpy as np + +from linearRegCostFunction import linearRegCostFunction + + +def trainLinearReg(X, y, Lambda, method='CG', maxiter=200): + + """trains linear regression using + the dataset (X, y) and regularization parameter lambda. Returns the + trained parameters theta. + """ + +# Initialize Theta + initial_theta = np.zeros(X.shape[1]) + +# Create "short hand" for the cost function to be minimized + costFunction = lambda t: linearRegCostFunction(X, y, t, Lambda)[0] + gradFunction = lambda t: linearRegCostFunction(X, y, t, Lambda)[1] + + result = minimize(costFunction, initial_theta, method=method, jac=None, options={'disp': True, 'maxiter': maxiter}) + + return result.x diff --git a/ex5/validationCurve.py b/ex5/validationCurve.py new file mode 100644 index 0000000..6295a6a --- /dev/null +++ b/ex5/validationCurve.py @@ -0,0 +1,47 @@ +import numpy as np + +from trainLinearReg import trainLinearReg +from linearRegCostFunction import linearRegCostFunction + +def validationCurve(X, y, Xval, yval): + """returns the train + and validation errors (in error_train, error_val) + for different values of lambda. You are given the training set (X, + y) and validation set (Xval, yval). + """ + +# Selected values of lambda (you should not change this) + lambda_vec = np.array([0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]) + +# You need to return these variables correctly. 
+ error_train = np.zeros(lambda_vec.size) + error_val = np.zeros(lambda_vec.size) + +# ====================== YOUR CODE HERE ====================== +# Instructions: Fill in this function to return training errors in +# error_train and the validation errors in error_val. The +# vector lambda_vec contains the different lambda parameters +# to use for each calculation of the errors, i.e, +# error_train(i), and error_val(i) should give +# you the errors obtained after training with +# lambda = lambda_vec(i) +# +# Note: You can loop over lambda_vec with the following: +# +# for i = 1:length(lambda_vec) +# lambda = lambda_vec(i) +# # Compute train / val errors when training linear +# # regression with regularization parameter lambda +# # You should store the result in error_train(i) +# # and error_val(i) +# .... +# +# end +# +# + + + +# ========================================================================= + + return lambda_vec, error_train, error_val \ No newline at end of file diff --git a/ex6/dataset3Params.py b/ex6/dataset3Params.py new file mode 100644 index 0000000..3a97a28 --- /dev/null +++ b/ex6/dataset3Params.py @@ -0,0 +1,29 @@ +import numpy as np +import sklearn.svm + + +def dataset3Params(X, y, Xval, yval): + """returns your choice of C and sigma. You should complete + this function to return the optimal C and sigma based on a + cross-validation set. + """ + +# You need to return the following variables correctly. + C = 1 + sigma = 0.3 + +# ====================== YOUR CODE HERE ====================== +# Instructions: Fill in this function to return the optimal C and sigma +# learning parameters found using the cross validation set. +# You can use svmPredict to predict the labels on the cross +# validation set. For example, +# predictions = svmPredict(model, Xval) +# will return the predictions on the cross validation set. +# +# Note: You can compute the prediction error using +# mean(double(predictions ~= yval)) +# + + +# ========================================================================= + return C, sigma diff --git a/ex6/emailFeatures.py b/ex6/emailFeatures.py new file mode 100644 index 0000000..e1eda6e --- /dev/null +++ b/ex6/emailFeatures.py @@ -0,0 +1,55 @@ +import numpy as np + + +def emailFeatures(word_indices): + """takes in a word_indices vector and + produces a feature vector from the word indices. + """ + +# Total number of words in the dictionary + n = 1899 + +# You need to return the following variables correctly. + x = np.zeros(n) +# ====================== YOUR CODE HERE ====================== +# Instructions: Fill in this function to return a feature vector for the +# given email (word_indices). To help make it easier to +# process the emails, we have have already pre-processed each +# email and converted each word in the email into an index in +# a fixed dictionary (of 1899 words). The variable +# word_indices contains the list of indices of the words +# which occur in one email. +# +# Concretely, if an email has the text: +# +# The quick brown fox jumped over the lazy dog. +# +# Then, the word_indices vector for this text might look +# like: +# +# 60 100 33 44 10 53 60 58 5 +# +# where, we have mapped each word onto a number, for example: +# +# the -- 60 +# quick -- 100 +# ... +# +# (note: the above numbers are just an example and are not the +# actual mappings). +# +# Your task is take one such word_indices vector and construct +# a binary feature vector that indicates whether a particular +# word occurs in the email. 
That is, x(i) = 1 when word i +# is present in the email. Concretely, if the word 'the' (say, +# index 60) appears in the email, then x(60) = 1. The feature +# vector should look like: +# +# x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..] +# +# + + +# ========================================================================= + + return x \ No newline at end of file diff --git a/ex6/emailSample1.txt b/ex6/emailSample1.txt new file mode 100644 index 0000000..eac52a3 --- /dev/null +++ b/ex6/emailSample1.txt @@ -0,0 +1,10 @@ +> Anyone knows how much it costs to host a web portal ? +> +Well, it depends on how many visitors you're expecting. +This can be anywhere from less than 10 bucks a month to a couple of $100. +You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 +if youre running something big.. + +To unsubscribe yourself from this mailing list, send an email to: +groupname-unsubscribe@egroups.com + diff --git a/ex6/emailSample2.txt b/ex6/emailSample2.txt new file mode 100644 index 0000000..e47acda --- /dev/null +++ b/ex6/emailSample2.txt @@ -0,0 +1,34 @@ +Folks, + +my first time posting - have a bit of Unix experience, but am new to Linux. + + +Just got a new PC at home - Dell box with Windows XP. Added a second hard disk +for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went +fine except it didn't pick up my monitor. + +I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4 +Ti4200 video card, both of which are probably too new to feature in Suse's default +set. I downloaded a driver from the nVidia website and installed it using RPM. +Then I ran Sax2 (as was recommended in some postings I found on the net), but +it still doesn't feature my video card in the available list. What next? + +Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice, +the whole machine crashes (in Linux, not Windows) - even the on/off switch is +inactive, leaving me to reach for the power cable instead. + +If anyone can help me in any way with these probs., I'd be really grateful - +I've searched the 'net but have run out of ideas. + +Or should I be going for a different version of Linux such as RedHat? Opinions +welcome. + +Thanks a lot, +Peter + +-- +Irish Linux Users' Group: ilug@linux.ie +http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information. +List maintainer: listmaster@linux.ie + + diff --git a/ex6/ex6.py b/ex6/ex6.py new file mode 100644 index 0000000..f332eb7 --- /dev/null +++ b/ex6/ex6.py @@ -0,0 +1,171 @@ +## Machine Learning Online Class +# Exercise 6 | Support Vector Machines +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# gaussianKernel.m +# dataset3Params.m +# processEmail.m +# emailFeatures.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# +from matplotlib import use, cm +use('TkAgg') +import numpy as np +import scipy.io +from sklearn import svm +from dataset3Params import dataset3Params +from plotData import plotData +from visualizeBoundary import visualizeBoundary +from visualizeBoundaryLinear import visualizeBoundaryLinear + +## =============== Part 1: Loading and Visualizing Data ================ +# We start the exercise by first loading and visualizing the dataset. +# The following code will load the dataset into your environment and plot +# the data. 
+# + +print 'Loading and Visualizing Data ...' + +# Load from ex6data1: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data1.mat') +X = data['X'] +y = data['y'].flatten() + +# Plot training data +plotData(X, y) + +raw_input("Program paused. Press Enter to continue...") + +## ==================== Part 2: Training Linear SVM ==================== +# The following code will train a linear SVM on the dataset and plot the +# decision boundary learned. +# + +# Load from ex6data1: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data1.mat') +X = data['X'] +y = data['y'].flatten() + +print 'Training Linear SVM ...' + +# You should try to change the C value below and see how the decision +# boundary varies (e.g., try C = 1000) + +C = 1 +clf = svm.SVC(C=C, kernel='linear', tol=1e-3, max_iter=20) +model = clf.fit(X, y) +visualizeBoundaryLinear(X, y, model) + +raw_input("Program paused. Press Enter to continue...") + +## =============== Part 3: Implementing Gaussian Kernel =============== +# You will now implement the Gaussian kernel to use +# with the SVM. You should complete the code in gaussianKernel.m +# +print 'Evaluating the Gaussian Kernel ...' + +x1 = np.array([1, 2, 1]) +x2 = np.array([0, 4, -1]) +sigma = 2 +# sim = gaussianKernel(x1, x2, sigma) +# +# print 'Gaussian Kernel between x1 = [1 2 1], x2 = [0 4 -1], sigma = %0.5f : ' \ +# '\t%f\n(this value should be about 0.324652)\n' % (sigma, sim) + +raw_input("Program paused. Press Enter to continue...") + +## =============== Part 4: Visualizing Dataset 2 ================ +# The following code will load the next dataset into your environment and +# plot the data. +# + +print 'Loading and Visualizing Data ...' + +# Load from ex6data2: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data2.mat') +X = data['X'] +y = data['y'].flatten() + +# Plot training data +plotData(X, y) + +raw_input("Program paused. Press Enter to continue...") + +## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ========== +# After you have implemented the kernel, we can now use it to train the +# SVM classifier. +# +print 'Training SVM with RBF Kernel (this may take 1 to 2 minutes) ...' + +# Load from ex6data2: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data2.mat') +X = data['X'] +y = data['y'].flatten() + +# SVM Parameters +C = 1 +sigma = 0.1 +gamma = 1.0 / (2.0 * sigma ** 2) + +# We set the tolerance and max_passes lower here so that the code will run +# faster. However, in practice, you will want to run the training to +# convergence. + +clf = svm.SVC(C=C, kernel='rbf', tol=1e-3, max_iter=200, gamma=gamma) +model = clf.fit(X, y) +visualizeBoundary(X, y, model) + +raw_input("Program paused. Press Enter to continue...") + +## =============== Part 6: Visualizing Dataset 3 ================ +# The following code will load the next dataset into your environment and +# plot the data. +# + +print 'Loading and Visualizing Data ...' + +# Load from ex6data3: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data3.mat') +X = data['X'] +y = data['y'].flatten() + +# Plot training data +plotData(X, y) + +raw_input("Program paused. Press Enter to continue...") + +## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ========== + +# This is a different dataset that you can use to experiment with. Try +# different values of C and sigma here. 
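+#
+# Illustrative sketch of what dataset3Params might do internally (one
+# possible approach: pick the pair with the lowest cross-validation error;
+# the names C_try, sigma_try and best_err are hypothetical):
+#
+#     best_err = np.inf
+#     for C_try in [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]:
+#         for sigma_try in [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]:
+#             clf = sklearn.svm.SVC(C=C_try, kernel='rbf',
+#                                   gamma=1.0 / (2.0 * sigma_try ** 2))
+#             err = np.mean(clf.fit(X, y).predict(Xval) != yval)
+#             if err < best_err:
+#                 best_err, C, sigma = err, C_try, sigma_try
+#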
+# + +# Load from ex6data3: +# You will have X, y in your environment +data = scipy.io.loadmat('ex6data3.mat') +Xval = data['Xval'] +yval = data['yval'].flatten() + +# Try different SVM Parameters here +C, sigma = dataset3Params(X, y, Xval, yval) +gamma = 1.0 / (2.0 * sigma ** 2) +# Train the SVM + +clf = svm.SVC(C=C, kernel='rbf', tol=1e-3, max_iter=200, gamma=gamma) +model = clf.fit(X, y) +visualizeBoundary(X, y, model) + +raw_input("Program paused. Press Enter to continue...") + diff --git a/ex6/ex6_spam.py b/ex6/ex6_spam.py new file mode 100644 index 0000000..7c1ee6b --- /dev/null +++ b/ex6/ex6_spam.py @@ -0,0 +1,149 @@ +## Machine Learning Online Class +# Exercise 6 | Spam Classification with SVMs +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# gaussianKernel.m +# dataset3Params.m +# processEmail.m +# emailFeatures.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# +import numpy as np +import scipy.io +from sklearn import svm +from collections import OrderedDict + +from processEmail import processEmail +from emailFeatures import emailFeatures +from getVocabList import getVocabList + +## ==================== Part 1: Email Preprocessing ==================== +# To use an SVM to classify emails into Spam v.s. Non-Spam, you first need +# to convert each email into a vector of features. In this part, you will +# implement the preprocessing steps for each email. You should +# complete the code in processEmail.m to produce a word indices vector +# for a given email. + +print 'Preprocessing sample email (emailSample1.txt)' + +# Extract Features +file = open('emailSample1.txt', 'r') +file_contents = file.readlines() +word_indices = processEmail(''.join(file_contents)) + +# Print Stats +print 'Word Indices: ' +print word_indices + +raw_input("Program paused. Press Enter to continue...") + +## ==================== Part 2: Feature Extraction ==================== +# Now, you will convert each email into a vector of features in R^n. +# You should complete the code in emailFeatures.m to produce a feature +# vector for a given email. + +print 'Extracting features from sample email (emailSample1.txt)' + +# Extract Features +file = open('emailSample1.txt') +file_contents = file.readlines() +word_indices = processEmail(''.join(file_contents)) +features = emailFeatures(word_indices) + +# Print Stats +print 'Length of feature vector: %d'% features.size +print 'Number of non-zero entries: %d'% sum(features > 0) + +raw_input("Program paused. Press Enter to continue...") + +## =========== Part 3: Train Linear SVM for Spam Classification ======== +# In this section, you will train a linear classifier to determine if an +# email is Spam or Not-Spam. + +# Load the Spam Email dataset +# You will have X, y in your environment +data = scipy.io.loadmat('spamTrain.mat') +X = data['X'] +y = data['y'].flatten() + +print 'Training Linear SVM (Spam Classification)' +print '(this may take 1 to 2 minutes) ...' + +C = 0.1 +clf = svm.SVC(C=C, kernel='linear', tol=1e-3, max_iter=200) +model = clf.fit(X, y) + +p = model.predict(X) + +print 'Training Accuracy: %f', np.mean(np.double(p == y)) * 100 + +## =================== Part 4: Test Spam Classification ================ +# After training the classifier, we can evaluate it on a test set. 
We have +# included a test set in spamTest.mat + +# Load the test dataset +# You will have Xtest, ytest in your environment +data = scipy.io.loadmat('spamTest.mat') +Xtest = data['Xtest'] +ytest = data['ytest'] + +print 'Evaluating the trained Linear SVM on a test set ...' + +p = model.predict(Xtest) + +print 'Test Accuracy: %f', np. mean(np.double(p == ytest)) * 100 + + +## ================= Part 5: Top Predictors of Spam ==================== +# Since the model we are training is a linear SVM, we can inspect the +# weights learned by the model to understand better how it is determining +# whether an email is spam or not. The following code finds the words with +# the highest weights in the classifier. Informally, the classifier +# 'thinks' that these words are the most likely indicators of spam. +# + +# Sort the weights and obtain the vocabulary list + +t = sorted(list(enumerate(model.coef_[0])),key=lambda e: e[1], reverse=True) +d = OrderedDict(t) +idx = d.keys() +weight = d.values() +vocabList = getVocabList() + +print 'Top predictors of spam: ' +for i in range(15): + print ' %-15s (%f)' %(vocabList[idx[i]], weight[i]) + +print 'Program paused. Press enter to continue.' + +## =================== Part 6: Try Your Own Emails ===================== +# Now that you've trained the spam classifier, you can use it on your own +# emails! In the starter code, we have included spamSample1.txt, +# spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. +# The following code reads in one of these emails and then uses your +# learned SVM classifier to determine whether the email is Spam or +# Not Spam + +# Set the file to be read in (change this to spamSample2.txt, +# emailSample1.txt or emailSample2.txt to see different predictions on +# different emails types). Try your own emails as well! +filename = 'spamSample1.txt' + +# Read and predict + +file = open(filename) +file_contents = file.readlines() +word_indices = processEmail(''.join(file_contents)) +x = emailFeatures(word_indices) +p = model.predict(x) + +print 'Processed %s\n\nSpam Classification: %d' % (filename, p) +print '(1 indicates spam, 0 indicates not spam)' + diff --git a/ex6/ex6data1.mat b/ex6/ex6data1.mat new file mode 100644 index 0000000..ae0d2aa Binary files /dev/null and b/ex6/ex6data1.mat differ diff --git a/ex6/ex6data2.mat b/ex6/ex6data2.mat new file mode 100644 index 0000000..c6ad661 Binary files /dev/null and b/ex6/ex6data2.mat differ diff --git a/ex6/ex6data3.mat b/ex6/ex6data3.mat new file mode 100644 index 0000000..a0441ac Binary files /dev/null and b/ex6/ex6data3.mat differ diff --git a/ex6/gaussianKernel.py b/ex6/gaussianKernel.py new file mode 100644 index 0000000..1c943b1 --- /dev/null +++ b/ex6/gaussianKernel.py @@ -0,0 +1,25 @@ +import numpy as np + + +def gaussianKernel(x1, x2, sigma): + """returns a gaussian kernel between x1 and x2 + and returns the value in sim + """ + +# Ensure that x1 and x2 are column vectors +# x1 = x1.ravel() +# x2 = x2.ravel() + +# You need to return the following variables correctly. 
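+# An illustrative sketch of the Gaussian (RBF) kernel, for reference:
+#
+#     sim = np.exp(-np.sum((x1 - x2) ** 2) / (2.0 * sigma ** 2))
+#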
+ sim = 0 + +# ====================== YOUR CODE HERE ====================== +# Instructions: Fill in this function to return the similarity between x1 +# and x2 computed using a Gaussian kernel with bandwidth +# sigma +# +# + + +# ============================================================= + return sim \ No newline at end of file diff --git a/ex6/getVocabList.py b/ex6/getVocabList.py new file mode 100644 index 0000000..4fb9859 --- /dev/null +++ b/ex6/getVocabList.py @@ -0,0 +1,22 @@ +import numpy as np + + +def getVocabList(): + + """reads the fixed vocabulary list in vocab.txt + and returns a cell array of the words in vocabList. + """ + +## Read the fixed vocabulary list + with open('vocab.txt') as f: + +# Store all dictionary words in cell array vocab{} + +# For ease of implementation, we use a struct to map the strings => integers +# In practice, you'll want to use some form of hashmap + vocabList = [] + for line in f: + idx, w = line.split() + vocabList.append(w) + + return vocabList diff --git a/ex6/linearKernel.py b/ex6/linearKernel.py new file mode 100644 index 0000000..5d9f827 --- /dev/null +++ b/ex6/linearKernel.py @@ -0,0 +1,13 @@ +def linearKernel(x1, x2): + """returns a linear kernel between x1 and x2 + and returns the value in sim + """ + +# Ensure that x1 and x2 are column vectors + x1 = x1.ravel() + x2 = x2.ravel() + +# Compute the kernel + sim = x1.T.dot(x2) # dot product + + return sim diff --git a/ex6/plotData.py b/ex6/plotData.py new file mode 100644 index 0000000..e4f869d --- /dev/null +++ b/ex6/plotData.py @@ -0,0 +1,20 @@ +import matplotlib.pyplot as plt +import numpy as np +from show import show +def plotData(X, y): + """plots the data points with + for the positive examples + and o for the negative examples. X is assumed to be a Mx2 matrix. + + Note: This was slightly modified such that it expects y = 1 or y = 0 + """ + plt.figure() + +# Find Indices of Positive and Negative Examples + pos = np.where(y==1, True, False).flatten() + neg = np.where(y==0, True, False).flatten() + +# Plot Examples + plt.plot(X[pos,0], X[pos, 1], 'k+', linewidth=1, markersize=7) + plt.plot(X[neg,0], X[neg, 1], 'ko', color='y', markersize=7) + show() + diff --git a/ex6/porterStemmer.py b/ex6/porterStemmer.py new file mode 100644 index 0000000..ac3f722 --- /dev/null +++ b/ex6/porterStemmer.py @@ -0,0 +1,347 @@ +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. 
The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. + """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j-1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k-length+1:self.k+1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. 
+ + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): self.setto("ate") + elif self.ends("bl"): self.setto("ble") + elif self.ends("iz"): self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] + + def step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. + """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): self.r("ate") + elif self.ends("tional"): self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): self.r("ence") + elif self.ends("anci"): self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): self.r("al") + elif self.ends("entli"): self.r("ent") + elif self.ends("eli"): self.r("e") + elif self.ends("ousli"): self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): self.r("ize") + elif self.ends("ation"): self.r("ate") + elif self.ends("ator"): self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): self.r("al") + elif self.ends("iveness"): self.r("ive") + elif self.ends("fulness"): self.r("ful") + elif self.ends("ousness"): self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): self.r("al") + elif self.ends("iviti"): self.r("ive") + elif self.ends("biliti"): self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() dels with -ic-, -full, -ness etc. 
similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): self.r("ic") + elif self.ends("ative"): self.r("") + elif self.ends("alize"): self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): self.r("ic") + elif self.ends("ful"): self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): pass + else: return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): pass + elif self.ends("ence"): pass + else: return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): pass + else: return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): pass + else: return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): pass + elif self.ends("ible"): pass + else: return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): pass + elif self.ends("ement"): pass + elif self.ends("ment"): pass + elif self.ends("ent"): pass + else: return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass + elif self.ends("ou"): pass + # takes care of -ous + else: return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): pass + else: return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): pass + elif self.ends("iti"): pass + else: return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): pass + else: return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): pass + else: return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): pass + else: return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. + """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k-1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k -1 + + def stem(self, p, i, j): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. 
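        # Usage sketch (based on the examples in the step1ab docstring above):
        # porterStemmer('meetings') is expected to return 'meet', and
        # porterStemmer('caresses') to return 'caress'.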
+ + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k+1] + + +def porterStemmer(word): + p = PorterStemmer() + return p.stem(word, 0, len(word)-1) diff --git a/ex6/processEmail.py b/ex6/processEmail.py new file mode 100644 index 0000000..8b35d90 --- /dev/null +++ b/ex6/processEmail.py @@ -0,0 +1,124 @@ +from string import lower +from porterStemmer import porterStemmer +from getVocabList import getVocabList +import re + +def processEmail(email_contents): + """preprocesses a the body of an email and + returns a list of word_indices + word_indices = PROCESSEMAIL(email_contents) preprocesses + the body of an email and returns a list of indices of the + words contained in the email. + """ + +# Load Vocabulary + vocabList = getVocabList() + +# Init return value + word_indices = [] + +# ========================== Preprocess Email =========================== + +# Find the Headers ( \n\n and remove ) +# Uncomment the following lines if you are working with raw emails with the +# full headers + +# hdrstart = strfind(email_contents, ([chr(10) chr(10)])) +# email_contents = email_contents(hdrstart(1):end) + +# Lower case + email_contents = lower(email_contents) + +# Strip all HTML +# Looks for any expression that starts with < and ends with > and replace +# and does not have any < or > in the tag it with a space + rx = re.compile('<[^<>]+>|\n') + email_contents = rx.sub(' ', email_contents) +# Handle Numbers +# Look for one or more characters between 0-9 + rx = re.compile('[0-9]+') + email_contents = rx.sub('number ', email_contents) + +# Handle URLS +# Look for strings starting with http:// or https:// + rx = re.compile('(http|https)://[^\s]*') + email_contents = rx.sub('httpaddr ', email_contents) + +# Handle Email Addresses +# Look for strings with @ in the middle + rx = re.compile('[^\s]+@[^\s]+') + email_contents = rx.sub('emailaddr ', email_contents) + +# Handle $ sign + rx = re.compile('[$]+') + email_contents = rx.sub('dollar ', email_contents) + +# ========================== Tokenize Email =========================== + +# Output the email to screen as well + print '==== Processed Email ====\n' + +# Process file + l = 0 + +# Remove any non alphanumeric characters + rx = re.compile('[^a-zA-Z0-9 ]') + email_contents = rx.sub('', email_contents).split() + + for str in email_contents: + + # Tokenize and also get rid of any punctuation + # str = re.split('[' + re.escape(' @$/#.-:&*+=[]?!(){},''">_<#') + # + chr(10) + chr(13) + ']', str) + + # Stem the word + # (the porterStemmer sometimes has issues, so we use a try catch block) + try: + str = porterStemmer(str.strip()) + except: + str = '' + continue + + # Skip the word if it is too short + if len(str) < 1: + continue + + # Look up the word in the dictionary and add to word_indices if + # found + # ====================== YOUR CODE HERE ====================== + # Instructions: Fill in this function to add the index of str to + # word_indices if it is in the vocabulary. At this point + # of the code, you have a stemmed word from the email in + # the variable str. You should look up str in the + # vocabulary list (vocabList). If a match exists, you + # should add the index of the word to the word_indices + # vector. Concretely, if str = 'action', then you should + # look up the vocabulary list to find where in vocabList + # 'action' appears. 
For example, if vocabList{18} = + # 'action', then, you should add 18 to the word_indices + # vector (e.g., word_indices = [word_indices 18] ). + # + # Note: vocabList{idx} returns a the word with index idx in the + # vocabulary list. + # + # Note: You can use strcmp(str1, str2) to compare two strings (str1 and + # str2). It will return 1 only if the two strings are equivalent. + # + + + + + # ============================================================= + + # Print to screen, ensuring that the output lines are not too long + if (l + len(str) + 1) > 78: + print str + l = 0 + else: + print str, + l = l + len(str) + 1 + +# Print footer + print '\n=========================' + return word_indices + diff --git a/ex6/spamSample1.txt b/ex6/spamSample1.txt new file mode 100644 index 0000000..bab0ca2 --- /dev/null +++ b/ex6/spamSample1.txt @@ -0,0 +1,42 @@ +Do You Want To Make $1000 Or More Per Week? + + + +If you are a motivated and qualified individual - I +will personally demonstrate to you a system that will +make you $1,000 per week or more! This is NOT mlm. + + + +Call our 24 hour pre-recorded number to get the +details. + + + +000-456-789 + + + +I need people who want to make serious money. Make +the call and get the facts. + +Invest 2 minutes in yourself now! + + + +000-456-789 + + + +Looking forward to your call and I will introduce you +to people like yourself who +are currently making $10,000 plus per week! + + + +000-456-789 + + + +3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 + diff --git a/ex6/spamSample2.txt b/ex6/spamSample2.txt new file mode 100644 index 0000000..f8e8fce --- /dev/null +++ b/ex6/spamSample2.txt @@ -0,0 +1,8 @@ +Best Buy Viagra Generic Online + +Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! + +We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 
+http://medphysitcstech.ru + + diff --git a/ex6/spamTest.mat b/ex6/spamTest.mat new file mode 100644 index 0000000..b7bf953 Binary files /dev/null and b/ex6/spamTest.mat differ diff --git a/ex6/spamTrain.mat b/ex6/spamTrain.mat new file mode 100644 index 0000000..1b9c81f Binary files /dev/null and b/ex6/spamTrain.mat differ diff --git a/ex6/submit.py b/ex6/submit.py new file mode 100644 index 0000000..a07d8c7 --- /dev/null +++ b/ex6/submit.py @@ -0,0 +1,56 @@ +import numpy as np +import scipy.io + +from Submission import Submission +from Submission import sprintf + +homework = 'support-vector-machines' + +part_names = [ + 'Gaussian Kernel', + 'Parameters (C, sigma) for Dataset 3', + 'Email Preprocessing', + 'Email Feature Extraction', + ] + +srcs = [ + 'gaussianKernel.py', + 'dataset3Params.py', + 'processEmail.py', + 'emailFeatures.py', + ] + + +def output(part_id): + # Random Test Cases + x1 = np.sin(np.arange(1,11)) + x2 = np.cos(np.arange(1,11)) + ec = 'the quick brown fox jumped over the lazy dog' + wi = np.abs(np.round(x1 * 1863)) + wi = np.hstack((wi, wi)) + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + sim = func(x1, x2, 2) + return sprintf('%0.5f ', sim) + elif part_id == 2: + data = scipy.io.loadmat('ex6data3.mat') + X = data['X'] + y = data['y'].flatten() + Xval = data['Xval'] + yval = data['yval'].flatten() + C, sigma = func(X, y, Xval, yval) + return sprintf('%0.5f ', np.hstack((C, sigma))) + elif part_id == 3: + word_indices = np.array(func(ec)) + return sprintf('%d ', (word_indices + 1).tolist()) + elif part_id == 4: + x = func(wi) + return sprintf('%d', x) + +s = Submission(homework, part_names, srcs, output) +s.submit() + diff --git a/ex6/visualizeBoundary.py b/ex6/visualizeBoundary.py new file mode 100644 index 0000000..270b880 --- /dev/null +++ b/ex6/visualizeBoundary.py @@ -0,0 +1,24 @@ +import numpy as np +from plotData import plotData +from matplotlib import pyplot as plt + +def visualizeBoundary(X, y, model): + """plots a non-linear decision boundary learned by the + SVM and overlays the data on it""" + +# Plot the training data on top of the boundary + plotData(X, y) + + # Make classification predictions over a grid of values + x1plot = np.linspace(min(X[:,0]), max(X[:,0]), X.shape[0]).T + x2plot = np.linspace(min(X[:,1]), max(X[:,1]), X.shape[0]).T + X1, X2 = np.meshgrid(x1plot, x2plot) + vals = np.zeros(X1.shape) + + for i in range(X1.shape[1]): + this_X = np.column_stack((X1[:, i], X2[:, i])) + vals[:, i] = model.predict(this_X) + + # Plot the SVM boundary + #contour(X1, X2, vals, [0 0], 'Color', 'b') + plt.contour(X1, X2, vals, levels=[0.0, 0.0]) diff --git a/ex6/visualizeBoundaryLinear.py b/ex6/visualizeBoundaryLinear.py new file mode 100644 index 0000000..cdb96ad --- /dev/null +++ b/ex6/visualizeBoundaryLinear.py @@ -0,0 +1,17 @@ +import matplotlib.pyplot as plt +import numpy as np +from plotData import plotData + + +def visualizeBoundaryLinear(X, y, model): + """plots a linear decision boundary + learned by the SVM and overlays the data on it + """ + + w = model.coef_.flatten() + b = model.intercept_.flatten() + xp = np.linspace(min(X[:, 0]), max(X[:, 0]), 100) + yp = -(w[0]*xp + b)/w[1] + plotData(X, y) + plt.plot(xp, yp, '-b') + diff --git a/ex6/vocab.txt b/ex6/vocab.txt new file mode 100644 index 0000000..27f64a3 --- /dev/null +++ b/ex6/vocab.txt @@ -0,0 +1,1899 @@ +1 aa +2 ab +3 abil +4 abl +5 about +6 abov +7 absolut +8 abus +9 ac +10 accept +11 access 
+12 accord +13 account +14 achiev +15 acquir +16 across +17 act +18 action +19 activ +20 actual +21 ad +22 adam +23 add +24 addit +25 address +26 administr +27 adult +28 advanc +29 advantag +30 advertis +31 advic +32 advis +33 ae +34 af +35 affect +36 affili +37 afford +38 africa +39 after +40 ag +41 again +42 against +43 agenc +44 agent +45 ago +46 agre +47 agreement +48 aid +49 air +50 al +51 alb +52 align +53 all +54 allow +55 almost +56 alon +57 along +58 alreadi +59 alsa +60 also +61 altern +62 although +63 alwai +64 am +65 amaz +66 america +67 american +68 among +69 amount +70 amp +71 an +72 analysi +73 analyst +74 and +75 ani +76 anim +77 announc +78 annual +79 annuiti +80 anoth +81 answer +82 anti +83 anumb +84 anybodi +85 anymor +86 anyon +87 anyth +88 anywai +89 anywher +90 aol +91 ap +92 apolog +93 app +94 appar +95 appear +96 appl +97 appli +98 applic +99 appreci +100 approach +101 approv +102 apt +103 ar +104 archiv +105 area +106 aren +107 argument +108 arial +109 arm +110 around +111 arrai +112 arriv +113 art +114 articl +115 artist +116 as +117 ascii +118 ask +119 asset +120 assist +121 associ +122 assum +123 assur +124 at +125 atol +126 attach +127 attack +128 attempt +129 attent +130 attornei +131 attract +132 audio +133 aug +134 august +135 author +136 auto +137 autom +138 automat +139 avail +140 averag +141 avoid +142 awai +143 awar +144 award +145 ba +146 babi +147 back +148 background +149 backup +150 bad +151 balanc +152 ban +153 bank +154 bar +155 base +156 basenumb +157 basi +158 basic +159 bb +160 bc +161 bd +162 be +163 beat +164 beberg +165 becaus +166 becom +167 been +168 befor +169 begin +170 behalf +171 behavior +172 behind +173 believ +174 below +175 benefit +176 best +177 beta +178 better +179 between +180 bf +181 big +182 bill +183 billion +184 bin +185 binari +186 bit +187 black +188 blank +189 block +190 blog +191 blood +192 blue +193 bnumber +194 board +195 bodi +196 boi +197 bonu +198 book +199 boot +200 border +201 boss +202 boston +203 botan +204 both +205 bottl +206 bottom +207 boundari +208 box +209 brain +210 brand +211 break +212 brian +213 bring +214 broadcast +215 broker +216 browser +217 bug +218 bui +219 build +220 built +221 bulk +222 burn +223 bush +224 busi +225 but +226 button +227 by +228 byte +229 ca +230 cabl +231 cach +232 calcul +233 california +234 call +235 came +236 camera +237 campaign +238 can +239 canada +240 cannot +241 canon +242 capabl +243 capillari +244 capit +245 car +246 card +247 care +248 career +249 carri +250 cartridg +251 case +252 cash +253 cat +254 catch +255 categori +256 caus +257 cb +258 cc +259 cd +260 ce +261 cell +262 cent +263 center +264 central +265 centuri +266 ceo +267 certain +268 certainli +269 cf +270 challeng +271 chanc +272 chang +273 channel +274 char +275 charact +276 charg +277 charset +278 chat +279 cheap +280 check +281 cheer +282 chief +283 children +284 china +285 chip +286 choic +287 choos +288 chri +289 citi +290 citizen +291 civil +292 claim +293 class +294 classifi +295 clean +296 clear +297 clearli +298 click +299 client +300 close +301 clue +302 cnet +303 cnumber +304 co +305 code +306 collect +307 colleg +308 color +309 com +310 combin +311 come +312 comfort +313 command +314 comment +315 commentari +316 commerci +317 commiss +318 commit +319 common +320 commun +321 compani +322 compar +323 comparison +324 compat +325 compet +326 competit +327 compil +328 complet +329 comprehens +330 comput +331 concentr +332 concept +333 concern +334 condit +335 conf +336 confer +337 confid +338 
confidenti +339 config +340 configur +341 confirm +342 conflict +343 confus +344 congress +345 connect +346 consid +347 consolid +348 constitut +349 construct +350 consult +351 consum +352 contact +353 contain +354 content +355 continu +356 contract +357 contribut +358 control +359 conveni +360 convers +361 convert +362 cool +363 cooper +364 copi +365 copyright +366 core +367 corpor +368 correct +369 correspond +370 cost +371 could +372 couldn +373 count +374 countri +375 coupl +376 cours +377 court +378 cover +379 coverag +380 crash +381 creat +382 creativ +383 credit +384 critic +385 cross +386 cultur +387 current +388 custom +389 cut +390 cv +391 da +392 dagga +393 dai +394 daili +395 dan +396 danger +397 dark +398 data +399 databas +400 datapow +401 date +402 dave +403 david +404 dc +405 de +406 dead +407 deal +408 dear +409 death +410 debt +411 decad +412 decid +413 decis +414 declar +415 declin +416 decor +417 default +418 defend +419 defens +420 defin +421 definit +422 degre +423 delai +424 delet +425 deliv +426 deliveri +427 dell +428 demand +429 democrat +430 depart +431 depend +432 deposit +433 describ +434 descript +435 deserv +436 design +437 desir +438 desktop +439 despit +440 detail +441 detect +442 determin +443 dev +444 devel +445 develop +446 devic +447 di +448 dial +449 did +450 didn +451 diet +452 differ +453 difficult +454 digit +455 direct +456 directli +457 director +458 directori +459 disabl +460 discount +461 discov +462 discoveri +463 discuss +464 disk +465 displai +466 disposit +467 distanc +468 distribut +469 dn +470 dnumber +471 do +472 doc +473 document +474 doe +475 doer +476 doesn +477 dollar +478 dollarac +479 dollarnumb +480 domain +481 don +482 done +483 dont +484 doubl +485 doubt +486 down +487 download +488 dr +489 draw +490 dream +491 drive +492 driver +493 drop +494 drug +495 due +496 dure +497 dvd +498 dw +499 dynam +500 ea +501 each +502 earli +503 earlier +504 earn +505 earth +506 easi +507 easier +508 easili +509 eat +510 eb +511 ebai +512 ec +513 echo +514 econom +515 economi +516 ed +517 edg +518 edit +519 editor +520 educ +521 eff +522 effect +523 effici +524 effort +525 either +526 el +527 electron +528 elimin +529 els +530 email +531 emailaddr +532 emerg +533 empir +534 employ +535 employe +536 en +537 enabl +538 encod +539 encourag +540 end +541 enemi +542 enenkio +543 energi +544 engin +545 english +546 enhanc +547 enjoi +548 enough +549 ensur +550 enter +551 enterpris +552 entertain +553 entir +554 entri +555 enumb +556 environ +557 equal +558 equip +559 equival +560 error +561 especi +562 essenti +563 establish +564 estat +565 estim +566 et +567 etc +568 euro +569 europ +570 european +571 even +572 event +573 eventu +574 ever +575 everi +576 everyon +577 everyth +578 evid +579 evil +580 exactli +581 exampl +582 excel +583 except +584 exchang +585 excit +586 exclus +587 execut +588 exercis +589 exist +590 exmh +591 expand +592 expect +593 expens +594 experi +595 expert +596 expir +597 explain +598 explor +599 express +600 extend +601 extens +602 extra +603 extract +604 extrem +605 ey +606 fa +607 face +608 fact +609 factor +610 fail +611 fair +612 fall +613 fals +614 famili +615 faq +616 far +617 fast +618 faster +619 fastest +620 fat +621 father +622 favorit +623 fax +624 fb +625 fd +626 featur +627 feder +628 fee +629 feed +630 feedback +631 feel +632 femal +633 few +634 ffffff +635 ffnumber +636 field +637 fight +638 figur +639 file +640 fill +641 film +642 filter +643 final +644 financ +645 financi +646 find +647 fine +648 finish +649 
fire +650 firewal +651 firm +652 first +653 fit +654 five +655 fix +656 flag +657 flash +658 flow +659 fnumber +660 focu +661 folder +662 folk +663 follow +664 font +665 food +666 for +667 forc +668 foreign +669 forev +670 forget +671 fork +672 form +673 format +674 former +675 fortun +676 forward +677 found +678 foundat +679 four +680 franc +681 free +682 freedom +683 french +684 freshrpm +685 fri +686 fridai +687 friend +688 from +689 front +690 ftoc +691 ftp +692 full +693 fulli +694 fun +695 function +696 fund +697 further +698 futur +699 ga +700 gain +701 game +702 gari +703 garrigu +704 gave +705 gcc +706 geek +707 gener +708 get +709 gif +710 gift +711 girl +712 give +713 given +714 global +715 gnome +716 gnu +717 gnupg +718 go +719 goal +720 god +721 goe +722 gold +723 gone +724 good +725 googl +726 got +727 govern +728 gpl +729 grand +730 grant +731 graphic +732 great +733 greater +734 ground +735 group +736 grow +737 growth +738 gt +739 guarante +740 guess +741 gui +742 guid +743 ha +744 hack +745 had +746 half +747 ham +748 hand +749 handl +750 happen +751 happi +752 hard +753 hardwar +754 hat +755 hate +756 have +757 haven +758 he +759 head +760 header +761 headlin +762 health +763 hear +764 heard +765 heart +766 heaven +767 hei +768 height +769 held +770 hello +771 help +772 helvetica +773 her +774 herba +775 here +776 hermio +777 hettinga +778 hi +779 high +780 higher +781 highli +782 highlight +783 him +784 histori +785 hit +786 hold +787 home +788 honor +789 hope +790 host +791 hot +792 hour +793 hous +794 how +795 howev +796 hp +797 html +798 http +799 httpaddr +800 huge +801 human +802 hundr +803 ibm +804 id +805 idea +806 ident +807 identifi +808 idnumb +809 ie +810 if +811 ignor +812 ii +813 iii +814 iiiiiiihnumberjnumberhnumberjnumberhnumb +815 illeg +816 im +817 imag +818 imagin +819 immedi +820 impact +821 implement +822 import +823 impress +824 improv +825 in +826 inc +827 includ +828 incom +829 increas +830 incred +831 inde +832 independ +833 index +834 india +835 indian +836 indic +837 individu +838 industri +839 info +840 inform +841 initi +842 inlin +843 innov +844 input +845 insert +846 insid +847 instal +848 instanc +849 instant +850 instead +851 institut +852 instruct +853 insur +854 int +855 integr +856 intel +857 intellig +858 intend +859 interact +860 interest +861 interfac +862 intern +863 internet +864 interview +865 into +866 intro +867 introduc +868 inumb +869 invest +870 investig +871 investor +872 invok +873 involv +874 ip +875 ireland +876 irish +877 is +878 island +879 isn +880 iso +881 isp +882 issu +883 it +884 item +885 itself +886 jabber +887 jame +888 java +889 jim +890 jnumberiiiiiiihepihepihf +891 job +892 joe +893 john +894 join +895 journal +896 judg +897 judgment +898 jul +899 juli +900 jump +901 june +902 just +903 justin +904 keep +905 kei +906 kept +907 kernel +908 kevin +909 keyboard +910 kid +911 kill +912 kind +913 king +914 kingdom +915 knew +916 know +917 knowledg +918 known +919 la +920 lack +921 land +922 languag +923 laptop +924 larg +925 larger +926 largest +927 laser +928 last +929 late +930 later +931 latest +932 launch +933 law +934 lawrenc +935 le +936 lead +937 leader +938 learn +939 least +940 leav +941 left +942 legal +943 lender +944 length +945 less +946 lesson +947 let +948 letter +949 level +950 lib +951 librari +952 licens +953 life +954 lifetim +955 light +956 like +957 limit +958 line +959 link +960 linux +961 list +962 listen +963 littl +964 live +965 ll +966 lo +967 load +968 loan +969 local +970 locat +971 
lock +972 lockergnom +973 log +974 long +975 longer +976 look +977 lose +978 loss +979 lost +980 lot +981 love +982 low +983 lower +984 lowest +985 lt +986 ma +987 mac +988 machin +989 made +990 magazin +991 mai +992 mail +993 mailer +994 main +995 maintain +996 major +997 make +998 maker +999 male +1000 man +1001 manag +1002 mani +1003 manual +1004 manufactur +1005 map +1006 march +1007 margin +1008 mark +1009 market +1010 marshal +1011 mass +1012 master +1013 match +1014 materi +1015 matter +1016 matthia +1017 mayb +1018 me +1019 mean +1020 measur +1021 mechan +1022 media +1023 medic +1024 meet +1025 member +1026 membership +1027 memori +1028 men +1029 mention +1030 menu +1031 merchant +1032 messag +1033 method +1034 mh +1035 michael +1036 microsoft +1037 middl +1038 might +1039 mike +1040 mile +1041 militari +1042 million +1043 mime +1044 mind +1045 mine +1046 mini +1047 minimum +1048 minut +1049 miss +1050 mistak +1051 mobil +1052 mode +1053 model +1054 modem +1055 modifi +1056 modul +1057 moment +1058 mon +1059 mondai +1060 monei +1061 monitor +1062 month +1063 monthli +1064 more +1065 morn +1066 mortgag +1067 most +1068 mostli +1069 mother +1070 motiv +1071 move +1072 movi +1073 mpnumber +1074 mr +1075 ms +1076 msg +1077 much +1078 multi +1079 multipart +1080 multipl +1081 murphi +1082 music +1083 must +1084 my +1085 myself +1086 name +1087 nation +1088 natur +1089 nbsp +1090 near +1091 nearli +1092 necessari +1093 need +1094 neg +1095 net +1096 netscap +1097 network +1098 never +1099 new +1100 newslett +1101 next +1102 nextpart +1103 nice +1104 nigeria +1105 night +1106 no +1107 nobodi +1108 non +1109 none +1110 nor +1111 normal +1112 north +1113 not +1114 note +1115 noth +1116 notic +1117 now +1118 nt +1119 null +1120 number +1121 numbera +1122 numberam +1123 numberanumb +1124 numberb +1125 numberbit +1126 numberc +1127 numbercb +1128 numbercbr +1129 numbercfont +1130 numbercli +1131 numbercnumb +1132 numbercp +1133 numberctd +1134 numberd +1135 numberdari +1136 numberdnumb +1137 numberenumb +1138 numberf +1139 numberfb +1140 numberff +1141 numberffont +1142 numberfp +1143 numberftd +1144 numberk +1145 numberm +1146 numbermb +1147 numberp +1148 numberpd +1149 numberpm +1150 numberpx +1151 numberst +1152 numberth +1153 numbertnumb +1154 numberx +1155 object +1156 oblig +1157 obtain +1158 obvious +1159 occur +1160 oct +1161 octob +1162 of +1163 off +1164 offer +1165 offic +1166 offici +1167 often +1168 oh +1169 ok +1170 old +1171 on +1172 onc +1173 onli +1174 onlin +1175 open +1176 oper +1177 opinion +1178 opportun +1179 opt +1180 optim +1181 option +1182 or +1183 order +1184 org +1185 organ +1186 origin +1187 os +1188 osdn +1189 other +1190 otherwis +1191 our +1192 out +1193 outlook +1194 output +1195 outsid +1196 over +1197 own +1198 owner +1199 oz +1200 pacif +1201 pack +1202 packag +1203 page +1204 pai +1205 paid +1206 pain +1207 palm +1208 panel +1209 paper +1210 paragraph +1211 parent +1212 part +1213 parti +1214 particip +1215 particular +1216 particularli +1217 partit +1218 partner +1219 pass +1220 password +1221 past +1222 patch +1223 patent +1224 path +1225 pattern +1226 paul +1227 payment +1228 pc +1229 peac +1230 peopl +1231 per +1232 percent +1233 percentag +1234 perfect +1235 perfectli +1236 perform +1237 perhap +1238 period +1239 perl +1240 perman +1241 permiss +1242 person +1243 pgp +1244 phone +1245 photo +1246 php +1247 phrase +1248 physic +1249 pick +1250 pictur +1251 piec +1252 piiiiiiii +1253 pipe +1254 pjnumber +1255 place +1256 plai +1257 plain +1258 plan +1259 
planet +1260 plant +1261 planta +1262 platform +1263 player +1264 pleas +1265 plu +1266 plug +1267 pm +1268 pocket +1269 point +1270 polic +1271 polici +1272 polit +1273 poor +1274 pop +1275 popul +1276 popular +1277 port +1278 posit +1279 possibl +1280 post +1281 potenti +1282 pound +1283 powel +1284 power +1285 powershot +1286 practic +1287 pre +1288 predict +1289 prefer +1290 premium +1291 prepar +1292 present +1293 presid +1294 press +1295 pretti +1296 prevent +1297 previou +1298 previous +1299 price +1300 principl +1301 print +1302 printabl +1303 printer +1304 privaci +1305 privat +1306 prize +1307 pro +1308 probabl +1309 problem +1310 procedur +1311 process +1312 processor +1313 procmail +1314 produc +1315 product +1316 profession +1317 profil +1318 profit +1319 program +1320 programm +1321 progress +1322 project +1323 promis +1324 promot +1325 prompt +1326 properti +1327 propos +1328 proprietari +1329 prospect +1330 protect +1331 protocol +1332 prove +1333 proven +1334 provid +1335 proxi +1336 pub +1337 public +1338 publish +1339 pudg +1340 pull +1341 purchas +1342 purpos +1343 put +1344 python +1345 qnumber +1346 qualifi +1347 qualiti +1348 quarter +1349 question +1350 quick +1351 quickli +1352 quit +1353 quot +1354 radio +1355 ragga +1356 rais +1357 random +1358 rang +1359 rate +1360 rather +1361 ratio +1362 razor +1363 razornumb +1364 re +1365 reach +1366 read +1367 reader +1368 readi +1369 real +1370 realiz +1371 realli +1372 reason +1373 receiv +1374 recent +1375 recipi +1376 recommend +1377 record +1378 red +1379 redhat +1380 reduc +1381 refer +1382 refin +1383 reg +1384 regard +1385 region +1386 regist +1387 regul +1388 regular +1389 rel +1390 relat +1391 relationship +1392 releas +1393 relev +1394 reliabl +1395 remain +1396 rememb +1397 remot +1398 remov +1399 replac +1400 repli +1401 report +1402 repositori +1403 repres +1404 republ +1405 request +1406 requir +1407 research +1408 reserv +1409 resid +1410 resourc +1411 respect +1412 respond +1413 respons +1414 rest +1415 result +1416 retail +1417 return +1418 reveal +1419 revenu +1420 revers +1421 review +1422 revok +1423 rh +1424 rich +1425 right +1426 risk +1427 road +1428 robert +1429 rock +1430 role +1431 roll +1432 rom +1433 roman +1434 room +1435 root +1436 round +1437 rpm +1438 rss +1439 rule +1440 run +1441 sa +1442 safe +1443 sai +1444 said +1445 sale +1446 same +1447 sampl +1448 san +1449 saou +1450 sat +1451 satellit +1452 save +1453 saw +1454 scan +1455 schedul +1456 school +1457 scienc +1458 score +1459 screen +1460 script +1461 se +1462 search +1463 season +1464 second +1465 secret +1466 section +1467 secur +1468 see +1469 seed +1470 seek +1471 seem +1472 seen +1473 select +1474 self +1475 sell +1476 seminar +1477 send +1478 sender +1479 sendmail +1480 senior +1481 sens +1482 sensit +1483 sent +1484 sep +1485 separ +1486 septemb +1487 sequenc +1488 seri +1489 serif +1490 seriou +1491 serv +1492 server +1493 servic +1494 set +1495 setup +1496 seven +1497 seventh +1498 sever +1499 sex +1500 sexual +1501 sf +1502 shape +1503 share +1504 she +1505 shell +1506 ship +1507 shop +1508 short +1509 shot +1510 should +1511 show +1512 side +1513 sign +1514 signatur +1515 signific +1516 similar +1517 simpl +1518 simpli +1519 sinc +1520 sincer +1521 singl +1522 sit +1523 site +1524 situat +1525 six +1526 size +1527 skeptic +1528 skill +1529 skin +1530 skip +1531 sleep +1532 slow +1533 small +1534 smart +1535 smoke +1536 smtp +1537 snumber +1538 so +1539 social +1540 societi +1541 softwar +1542 sold +1543 solut +1544 solv 
+1545 some +1546 someon +1547 someth +1548 sometim +1549 son +1550 song +1551 soni +1552 soon +1553 sorri +1554 sort +1555 sound +1556 sourc +1557 south +1558 space +1559 spain +1560 spam +1561 spamassassin +1562 spamd +1563 spammer +1564 speak +1565 spec +1566 special +1567 specif +1568 specifi +1569 speech +1570 speed +1571 spend +1572 sponsor +1573 sport +1574 spot +1575 src +1576 ssh +1577 st +1578 stabl +1579 staff +1580 stai +1581 stand +1582 standard +1583 star +1584 start +1585 state +1586 statement +1587 statu +1588 step +1589 steve +1590 still +1591 stock +1592 stop +1593 storag +1594 store +1595 stori +1596 strategi +1597 stream +1598 street +1599 string +1600 strip +1601 strong +1602 structur +1603 studi +1604 stuff +1605 stupid +1606 style +1607 subject +1608 submit +1609 subscrib +1610 subscript +1611 substanti +1612 success +1613 such +1614 suffer +1615 suggest +1616 suit +1617 sum +1618 summari +1619 summer +1620 sun +1621 super +1622 suppli +1623 support +1624 suppos +1625 sure +1626 surpris +1627 suse +1628 suspect +1629 sweet +1630 switch +1631 system +1632 tab +1633 tabl +1634 tablet +1635 tag +1636 take +1637 taken +1638 talk +1639 tape +1640 target +1641 task +1642 tax +1643 teach +1644 team +1645 tech +1646 technic +1647 techniqu +1648 technolog +1649 tel +1650 telecom +1651 telephon +1652 tell +1653 temperatur +1654 templ +1655 ten +1656 term +1657 termin +1658 terror +1659 terrorist +1660 test +1661 texa +1662 text +1663 than +1664 thank +1665 that +1666 the +1667 thei +1668 their +1669 them +1670 themselv +1671 then +1672 theori +1673 there +1674 therefor +1675 these +1676 thi +1677 thing +1678 think +1679 thinkgeek +1680 third +1681 those +1682 though +1683 thought +1684 thousand +1685 thread +1686 threat +1687 three +1688 through +1689 thu +1690 thursdai +1691 ti +1692 ticket +1693 tim +1694 time +1695 tip +1696 tire +1697 titl +1698 tm +1699 to +1700 todai +1701 togeth +1702 token +1703 told +1704 toll +1705 tom +1706 toner +1707 toni +1708 too +1709 took +1710 tool +1711 top +1712 topic +1713 total +1714 touch +1715 toward +1716 track +1717 trade +1718 tradit +1719 traffic +1720 train +1721 transact +1722 transfer +1723 travel +1724 treat +1725 tree +1726 tri +1727 trial +1728 trick +1729 trip +1730 troubl +1731 true +1732 truli +1733 trust +1734 truth +1735 try +1736 tue +1737 tuesdai +1738 turn +1739 tv +1740 two +1741 type +1742 uk +1743 ultim +1744 un +1745 under +1746 understand +1747 unfortun +1748 uniqu +1749 unison +1750 unit +1751 univers +1752 unix +1753 unless +1754 unlik +1755 unlimit +1756 unseen +1757 unsolicit +1758 unsubscrib +1759 until +1760 up +1761 updat +1762 upgrad +1763 upon +1764 urgent +1765 url +1766 us +1767 usa +1768 usag +1769 usb +1770 usd +1771 usdollarnumb +1772 useless +1773 user +1774 usr +1775 usual +1776 util +1777 vacat +1778 valid +1779 valu +1780 valuabl +1781 var +1782 variabl +1783 varieti +1784 variou +1785 ve +1786 vendor +1787 ventur +1788 veri +1789 verifi +1790 version +1791 via +1792 video +1793 view +1794 virtual +1795 visa +1796 visit +1797 visual +1798 vnumber +1799 voic +1800 vote +1801 vs +1802 vulner +1803 wa +1804 wai +1805 wait +1806 wake +1807 walk +1808 wall +1809 want +1810 war +1811 warm +1812 warn +1813 warranti +1814 washington +1815 wasn +1816 wast +1817 watch +1818 water +1819 we +1820 wealth +1821 weapon +1822 web +1823 weblog +1824 websit +1825 wed +1826 wednesdai +1827 week +1828 weekli +1829 weight +1830 welcom +1831 well +1832 went +1833 were +1834 west +1835 what +1836 whatev +1837 when +1838 
where +1839 whether +1840 which +1841 while +1842 white +1843 whitelist +1844 who +1845 whole +1846 whose +1847 why +1848 wi +1849 wide +1850 width +1851 wife +1852 will +1853 william +1854 win +1855 window +1856 wing +1857 winner +1858 wireless +1859 wish +1860 with +1861 within +1862 without +1863 wnumberp +1864 woman +1865 women +1866 won +1867 wonder +1868 word +1869 work +1870 worker +1871 world +1872 worldwid +1873 worri +1874 worst +1875 worth +1876 would +1877 wouldn +1878 write +1879 written +1880 wrong +1881 wrote +1882 www +1883 ximian +1884 xml +1885 xp +1886 yahoo +1887 ye +1888 yeah +1889 year +1890 yesterdai +1891 yet +1892 york +1893 you +1894 young +1895 your +1896 yourself +1897 zdnet +1898 zero +1899 zip diff --git a/ex7/bird_small.mat b/ex7/bird_small.mat new file mode 100644 index 0000000..04c224c Binary files /dev/null and b/ex7/bird_small.mat differ diff --git a/ex7/bird_small.png b/ex7/bird_small.png new file mode 100644 index 0000000..a3cd00c Binary files /dev/null and b/ex7/bird_small.png differ diff --git a/ex7/computeCentroids.py b/ex7/computeCentroids.py new file mode 100644 index 0000000..4d7b4ff --- /dev/null +++ b/ex7/computeCentroids.py @@ -0,0 +1,33 @@ +import numpy as np + + +def computeCentroids(X, idx, K): + """returns the new centroids by + computing the means of the data points assigned to each centroid. It is + given a dataset X where each row is a single data point, a vector + idx of centroid assignments (i.e. each entry in range [1..K]) for each + example, and K, the number of centroids. You should return a matrix + centroids, where each row of centroids is the mean of the data points + assigned to it. + """ + +# Useful variables + m, n = X.shape + +# You need to return the following variables correctly. + centroids = [] + + +# ====================== YOUR CODE HERE ====================== +# Instructions: Go over every centroid and compute mean of all points that +# belong to it. Concretely, the row vector centroids(i, :) +# should contain the mean of the data points assigned to +# centroid i. +# +# Note: You can use a for-loop over the centroids to compute this. +# + + +# ============================================================= + + return centroids diff --git a/ex7/drawLine.py b/ex7/drawLine.py new file mode 100644 index 0000000..44761f1 --- /dev/null +++ b/ex7/drawLine.py @@ -0,0 +1,12 @@ +import matplotlib.pyplot as plt +import numpy as np + +from show import show + +def drawLine(p1, p2, varargin): + """Draws a line from point p1 to point p2 and holds the + current figure + """ + + plt.plot(np.column_stack(p1(1), p2(1)), np.column_stack(p1(2), p2(2)), varargin) + show() \ No newline at end of file diff --git a/ex7/ex7.py b/ex7/ex7.py new file mode 100644 index 0000000..803f45e --- /dev/null +++ b/ex7/ex7.py @@ -0,0 +1,183 @@ +## Machine Learning Online Class +# Exercise 7 | Principle Component Analysis and K-Means Clustering +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# pca.m +# projectData.m +# recoverData.m +# computeCentroids.m +# findClosestCentroids.m +# kMeansInitCentroids.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. 
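# For reference, one possible vectorized computeCentroids (a sketch only,
# assuming idx holds 0-based assignments as used in this port):
#
#     centroids = np.array([X[idx == k].mean(axis=0) for k in range(K)])
#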
+# + +## ================= Part 1: Find Closest Centroids ==================== +# To help you implement K-Means, we have divided the learning algorithm +# into two functions -- findClosestCentroids and computeCentroids. In this +# part, you shoudl complete the code in the findClosestCentroids function. +# +from matplotlib import use, cm +use('TkAgg') +import numpy as np +import scipy.io +import scipy.misc +import matplotlib.pyplot as plt + +from findClosestCentroids import findClosestCentroids +from computeCentroids import computeCentroids +from runkMeans import runkMeans +from kMeansInitCentroids import kMeansInitCentroids +from show import show + +print 'Finding closest centroids.' + +# Load an example dataset that we will be using +data = scipy.io.loadmat('ex7data2.mat') +X = data['X'] + +# Select an initial set of centroids +K = 3 # 3 Centroids +initial_centroids = np.array([[3, 3], [6, 2], [8, 5]]) + +# Find the closest centroids for the examples using the +# initial_centroids +val, idx = findClosestCentroids(X, initial_centroids) + +print 'Closest centroids for the first 3 examples:' +print idx[0:3].tolist() +print '(the closest centroids should be 0, 2, 1 respectively)' + +raw_input("Program paused. Press Enter to continue...") + +## ===================== Part 2: Compute Means ========================= +# After implementing the closest centroids function, you should now +# complete the computeCentroids function. +# +print 'Computing centroids means.' + +# Compute means based on the closest centroids found in the previous part. +centroids = computeCentroids(X, idx, K) + +print 'Centroids computed after initial finding of closest centroids:' +for c in centroids: + print c + +print '(the centroids should be' +print ' [ 2.428301 3.157924 ]' +print ' [ 5.813503 2.633656 ]' +print ' [ 7.119387 3.616684 ]' + +raw_input("Program paused. Press Enter to continue...") + + +## =================== Part 3: K-Means Clustering ====================== +# After you have completed the two functions computeCentroids and +# findClosestCentroids, you have all the necessary pieces to run the +# kMeans algorithm. In this part, you will run the K-Means algorithm on +# the example dataset we have provided. +# +print 'Running K-Means clustering on example dataset.' + +# Load an example dataset +data = scipy.io.loadmat('ex7data2.mat') +X = data['X'] + +# Settings for running K-Means +K = 3 +max_iters = 10 + +# For consistency, here we set centroids to specific values +# but in practice you want to generate them automatically, such as by +# settings them to be random examples (as can be seen in +# kMeansInitCentroids). +initial_centroids = [[3, 3], [6, 2], [8, 5]] + +# Run K-Means algorithm. The 'true' at the end tells our function to plot +# the progress of K-Means +centroids, idx = runkMeans(X, initial_centroids, max_iters, True) +print 'K-Means Done.' + +raw_input("Program paused. Press Enter to continue...") + +## ============= Part 4: K-Means Clustering on Pixels =============== +# In this exercise, you will use K-Means to compress an image. To do this, +# you will first run K-Means on the colors of the pixels in the image and +# then you will map each pixel on to it's closest centroid. +# +# You should now complete the code in kMeansInitCentroids.m +# + +print 'Running K-Means clustering on pixels from an image.' 
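# A possible random initialization for kMeansInitCentroids (a sketch, not the
# graded solution): pick K distinct training examples as starting centroids,
#
#     centroids = X[np.random.permutation(X.shape[0])[:K], :]
#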
+ +# Load an image of a bird +A = scipy.misc.imread('bird_small.png') + +# If imread does not work for you, you can try instead +# load ('bird_small.mat') + +A = A / 255.0 # Divide by 255 so that all values are in the range 0 - 1 + +# Size of the image +img_size = A.shape + +# Reshape the image into an Nx3 matrix where N = number of pixels. +# Each row will contain the Red, Green and Blue pixel values +# This gives us our dataset matrix X that we will use K-Means on. +X = A.reshape(img_size[0] * img_size[1], 3) + +# Run your K-Means algorithm on this data +# You should try different values of K and max_iters here +K = 16 +max_iters = 10 + +# When using K-Means, it is important the initialize the centroids +# randomly. +# You should complete the code in kMeansInitCentroids.m before proceeding +initial_centroids = kMeansInitCentroids(X, K) + +# Run K-Means +centroids, idx = runkMeans(X, initial_centroids, max_iters) + +raw_input("Program paused. Press Enter to continue...") + + +## ================= Part 5: Image Compression ====================== +# In this part of the exercise, you will use the clusters of K-Means to +# compress an image. To do this, we first find the closest clusters for +# each example. After that, we + +print 'Applying K-Means to compress an image.' + +# Find closest cluster members +_, idx = findClosestCentroids(X, centroids) + +# Essentially, now we have represented the image X as in terms of the +# indices in idx. + +# We can now recover the image from the indices (idx) by mapping each pixel +# (specified by it's index in idx) to the centroid value +X_recovered = np.array([centroids[e] for e in idx]) + +# Reshape the recovered image into proper dimensions +X_recovered = X_recovered.reshape(img_size[0], img_size[1], 3) + +# Display the original image +plt.subplot(1, 2, 1) +plt.imshow(A) +plt.title('Original') +show() + +# Display compressed image side by side +plt.subplot(1, 2, 2) +plt.imshow(X_recovered) +plt.title('Compressed, with %d colors.' % K) +show() + +raw_input("Program paused. Press Enter to continue...") \ No newline at end of file diff --git a/ex7/ex7_pca.py b/ex7/ex7_pca.py new file mode 100644 index 0000000..1865592 --- /dev/null +++ b/ex7/ex7_pca.py @@ -0,0 +1,253 @@ +## Machine Learning Online Class +# Exercise 7 | Principle Component Analysis and K-Means Clustering +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# pca.m +# projectData.m` +# recoverData.m +# computeCentroids.m +# findClosestCentroids.m +# kMeansInitCentroids.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. + +from matplotlib import use +use('TkAgg') +import numpy as np +import scipy.io +import scipy.misc +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D + +from featureNormalize import featureNormalize +from pca import pca +from projectData import projectData +from recoverData import recoverData +from kMeansInitCentroids import kMeansInitCentroids +from runkMeans import runkMeans +from plotDataPoints import plotDataPoints +from ex3.displayData import displayData +from show import show + +## ================== Part 1: Load Example Dataset =================== +# We start this exercise by using a small dataset that is easily to +# visualize + +print 'Visualizing example dataset for PCA.' +# The following command loads the dataset. 
You should now have the +# variable X in your environment +data = scipy.io.loadmat('ex7data1.mat') +X = data['X'] + +# Visualize the example dataset +plt.scatter(X[:, 0], X[:, 1], marker='o', color='b', facecolors='none', lw=1.0) +plt.axis([0.5, 6.5, 2, 8]) +plt.axis('equal') +show() + +raw_input('Program paused. Press Enter to continue...') + +## =============== Part 2: Principal Component Analysis =============== +# You should now implement PCA, a dimension reduction technique. You +# should complete the code in pca.m +# +print 'Running PCA on example dataset.' + +# Before running PCA, it is important to first normalize X +X_norm, mu, sigma = featureNormalize(X) + +# Run PCA +U, S, V = pca(X_norm) + +# Compute mu, the mean of the each feature + +# Draw the eigenvectors centered at mean of data. These lines show the +# directions of maximum variations in the dataset. +mu2 = mu + 1.5 * S.dot(U.T) +plt.plot([mu[0], mu2[0, 0]], [mu[1], mu2[0, 1]], '-k', lw=2) +plt.plot([mu[0], mu2[1, 0]], [mu[1], mu2[1, 1]], '-k', lw=2) +show() + +print 'Top eigenvector: ' +print ' U(:,1) = %f %f ', U[0,0], U[1,0] +print '(you should expect to see -0.707107 -0.707107)' + +raw_input('Program paused. Press Enter to continue...') + + +## =================== Part 3: Dimension Reduction =================== +# You should now implement the projection step to map the data onto the +# first k eigenvectors. The code will then plot the data in this reduced +# dimensional space. This will show you what the data looks like when +# using only the corresponding eigenvectors to reconstruct it. +# +# You should complete the code in projectData.m +# +print 'Dimension reduction on example dataset.' + +# Plot the normalized dataset (returned from pca) +plt.figure() +plt.scatter(X_norm[:, 0], X_norm[:, 1], marker='o', color='b', facecolors='none', lw=1.0) +plt.axis([-4, 3, -4, 3]) #axis square +plt.axis('equal') +show() + +# Project the data onto K = 1 dimension +K = 1 +Z = projectData(X_norm, U, K) +print 'Projection of the first example: %f', Z[0] +print '(this value should be about 1.481274)' + +X_rec = recoverData(Z, U, K) +print 'Approximation of the first example: %f %f'% (X_rec[0, 0], X_rec[0, 1]) +print '(this value should be about -1.047419 -1.047419)' + +# Draw lines connecting the projected points to the original points +plt.scatter(X_rec[:, 0], X_rec[:, 1], marker='o', color='r', facecolor='none', lw=1.0) +for i in range(len(X_norm)): + plt.plot([X_norm[i, 0], X_rec[i, 0]], [X_norm[i, 1], X_rec[i, 1]], '--k') + +show() +raw_input('Program paused. Press Enter to continue...') + +## =============== Part 4: Loading and Visualizing Face Data ============= +# We start the exercise by first loading and visualizing the dataset. +# The following code will load the dataset into your environment +# +print 'Loading face dataset.' + +# Load Face dataset +data = scipy.io.loadmat('ex7faces.mat') +X = data['X'] + +# Display the first 100 faces in the dataset +displayData(X[0:100, :]) + +raw_input('Program paused. Press Enter to continue...') + +## =========== Part 5: PCA on Face Data: Eigenfaces =================== +# Run PCA and visualize the eigenvectors which are in this case eigenfaces +# We display the first 36 eigenfaces. 
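# For reference, the PCA pipeline used below can be sketched as
# (an assumption, not the graded pca / projectData / recoverData code):
#
#     Sigma = X_norm.T.dot(X_norm) / m    # covariance of the normalized data
#     U, S, V = np.linalg.svd(Sigma)      # columns of U are the eigenvectors
#     Z = X_norm.dot(U[:, :K])            # project onto the top K components
#     X_rec = Z.dot(U[:, :K].T)           # approximate reconstruction
#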
+# +print 'Running PCA on face dataset.\n(this might take a minute or two ...)\n\n' + +# Before running PCA, it is important to first normalize X by subtracting +# the mean value from each feature +X_norm, mu, sigma = featureNormalize(X) + +# Run PCA +U, S, V = pca(X_norm) + +# Visualize the top 36 eigenvectors found +displayData(U[:, 1:36].T) + +raw_input('Program paused. Press Enter to continue...') + +## ============= Part 6: Dimension Reduction for Faces ================= +# Project images to the eigen space using the top k eigenvectors +# If you are applying a machine learning algorithm +print 'Dimension reduction for face dataset.' + +K = 100 +Z = projectData(X_norm, U, K) + +print 'The projected data Z has a size of: ' +print '%d %d'% Z.shape + +raw_input('Program paused. Press Enter to continue...') + +## ==== Part 7: Visualization of Faces after PCA Dimension Reduction ==== +# Project images to the eigen space using the top K eigen vectors and +# visualize only using those K dimensions +# Compare to the original input, which is also displayed + +print 'Visualizing the projected (reduced dimension) faces.' + +K = 100 +X_rec = recoverData(Z, U, K) + +# Display normalized data +plt.subplot(1, 2, 1) +displayData(X_norm[:100,:]) +plt.title('Original faces') +plt.axis('equal') + +# Display reconstructed data from only k eigenfaces +plt.subplot(1, 2, 2) +displayData(X_rec[:100,:]) +plt.title('Recovered faces') +plt.axis('equal') +show() +raw_input('Program paused. Press Enter to continue...') + + +## === Part 8(a): Optional (ungraded) Exercise: PCA for Visualization === +# One useful application of PCA is to use it to visualize high-dimensional +# data. In the last K-Means exercise you ran K-Means on 3-dimensional +# pixel colors of an image. We first visualize this output in 3D, and then +# apply PCA to obtain a visualization in 2D. + +# Re-load the image from the previous exercise and run K-Means on it +# For this to work, you need to complete the K-Means assignment first +A = scipy.misc.imread('bird_small.png') + +# If imread does not work for you, you can try instead +# load ('bird_small.mat') + +A = A / 255.0 +img_size = A.shape +X = A.reshape(img_size[0] * img_size[1], 3) +K = 16 +max_iters = 10 +initial_centroids = kMeansInitCentroids(X, K) +centroids, idx = runkMeans(X, initial_centroids, max_iters) + +# Sample 1000 random indexes (since working with all the data is +# too expensive. If you have a fast computer, you may increase this. +sel = np.floor(np.random.random(1000) * len(X)) + 1 + +# Setup Color Palette + +# Visualize the data and centroid memberships in 3D +fig = plt.figure() +ax = fig.add_subplot(111, projection='3d') +Xs = np.array([X[s] for s in sel]) +xs = Xs[:, 0] +ys = Xs[:, 1] +zs = Xs[:, 2] +cmap = plt.get_cmap("jet") +idxn = sel.astype('float')/max(sel.astype('float')) +colors = cmap(idxn) +# ax = Axes3D(fig) +ax.scatter3D(xs, ys, zs=zs, edgecolors=colors, marker='o', facecolors='none', lw=0.4, s=10) + +plt.title('Pixel dataset plotted in 3D. Color shows centroid memberships') +show() +raw_input('Program paused. 
Press Enter to continue...') + +## === Part 8(b): Optional (ungraded) Exercise: PCA for Visualization === +# Use PCA to project this cloud to 2D for visualization + +# Subtract the mean to use PCA +X_norm, mu, sigma = featureNormalize(X) + +# PCA and project the data to 2D +U, S, V = pca(X_norm) +Z = projectData(X_norm, U, 2) + +# Plot in 2D +plt.figure() +zs = np.array([Z[s] for s in sel]) +idxs = np.array([idx[s] for s in sel]) + +# plt.scatter(zs[:,0], zs[:,1]) +plotDataPoints(zs, idxs) +plt.title('Pixel dataset plotted in 2D, using PCA for dimensionality reduction') +show() +raw_input('Program paused. Press Enter to continue...') diff --git a/ex7/ex7data1.mat b/ex7/ex7data1.mat new file mode 100644 index 0000000..f9c3961 Binary files /dev/null and b/ex7/ex7data1.mat differ diff --git a/ex7/ex7data2.mat b/ex7/ex7data2.mat new file mode 100644 index 0000000..de3f5b9 Binary files /dev/null and b/ex7/ex7data2.mat differ diff --git a/ex7/ex7faces.mat b/ex7/ex7faces.mat new file mode 100644 index 0000000..3965bd1 Binary files /dev/null and b/ex7/ex7faces.mat differ diff --git a/ex7/featureNormalize.py b/ex7/featureNormalize.py new file mode 100644 index 0000000..8044c56 --- /dev/null +++ b/ex7/featureNormalize.py @@ -0,0 +1,18 @@ +import numpy as np + + +def featureNormalize(X): + """ + returns a normalized version of X where + the mean value of each feature is 0 and the standard deviation + is 1. This is often a good preprocessing step to do when + working with learning algorithms. + """ + + mu = np.mean(X, axis=0) + X_norm = X - mu + + sigma = np.std(X_norm, axis=0, ddof=1) + X_norm = X_norm / sigma + + return X_norm, mu, sigma \ No newline at end of file diff --git a/ex7/findClosestCentroids.py b/ex7/findClosestCentroids.py new file mode 100644 index 0000000..0526f40 --- /dev/null +++ b/ex7/findClosestCentroids.py @@ -0,0 +1,29 @@ +import numpy as np + + +def findClosestCentroids(X, centroids): + """returns the closest centroids + in idx for a dataset X where each row is a single example. idx = m x 1 + vector of centroid assignments (i.e. each entry in range [1..K]) + """ + +# Set K + K = len(centroids) + +# You need to return the following variables correctly. + idx = np.zeros(X.shape[0]) + +# ====================== YOUR CODE HERE ====================== +# Instructions: Go over every example, find its closest centroid, and store +# the index inside idx at the appropriate location. +# Concretely, idx(i) should contain the index of the centroid +# closest to example i. Hence, it should be a value in the +# range 1..K +# +# Note: You can use a for-loop over the examples to compute this. 
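# One possible vectorized sketch (an assumption, not the graded solution);
# it also defines the squared-distance value `val` returned below.
    diffs = X[:, np.newaxis, :] - np.asarray(centroids)[np.newaxis, :, :]
    dists = np.sum(diffs ** 2, axis=2)   # m x K matrix of squared distances
    val = dists.min(axis=1)              # distance to the closest centroid
    idx = dists.argmin(axis=1)           # 0-based index of the closest centroid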
+ + +# ============================================================= + + return val, idx + diff --git a/ex7/kMeansInitCentroids.py b/ex7/kMeansInitCentroids.py new file mode 100644 index 0000000..9ddd776 --- /dev/null +++ b/ex7/kMeansInitCentroids.py @@ -0,0 +1,19 @@ +import numpy as np + + +def kMeansInitCentroids(X, K): + """returns K initial centroids to be + used with the K-Means on the dataset X + """ + +# You should return this values correctly + centroids = np.zeros((K, X.shape[1])) + +# ====================== YOUR CODE HERE ====================== +# Instructions: You should set centroids to randomly chosen examples from +# the dataset X +# + + +# ============================================================= + return centroids diff --git a/ex7/pca.py b/ex7/pca.py new file mode 100644 index 0000000..2892a20 --- /dev/null +++ b/ex7/pca.py @@ -0,0 +1,25 @@ +import numpy as np + + +def pca(X): + """computes eigenvectors of the covariance matrix of X + Returns the eigenvectors U, the eigenvalues (on diagonal) in S + """ + + # Useful values + m, n = X.shape + + # You need to return the following variables correctly. + + # ====================== YOUR CODE HERE ====================== + # Instructions: You should first compute the covariance matrix. Then, you + # should use the "svd" function to compute the eigenvectors + # and eigenvalues of the covariance matrix. + # + # Note: When computing the covariance matrix, remember to divide by m (the + # number of examples). + # + + +# ========================================================================= + return U, S, V \ No newline at end of file diff --git a/ex7/plotDataPoints.py b/ex7/plotDataPoints.py new file mode 100644 index 0000000..1502d35 --- /dev/null +++ b/ex7/plotDataPoints.py @@ -0,0 +1,23 @@ +import matplotlib.pyplot as plt + +from show import show + +def plotDataPoints(X, idx): + + """plots data points in X, coloring them so that those + with the same index assignments in idx have the same color + """ + pass + # Create palette + # palette = hsv(K + 1) + # colors = palette(idx, :) + # + # # Plot the data + + # c = dict(enumerate(np.eye(3))) + # colors=idx + map = plt.get_cmap("jet") + idxn = idx.astype('float')/max(idx.astype('float')) + colors = map(idxn) + plt.scatter(X[:, 0], X[:, 1], 15, edgecolors=colors, marker='o', facecolors='none', lw=0.5) + show() \ No newline at end of file diff --git a/ex7/plotProgresskMeans.py b/ex7/plotProgresskMeans.py new file mode 100644 index 0000000..c79bf9d --- /dev/null +++ b/ex7/plotProgresskMeans.py @@ -0,0 +1,29 @@ +import matplotlib.pyplot as plt + +from plotDataPoints import plotDataPoints +from show import show + +def plotProgresskMeans(X, centroids, previous, idx, K, i, color): + """plots the data + points with colors assigned to each centroid. With the previous + centroids, it also plots a line between the previous locations and + current locations of the centroids. + """ + +# Plot the examples + plotDataPoints(X, idx) + +# Plot the centroids as black x's + plt.scatter(centroids[:, 0], centroids[:, 1], + marker='x', s=60, lw=3, edgecolor='k') + +# Plot the history of the centroids with lines + for j in range(len(centroids)): + plt.plot([centroids[j,0], previous[j,0]], + [centroids[j,1], previous[j,1]], c=color) + +# Title + plt.title('Iteration number %d' % i) + show() + raw_input("Program paused. 
Press Enter to continue...") + diff --git a/ex7/projectData.py b/ex7/projectData.py new file mode 100644 index 0000000..c3d0c78 --- /dev/null +++ b/ex7/projectData.py @@ -0,0 +1,21 @@ +def projectData(X, U, K): + """computes the projection of + the normalized inputs X into the reduced dimensional space spanned by + the first K columns of U. It returns the projected examples in Z. + """ + + # ====================== YOUR CODE HERE ====================== + # Instructions: Compute the projection of the data using only the top K + # eigenvectors in U (first K columns). + # For the i-th example X(i,:), the projection on to the k-th + # eigenvector is given as follows: + # x = X(i, :)' + # projection_k = x' * U(:, k) + # + + + + # ============================================================= + + + return Z diff --git a/ex7/recoverData.py b/ex7/recoverData.py new file mode 100644 index 0000000..4ac1955 --- /dev/null +++ b/ex7/recoverData.py @@ -0,0 +1,24 @@ +def recoverData(Z, U, K): + """ + recovers an approximation the + original data that has been reduced to K dimensions. It returns the + approximate reconstruction in X_rec. + """ + + + # ====================== YOUR CODE HERE ====================== + # Instructions: Compute the approximation of the data by projecting back + # onto the original space using the top K eigenvectors in U. + # + # For the i-th example Z(i,:), the (approximate) + # recovered data for dimension j is given as follows: + # v = Z(i, :)' + # recovered_j = v' * U(j, 1:K)' + # + # Notice that U(j, 1:K) is a row vector. + # + + + # ============================================================= + + return X_rec diff --git a/ex7/runkMeans.py b/ex7/runkMeans.py new file mode 100644 index 0000000..2aa124b --- /dev/null +++ b/ex7/runkMeans.py @@ -0,0 +1,56 @@ +from computeCentroids import computeCentroids +from plotProgresskMeans import plotProgresskMeans +from findClosestCentroids import findClosestCentroids +import matplotlib.pyplot as plt +import numpy as np +import itertools + + +def runkMeans(X, initial_centroids, max_iters, plot_progress=False): + """runs the K-Means algorithm on data matrix X, where each + row of X is a single example. It uses initial_centroids used as the + initial centroids. max_iters specifies the total number of interactions + of K-Means to execute. plot_progress is a true/false flag that + indicates if the function should also plot its progress as the + learning happens. This is set to false by default. runkMeans returns + centroids, a Kxn matrix of the computed centroids and idx, a m x 1 + vector of centroid assignments (i.e. each entry in range [1..K]) + """ + +# Plot the data if we are plotting progress + if plot_progress: + plt.figure() + +# Initialize values + m, n = X.shape + K = len(initial_centroids) + centroids = initial_centroids + previous_centroids = centroids + idx = np.zeros(m) + c = itertools.cycle('012') + rgb = np.eye(3) +# Run K-Means + for i in range(max_iters): + + # Output progress + print 'K-Means iteration %d/%d...' 
% (i, max_iters) + + # For each example in X, assign it to the closest centroid + _, idx = findClosestCentroids(X, centroids) + + # Optionally, plot progress here + if plot_progress: + color = rgb[int(next(c))] + plotProgresskMeans(X, np.array(centroids), + np.array(previous_centroids), idx, K, i, color) + previous_centroids = centroids + # raw_input("Press Enter to continue...") + + # Given the memberships, compute new centroids + centroids = computeCentroids(X, idx, K) + +# Hold off if we are plotting progress + if plot_progress: + pass + # hold off + return centroids, idx diff --git a/ex7/submit.py b/ex7/submit.py new file mode 100644 index 0000000..af08f6a --- /dev/null +++ b/ex7/submit.py @@ -0,0 +1,54 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +homework = 'k-means-clustering-and-pca' + +part_names = [ + 'Find Closest Centroids (k-Means)', + 'Compute Centroid Means (k-Means)', + 'PCA', + 'Project Data (PCA)', + 'Recover Data (PCA)', + ] + +srcs = [ + 'findClosestCentroids.py', + 'computeCentroids.py', + 'pca.py', + 'projectData.py', + 'recoverData' + ] + + +def output(part_id): + # Random Test Cases + X = np.sin(np.arange(1,166)).reshape((11, 15)).T + Z = np.cos(np.arange(1,122)).reshape((11, 11)).T + C = Z[:5, :] + idx = (np.mod(np.arange(1,16), 3)).T + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + idx = func(X, C) + return sprintf('%0.5f ', idx[1]+1) + elif part_id == 2: + centroids = func(X, idx, 3) + return sprintf('%0.5f ', centroids) + elif part_id == 3: + U, S, V = func(X) + return sprintf('%0.5f ', abs(np.hstack((U.T.flatten(), S.T.flatten())))) + elif part_id == 4: + X_proj = func(X, Z, 5) + return sprintf('%0.5f ', X_proj.T.flatten()) + elif part_id == 5: + X_rec = func(X[:, :5], Z, 5) + return sprintf('%0.5f ', X_rec.T.flatten()) + +s = Submission(homework, part_names, srcs, output) +s.submit() + diff --git a/ex8/checkCostFunction.py b/ex8/checkCostFunction.py new file mode 100644 index 0000000..93b462a --- /dev/null +++ b/ex8/checkCostFunction.py @@ -0,0 +1,54 @@ +import numpy as np +from ex4.computeNumericalGradient import computeNumericalGradient +from cofiCostFunc import cofiCostFunc + +def checkCostFunction(Lambda=0): + """Creates a collaborative filering problem + to check your cost function and gradients, it will output the + analytical gradients produced by your code and the numerical gradients + (computed using computeNumericalGradient). These two gradient + computations should result in very similar values. 
+ """ + + ## Create small problem + X_t = np.random.rand(4, 3) + Theta_t = np.random.rand(5, 3) + + # Zap out most entries + Y = X_t.dot(Theta_t.T) + Y[np.where(np.random.random_sample(Y.shape) > 0.5, True, False)] = 0 + R = np.zeros(Y.shape) + R[np.where(Y != 0, True, False)] = 1 + + ## Run Gradient Checking + X = np.random.random_sample(X_t.shape) + Theta = np.random.random_sample(Theta_t.shape) + num_users = Y.shape[1] + num_movies = Y.shape[0] + num_features = Theta_t.shape[1] + + # Unroll parameters + params = np.hstack((X.T.flatten(), Theta.T.flatten())) + + costFunc = lambda t: cofiCostFunc(t, Y, R, num_users, num_movies, num_features, Lambda) + + def costFunc_w(t): + Jgrad = costFunc(t) + return Jgrad + + numgrad = computeNumericalGradient(costFunc_w, params) + + cost, grad = cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda) + + + print np.column_stack((numgrad, grad)) + + print 'The above two columns you get should be very similar.\n' \ + '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n' + + diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad) + + print 'If your backpropagation implementation is correct, then\n ' \ + 'the relative difference will be small (less than 1e-9). \n' \ + '\nRelative Difference: %g\n' % diff + diff --git a/ex8/cofiCostFunc.py b/ex8/cofiCostFunc.py new file mode 100644 index 0000000..017b39a --- /dev/null +++ b/ex8/cofiCostFunc.py @@ -0,0 +1,45 @@ +import numpy as np + + +def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda): + """returns the cost and gradient for the + """ + + # Unfold the U and W matrices from params + X = np.array(params[:num_movies*num_features]).reshape(num_features, num_movies).T.copy() + Theta = np.array(params[num_movies*num_features:]).reshape(num_features, num_users).T.copy() + + + # You need to return the following values correctly + J = 0 + X_grad = np.zeros(X.shape) + Theta_grad = np.zeros(Theta.shape) + + # ====================== YOUR CODE HERE ====================== + # Instructions: Compute the cost function and gradient for collaborative + # filtering. Concretely, you should first implement the cost + # function (without regularization) and make sure it is + # matches our costs. After that, you should implement the + # gradient and use the checkCostFunction routine to check + # that the gradient is correct. Finally, you should implement + # regularization. + # + # Notes: X - num_movies x num_features matrix of movie features + # Theta - num_users x num_features matrix of user features + # Y - num_movies x num_users matrix of user ratings of movies + # R - num_movies x num_users matrix, where R(i, j) = 1 if the + # i-th movie was rated by the j-th user + # + # You should set the following variables correctly: + # + # X_grad - num_movies x num_features matrix, containing the + # partial derivatives w.r.t. to each element of X + # Theta_grad - num_users x num_features matrix, containing the + # partial derivatives w.r.t. 
to each element of Theta + + + # ============================================================= + + grad = np.hstack((X_grad.T.flatten(),Theta_grad.T.flatten())) + + return J, grad diff --git a/ex8/estimateGaussian.py b/ex8/estimateGaussian.py new file mode 100644 index 0000000..b322ec6 --- /dev/null +++ b/ex8/estimateGaussian.py @@ -0,0 +1,27 @@ +import numpy as np + + +def estimateGaussian(X): + """ + This function estimates the parameters of a + Gaussian distribution using the data in X + The input X is the dataset with each n-dimensional data point in one row + The output is an n-dimensional vector mu, the mean of the data set + and the variances sigma^2, an n x 1 vector + """ + m = len(X) + + # ====================== YOUR CODE HERE ====================== + # Instructions: Compute the mean of the data and the variances + # In particular, mu(i) should contain the mean of + # the data for the i-th feature and sigma2(i) + # should contain variance of the i-th feature. + # + + + +# ============================================================= + + return mu, sigma2 + + diff --git a/ex8/ex8.py b/ex8/ex8.py new file mode 100644 index 0000000..402189f --- /dev/null +++ b/ex8/ex8.py @@ -0,0 +1,134 @@ +from matplotlib import use, cm +use('TkAgg') +import numpy as np +import scipy.io +import matplotlib.pyplot as plt + +from estimateGaussian import estimateGaussian +from selectThreshold import selectThreshold +from multivariateGaussian import multivariateGaussian +from visualizeFit import visualizeFit +from show import show + +## Machine Learning Online Class +# Exercise 8 | Anomaly Detection and Collaborative Filtering +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# estimateGaussian.m +# selectThreshold.m +# cofiCostFunc.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# + +## ================== Part 1: Load Example Dataset =================== +# We start this exercise by using a small dataset that is easy to +# visualize. +# +# Our example case consists of 2 network server statistics across +# several machines: the latency and throughput of each machine. +# This exercise will help us find possibly faulty (or very fast) machines. +# + +print 'Visualizing example dataset for outlier detection.' + +# The following command loads the dataset. You should now have the +# variables X, Xval, yval in your environment +data = scipy.io.loadmat('ex8data1.mat') +X = data['X'] +Xval = data['Xval'] +yval = data['yval'].flatten() + +# Visualize the example dataset +plt.plot(X[:, 0], X[:, 1], 'bx') +plt.axis([0, 30, 0, 30]) +plt.xlabel('Latency (ms)') +plt.ylabel('Throughput (mb/s)') +show() +raw_input("Program paused. Press Enter to continue...") + + +## ================== Part 2: Estimate the dataset statistics =================== +# For this exercise, we assume a Gaussian distribution for the dataset. +# +# We first estimate the parameters of our assumed Gaussian distribution, +# then compute the probabilities for each of the points and then visualize +# both the overall distribution and where each of the points falls in +# terms of that distribution. +# +print 'Visualizing Gaussian fit.' 
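+# A minimal sketch of what estimateGaussian is expected to return, assuming
+# X is an (m, n) array (one possible NumPy formulation, not the only one):
+#     mu     = np.mean(X, axis=0)   # per-feature mean
+#     sigma2 = np.var(X, axis=0)    # per-feature variance (divide by m, not m - 1)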
+ +# Estimate my and sigma2 +mu, sigma2 = estimateGaussian(X) + +# Returns the density of the multivariate normal at each data point (row) +# of X +p = multivariateGaussian(X, mu, sigma2) + +# Visualize the fit +visualizeFit(X, mu, sigma2) +plt.xlabel('Latency (ms)') +plt.ylabel('Throughput (mb/s)') +show() + +raw_input("Program paused. Press Enter to continue...") + +## ================== Part 3: Find Outliers =================== +# Now you will find a good epsilon threshold using a cross-validation set +# probabilities given the estimated Gaussian distribution +# + +pval = multivariateGaussian(Xval, mu, sigma2) + +epsilon, F1 = selectThreshold(yval, pval) +print 'Best epsilon found using cross-validation: %e' % epsilon +print 'Best F1 on Cross Validation Set: %f' % F1 +print ' (you should see a value epsilon of about 8.99e-05)' + +# Find the outliers in the training set and plot the +outliers = np.where(p < epsilon, True, False) + +# Draw a red circle around those outliers +plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1) +show() + +raw_input("Program paused. Press Enter to continue...") + +## ================== Part 4: Multidimensional Outliers =================== +# We will now use the code from the previous part and apply it to a +# harder problem in which more features describe each datapoint and only +# some features indicate whether a point is an outlier. +# + +# Loads the second dataset. You should now have the +# variables X, Xval, yval in your environment +data = scipy.io.loadmat('ex8data2.mat') +X = data['X'] +Xval = data['Xval'] +yval = data['yval'].flatten() + +# Apply the same steps to the larger dataset +mu, sigma2 = estimateGaussian(X) + +# Training set +p = multivariateGaussian(X, mu, sigma2) + +# Cross-validation set +pval = multivariateGaussian(Xval, mu, sigma2) + +# Find the best threshold +epsilon, F1 = selectThreshold(yval, pval) + +print 'Best epsilon found using cross-validation: %e' % epsilon +print 'Best F1 on Cross Validation Set: %f' % F1 +print '# Outliers found: %d' % sum(p < epsilon) +print ' (you should see a value epsilon of about 1.38e-18)' + + + diff --git a/ex8/ex8_cofi.py b/ex8/ex8_cofi.py new file mode 100644 index 0000000..761a655 --- /dev/null +++ b/ex8/ex8_cofi.py @@ -0,0 +1,249 @@ +## Machine Learning Online Class +# Exercise 8 | Anomaly Detection and Collaborative Filtering +# +# Instructions +# ------------ +# +# This file contains code that helps you get started on the +# exercise. You will need to complete the following functions: +# +# estimateGaussian.m +# selectThreshold.m +# cofiCostFunc.m +# +# For this exercise, you will not need to change any code in this file, +# or any other files other than those mentioned above. +# +from matplotlib import use, cm +use('TkAgg') +import numpy as np +import matplotlib.pyplot as plt +import scipy.io +from scipy.optimize import minimize +from show import show + +## =============== Part 1: Loading movie ratings dataset ================ +# You will start by loading the movie ratings dataset to understand the +# structure of the data. +# +from cofiCostFunc import cofiCostFunc +from checkCostFunction import checkCostFunction +from loadMovieList import loadMovieList +from normalizeRatings import normalizeRatings + +print 'Loading movie ratings dataset.' 
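+# For reference, the unregularized collaborative filtering cost evaluated in
+# Part 2 below can be sketched (one possible vectorized form, with X, Theta,
+# Y and R shaped as described in cofiCostFunc.py) as:
+#     J = 0.5 * np.sum(((X.dot(Theta.T) - Y) * R) ** 2)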
+ +# Load data +data = scipy.io.loadmat('ex8_movies.mat') +Y = data['Y'] +R = data['R'].astype(bool) +# Y is a 1682x943 matrix, containing ratings (1-5) of 1682 movies on +# 943 users +# +# R is a 1682x943 matrix, where R(i,j) = 1 if and only if user j gave a +# rating to movie i + +# From the matrix, we can compute statistics like average rating. +print 'Average rating for movie 1 (Toy Story): %f / 5' % np.mean(Y[0, R[0, :]]) + +# We can "visualize" the ratings matrix by plotting it with imagesc + +plt.figure() +plt.imshow(Y, aspect='equal', origin='upper', extent=(0, Y.shape[1], 0, Y.shape[0]/2.0)) +plt.ylabel('Movies') +plt.xlabel('Users') +show() + +raw_input("Program paused. Press Enter to continue...") + +## ============ Part 2: Collaborative Filtering Cost Function =========== +# You will now implement the cost function for collaborative filtering. +# To help you debug your cost function, we have included set of weights +# that we trained on that. Specifically, you should complete the code in +# cofiCostFunc.m to return J. + +# Load pre-trained weights (X, Theta, num_users, num_movies, num_features) +data = scipy.io.loadmat('ex8_movieParams.mat') +X = data['X'] +Theta = data['Theta'] +num_users = data['num_users'] +num_movies = data['num_movies'] +num_features = data['num_features'] + +# Reduce the data set size so that this runs faster +num_users = 4 +num_movies = 5 +num_features = 3 +X = X[:num_movies, :num_features] +Theta = Theta[:num_users, :num_features] +Y = Y[:num_movies, :num_users] +R = R[:num_movies, :num_users] + +# Evaluate cost function +J, grad = cofiCostFunc(np.hstack((X.T.flatten(), Theta.T.flatten())), Y, R, num_users, num_movies, + num_features, 0) + +print 'Cost at loaded parameters: %f \n(this value should be about 22.22)' % J + +raw_input("Program paused. Press Enter to continue...") + + +## ============== Part 3: Collaborative Filtering Gradient ============== +# Once your cost function matches up with ours, you should now implement +# the collaborative filtering gradient function. Specifically, you should +# complete the code in cofiCostFunc.m to return the grad argument. +# +print 'Checking Gradients (without regularization) ...' + +# Check gradients by running checkNNGradients +checkCostFunction() + +raw_input("Program paused. Press Enter to continue...") + + +## ========= Part 4: Collaborative Filtering Cost Regularization ======== +# Now, you should implement regularization for the cost function for +# collaborative filtering. You can implement it by adding the cost of +# regularization to the original cost computation. +# + +# Evaluate cost function +J, grad = cofiCostFunc(np.hstack((X.T.flatten(), Theta.T.flatten())), Y, R, num_users, num_movies, + num_features, 1.5) + +print 'Cost at loaded parameters (lambda = 1.5): %f \n(this value should be about 31.34)\n' % J + +raw_input("Program paused. Press Enter to continue...") + + +## ======= Part 5: Collaborative Filtering Gradient Regularization ====== +# Once your cost matches up with ours, you should proceed to implement +# regularization for the gradient. +# + +# +print 'Checking Gradients (with regularization) ...' + +# Check gradients by running checkNNGradients +checkCostFunction(1.5) + +raw_input("Program paused. Press Enter to continue...") + + +## ============== Part 6: Entering ratings for a new user =============== +# Before we will train the collaborative filtering model, we will first +# add ratings that correspond to a new user that we just observed. 
This +# part of the code will also allow you to put in your own ratings for the +# movies in our dataset! +# +movieList = loadMovieList() + +# Initialize my ratings +my_ratings = np.zeros(1682) + +# Check the file movie_idx.txt for id of each movie in our dataset +# For example, Toy Story (1995) has ID 1, so to rate it "4", you can set +my_ratings[0] = 4 + +# Or suppose did not enjoy Silence of the Lambs (1991), you can set +my_ratings[97] = 2 + +# We have selected a few movies we liked / did not like and the ratings we +# gave are as follows: +my_ratings[6] = 3 +my_ratings[11] = 5 +my_ratings[53] = 4 +my_ratings[63] = 5 +my_ratings[65] = 3 +my_ratings[68] = 5 +my_ratings[182] = 4 +my_ratings[225] = 5 +my_ratings[354] = 5 + +print 'New user ratings:' +for i in range(len(my_ratings)): + if my_ratings[i] > 0: + print 'Rated %d for %s\n' % (my_ratings[i], movieList[i]) + +raw_input("Program paused. Press Enter to continue...") + + +## ================== Part 7: Learning Movie Ratings ==================== +# Now, you will train the collaborative filtering model on a movie rating +# dataset of 1682 movies and 943 users +# + +print '\nTraining collaborative filtering...' + +# Load data +data = scipy.io.loadmat('ex8_movies.mat') +Y = data['Y'] +R = data['R'].astype(bool) + +# Y is a 1682x943 matrix, containing ratings (1-5) of 1682 movies by +# 943 users +# +# R is a 1682x943 matrix, where R(i,j) = 1 if and only if user j gave a +# rating to movie i + +# Add our own ratings to the data matrix +Y = np.column_stack((my_ratings, Y)) +R = np.column_stack((my_ratings, R)).astype(bool) + +# Normalize Ratings +Ynorm, Ymean = normalizeRatings(Y, R) + +# Useful Values +num_users = Y.shape[1] +num_movies = Y.shape[0] +num_features = 10 + +# Set Initial Parameters (Theta, X) +X = np.random.rand(num_movies, num_features) +Theta = np.random.rand(num_users, num_features) + +initial_parameters = np.hstack((X.T.flatten(), Theta.T.flatten())) +# Set Regularization +Lambda = 10 + +costFunc = lambda p: cofiCostFunc(p, Ynorm, R, num_users, num_movies, num_features, Lambda)[0] +gradFunc = lambda p: cofiCostFunc(p, Ynorm, R, num_users, num_movies, num_features, Lambda)[1] + +result = minimize(costFunc, initial_parameters, method='CG', jac=gradFunc, options={'disp': True, 'maxiter': 1000.0}) +theta = result.x +cost = result.fun + + +# Unfold the returned theta back into U and W +X = theta[:num_movies*num_features].reshape(num_movies, num_features) +Theta = theta[num_movies*num_features:].reshape(num_users, num_features) + +print 'Recommender system learning completed.' + +raw_input("Program paused. Press Enter to continue...") + +## ================== Part 8: Recommendation for you ==================== +# After training the model, you can now make recommendations by computing +# the predictions matrix. 
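+#  (the predicted rating of movie i by user j is X[i, :].dot(Theta[j, :]); since
+#  the ratings were mean-normalized before training, Ymean[i] is added back below)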
+# + +p = X.dot(Theta.T) +my_predictions = p[:, 0] + Ymean + +movieList = loadMovieList() + +# sort predictions descending +pre=np.array([[idx, p] for idx, p in enumerate(my_predictions)]) +post = pre[pre[:,1].argsort()[::-1]] +r = post[:,1] +ix = post[:,0] + +print '\nTop recommendations for you:' +for i in range(10): + j = int(ix[i]) + print 'Predicting rating %.1f for movie %s\n' % (my_predictions[j], movieList[j]) + +print '\nOriginal ratings provided:' +for i in range(len(my_ratings)): + if my_ratings[i] > 0: + print 'Rated %d for %s\n' % (my_ratings[i], movieList[i]) diff --git a/ex8/ex8_movieParams.mat b/ex8/ex8_movieParams.mat new file mode 100644 index 0000000..2dea689 Binary files /dev/null and b/ex8/ex8_movieParams.mat differ diff --git a/ex8/ex8_movies.mat b/ex8/ex8_movies.mat new file mode 100644 index 0000000..31ecd00 Binary files /dev/null and b/ex8/ex8_movies.mat differ diff --git a/ex8/ex8data1.mat b/ex8/ex8data1.mat new file mode 100644 index 0000000..1f08123 Binary files /dev/null and b/ex8/ex8data1.mat differ diff --git a/ex8/ex8data2.mat b/ex8/ex8data2.mat new file mode 100644 index 0000000..fe48db3 Binary files /dev/null and b/ex8/ex8data2.mat differ diff --git a/ex8/loadMovieList.py b/ex8/loadMovieList.py new file mode 100644 index 0000000..a547893 --- /dev/null +++ b/ex8/loadMovieList.py @@ -0,0 +1,24 @@ +import io + + +def loadMovieList(): + """ + reads the fixed movie list in movie.txt + and returns a cell array of the words in movieList. + """ + + ## Read the fixed movieulary list + with io.open('movie_ids.txt', encoding='ISO-8859-1') as f: + + # Store all movies in cell array movie{} + n = 1682 # Total number of movies + + movieList = [] + for i in range(n): + # Read line + line = f.readline() + # Word Index (can ignore since it will be = i) + str = line.split() + # Actual Word + movieList.append(' '.join(str[1:]).strip()) + return movieList diff --git a/ex8/movie_ids.txt b/ex8/movie_ids.txt new file mode 100644 index 0000000..392427a --- /dev/null +++ b/ex8/movie_ids.txt @@ -0,0 +1,1682 @@ +1 Toy Story (1995) +2 GoldenEye (1995) +3 Four Rooms (1995) +4 Get Shorty (1995) +5 Copycat (1995) +6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) +7 Twelve Monkeys (1995) +8 Babe (1995) +9 Dead Man Walking (1995) +10 Richard III (1995) +11 Seven (Se7en) (1995) +12 Usual Suspects, The (1995) +13 Mighty Aphrodite (1995) +14 Postino, Il (1994) +15 Mr. Holland's Opus (1995) +16 French Twist (Gazon maudit) (1995) +17 From Dusk Till Dawn (1996) +18 White Balloon, The (1995) +19 Antonia's Line (1995) +20 Angels and Insects (1995) +21 Muppet Treasure Island (1996) +22 Braveheart (1995) +23 Taxi Driver (1976) +24 Rumble in the Bronx (1995) +25 Birdcage, The (1996) +26 Brothers McMullen, The (1995) +27 Bad Boys (1995) +28 Apollo 13 (1995) +29 Batman Forever (1995) +30 Belle de jour (1967) +31 Crimson Tide (1995) +32 Crumb (1994) +33 Desperado (1995) +34 Doom Generation, The (1995) +35 Free Willy 2: The Adventure Home (1995) +36 Mad Love (1995) +37 Nadja (1994) +38 Net, The (1995) +39 Strange Days (1995) +40 To Wong Foo, Thanks for Everything! Julie Newmar (1995) +41 Billy Madison (1995) +42 Clerks (1994) +43 Disclosure (1994) +44 Dolores Claiborne (1994) +45 Eat Drink Man Woman (1994) +46 Exotica (1994) +47 Ed Wood (1994) +48 Hoop Dreams (1994) +49 I.Q. 
(1994) +50 Star Wars (1977) +51 Legends of the Fall (1994) +52 Madness of King George, The (1994) +53 Natural Born Killers (1994) +54 Outbreak (1995) +55 Professional, The (1994) +56 Pulp Fiction (1994) +57 Priest (1994) +58 Quiz Show (1994) +59 Three Colors: Red (1994) +60 Three Colors: Blue (1993) +61 Three Colors: White (1994) +62 Stargate (1994) +63 Santa Clause, The (1994) +64 Shawshank Redemption, The (1994) +65 What's Eating Gilbert Grape (1993) +66 While You Were Sleeping (1995) +67 Ace Ventura: Pet Detective (1994) +68 Crow, The (1994) +69 Forrest Gump (1994) +70 Four Weddings and a Funeral (1994) +71 Lion King, The (1994) +72 Mask, The (1994) +73 Maverick (1994) +74 Faster Pussycat! Kill! Kill! (1965) +75 Brother Minister: The Assassination of Malcolm X (1994) +76 Carlito's Way (1993) +77 Firm, The (1993) +78 Free Willy (1993) +79 Fugitive, The (1993) +80 Hot Shots! Part Deux (1993) +81 Hudsucker Proxy, The (1994) +82 Jurassic Park (1993) +83 Much Ado About Nothing (1993) +84 Robert A. Heinlein's The Puppet Masters (1994) +85 Ref, The (1994) +86 Remains of the Day, The (1993) +87 Searching for Bobby Fischer (1993) +88 Sleepless in Seattle (1993) +89 Blade Runner (1982) +90 So I Married an Axe Murderer (1993) +91 Nightmare Before Christmas, The (1993) +92 True Romance (1993) +93 Welcome to the Dollhouse (1995) +94 Home Alone (1990) +95 Aladdin (1992) +96 Terminator 2: Judgment Day (1991) +97 Dances with Wolves (1990) +98 Silence of the Lambs, The (1991) +99 Snow White and the Seven Dwarfs (1937) +100 Fargo (1996) +101 Heavy Metal (1981) +102 Aristocats, The (1970) +103 All Dogs Go to Heaven 2 (1996) +104 Theodore Rex (1995) +105 Sgt. Bilko (1996) +106 Diabolique (1996) +107 Moll Flanders (1996) +108 Kids in the Hall: Brain Candy (1996) +109 Mystery Science Theater 3000: The Movie (1996) +110 Operation Dumbo Drop (1995) +111 Truth About Cats & Dogs, The (1996) +112 Flipper (1996) +113 Horseman on the Roof, The (Hussard sur le toit, Le) (1995) +114 Wallace & Gromit: The Best of Aardman Animation (1996) +115 Haunted World of Edward D. Wood Jr., The (1995) +116 Cold Comfort Farm (1995) +117 Rock, The (1996) +118 Twister (1996) +119 Maya Lin: A Strong Clear Vision (1994) +120 Striptease (1996) +121 Independence Day (ID4) (1996) +122 Cable Guy, The (1996) +123 Frighteners, The (1996) +124 Lone Star (1996) +125 Phenomenon (1996) +126 Spitfire Grill, The (1996) +127 Godfather, The (1972) +128 Supercop (1992) +129 Bound (1996) +130 Kansas City (1996) +131 Breakfast at Tiffany's (1961) +132 Wizard of Oz, The (1939) +133 Gone with the Wind (1939) +134 Citizen Kane (1941) +135 2001: A Space Odyssey (1968) +136 Mr. 
Smith Goes to Washington (1939) +137 Big Night (1996) +138 D3: The Mighty Ducks (1996) +139 Love Bug, The (1969) +140 Homeward Bound: The Incredible Journey (1993) +141 20,000 Leagues Under the Sea (1954) +142 Bedknobs and Broomsticks (1971) +143 Sound of Music, The (1965) +144 Die Hard (1988) +145 Lawnmower Man, The (1992) +146 Unhook the Stars (1996) +147 Long Kiss Goodnight, The (1996) +148 Ghost and the Darkness, The (1996) +149 Jude (1996) +150 Swingers (1996) +151 Willy Wonka and the Chocolate Factory (1971) +152 Sleeper (1973) +153 Fish Called Wanda, A (1988) +154 Monty Python's Life of Brian (1979) +155 Dirty Dancing (1987) +156 Reservoir Dogs (1992) +157 Platoon (1986) +158 Weekend at Bernie's (1989) +159 Basic Instinct (1992) +160 Glengarry Glen Ross (1992) +161 Top Gun (1986) +162 On Golden Pond (1981) +163 Return of the Pink Panther, The (1974) +164 Abyss, The (1989) +165 Jean de Florette (1986) +166 Manon of the Spring (Manon des sources) (1986) +167 Private Benjamin (1980) +168 Monty Python and the Holy Grail (1974) +169 Wrong Trousers, The (1993) +170 Cinema Paradiso (1988) +171 Delicatessen (1991) +172 Empire Strikes Back, The (1980) +173 Princess Bride, The (1987) +174 Raiders of the Lost Ark (1981) +175 Brazil (1985) +176 Aliens (1986) +177 Good, The Bad and The Ugly, The (1966) +178 12 Angry Men (1957) +179 Clockwork Orange, A (1971) +180 Apocalypse Now (1979) +181 Return of the Jedi (1983) +182 GoodFellas (1990) +183 Alien (1979) +184 Army of Darkness (1993) +185 Psycho (1960) +186 Blues Brothers, The (1980) +187 Godfather: Part II, The (1974) +188 Full Metal Jacket (1987) +189 Grand Day Out, A (1992) +190 Henry V (1989) +191 Amadeus (1984) +192 Raging Bull (1980) +193 Right Stuff, The (1983) +194 Sting, The (1973) +195 Terminator, The (1984) +196 Dead Poets Society (1989) +197 Graduate, The (1967) +198 Nikita (La Femme Nikita) (1990) +199 Bridge on the River Kwai, The (1957) +200 Shining, The (1980) +201 Evil Dead II (1987) +202 Groundhog Day (1993) +203 Unforgiven (1992) +204 Back to the Future (1985) +205 Patton (1970) +206 Akira (1988) +207 Cyrano de Bergerac (1990) +208 Young Frankenstein (1974) +209 This Is Spinal Tap (1984) +210 Indiana Jones and the Last Crusade (1989) +211 M*A*S*H (1970) +212 Unbearable Lightness of Being, The (1988) +213 Room with a View, A (1986) +214 Pink Floyd - The Wall (1982) +215 Field of Dreams (1989) +216 When Harry Met Sally... (1989) +217 Bram Stoker's Dracula (1992) +218 Cape Fear (1991) +219 Nightmare on Elm Street, A (1984) +220 Mirror Has Two Faces, The (1996) +221 Breaking the Waves (1996) +222 Star Trek: First Contact (1996) +223 Sling Blade (1996) +224 Ridicule (1996) +225 101 Dalmatians (1996) +226 Die Hard 2 (1990) +227 Star Trek VI: The Undiscovered Country (1991) +228 Star Trek: The Wrath of Khan (1982) +229 Star Trek III: The Search for Spock (1984) +230 Star Trek IV: The Voyage Home (1986) +231 Batman Returns (1992) +232 Young Guns (1988) +233 Under Siege (1992) +234 Jaws (1975) +235 Mars Attacks! (1996) +236 Citizen Ruth (1996) +237 Jerry Maguire (1996) +238 Raising Arizona (1987) +239 Sneakers (1992) +240 Beavis and Butt-head Do America (1996) +241 Last of the Mohicans, The (1992) +242 Kolya (1996) +243 Jungle2Jungle (1997) +244 Smilla's Sense of Snow (1997) +245 Devil's Own, The (1997) +246 Chasing Amy (1997) +247 Turbo: A Power Rangers Movie (1997) +248 Grosse Pointe Blank (1997) +249 Austin Powers: International Man of Mystery (1997) +250 Fifth Element, The (1997) +251 Shall We Dance? 
(1996) +252 Lost World: Jurassic Park, The (1997) +253 Pillow Book, The (1995) +254 Batman & Robin (1997) +255 My Best Friend's Wedding (1997) +256 When the Cats Away (Chacun cherche son chat) (1996) +257 Men in Black (1997) +258 Contact (1997) +259 George of the Jungle (1997) +260 Event Horizon (1997) +261 Air Bud (1997) +262 In the Company of Men (1997) +263 Steel (1997) +264 Mimic (1997) +265 Hunt for Red October, The (1990) +266 Kull the Conqueror (1997) +267 unknown +268 Chasing Amy (1997) +269 Full Monty, The (1997) +270 Gattaca (1997) +271 Starship Troopers (1997) +272 Good Will Hunting (1997) +273 Heat (1995) +274 Sabrina (1995) +275 Sense and Sensibility (1995) +276 Leaving Las Vegas (1995) +277 Restoration (1995) +278 Bed of Roses (1996) +279 Once Upon a Time... When We Were Colored (1995) +280 Up Close and Personal (1996) +281 River Wild, The (1994) +282 Time to Kill, A (1996) +283 Emma (1996) +284 Tin Cup (1996) +285 Secrets & Lies (1996) +286 English Patient, The (1996) +287 Marvin's Room (1996) +288 Scream (1996) +289 Evita (1996) +290 Fierce Creatures (1997) +291 Absolute Power (1997) +292 Rosewood (1997) +293 Donnie Brasco (1997) +294 Liar Liar (1997) +295 Breakdown (1997) +296 Promesse, La (1996) +297 Ulee's Gold (1997) +298 Face/Off (1997) +299 Hoodlum (1997) +300 Air Force One (1997) +301 In & Out (1997) +302 L.A. Confidential (1997) +303 Ulee's Gold (1997) +304 Fly Away Home (1996) +305 Ice Storm, The (1997) +306 Mrs. Brown (Her Majesty, Mrs. Brown) (1997) +307 Devil's Advocate, The (1997) +308 FairyTale: A True Story (1997) +309 Deceiver (1997) +310 Rainmaker, The (1997) +311 Wings of the Dove, The (1997) +312 Midnight in the Garden of Good and Evil (1997) +313 Titanic (1997) +314 3 Ninjas: High Noon At Mega Mountain (1998) +315 Apt Pupil (1998) +316 As Good As It Gets (1997) +317 In the Name of the Father (1993) +318 Schindler's List (1993) +319 Everyone Says I Love You (1996) +320 Paradise Lost: The Child Murders at Robin Hood Hills (1996) +321 Mother (1996) +322 Murder at 1600 (1997) +323 Dante's Peak (1997) +324 Lost Highway (1997) +325 Crash (1996) +326 G.I. 
Jane (1997) +327 Cop Land (1997) +328 Conspiracy Theory (1997) +329 Desperate Measures (1998) +330 187 (1997) +331 Edge, The (1997) +332 Kiss the Girls (1997) +333 Game, The (1997) +334 U Turn (1997) +335 How to Be a Player (1997) +336 Playing God (1997) +337 House of Yes, The (1997) +338 Bean (1997) +339 Mad City (1997) +340 Boogie Nights (1997) +341 Critical Care (1997) +342 Man Who Knew Too Little, The (1997) +343 Alien: Resurrection (1997) +344 Apostle, The (1997) +345 Deconstructing Harry (1997) +346 Jackie Brown (1997) +347 Wag the Dog (1997) +348 Desperate Measures (1998) +349 Hard Rain (1998) +350 Fallen (1998) +351 Prophecy II, The (1998) +352 Spice World (1997) +353 Deep Rising (1998) +354 Wedding Singer, The (1998) +355 Sphere (1998) +356 Client, The (1994) +357 One Flew Over the Cuckoo's Nest (1975) +358 Spawn (1997) +359 Assignment, The (1997) +360 Wonderland (1997) +361 Incognito (1997) +362 Blues Brothers 2000 (1998) +363 Sudden Death (1995) +364 Ace Ventura: When Nature Calls (1995) +365 Powder (1995) +366 Dangerous Minds (1995) +367 Clueless (1995) +368 Bio-Dome (1996) +369 Black Sheep (1996) +370 Mary Reilly (1996) +371 Bridges of Madison County, The (1995) +372 Jeffrey (1995) +373 Judge Dredd (1995) +374 Mighty Morphin Power Rangers: The Movie (1995) +375 Showgirls (1995) +376 Houseguest (1994) +377 Heavyweights (1994) +378 Miracle on 34th Street (1994) +379 Tales From the Crypt Presents: Demon Knight (1995) +380 Star Trek: Generations (1994) +381 Muriel's Wedding (1994) +382 Adventures of Priscilla, Queen of the Desert, The (1994) +383 Flintstones, The (1994) +384 Naked Gun 33 1/3: The Final Insult (1994) +385 True Lies (1994) +386 Addams Family Values (1993) +387 Age of Innocence, The (1993) +388 Beverly Hills Cop III (1994) +389 Black Beauty (1994) +390 Fear of a Black Hat (1993) +391 Last Action Hero (1993) +392 Man Without a Face, The (1993) +393 Mrs. Doubtfire (1993) +394 Radioland Murders (1994) +395 Robin Hood: Men in Tights (1993) +396 Serial Mom (1994) +397 Striking Distance (1993) +398 Super Mario Bros. (1993) +399 Three Musketeers, The (1993) +400 Little Rascals, The (1994) +401 Brady Bunch Movie, The (1995) +402 Ghost (1990) +403 Batman (1989) +404 Pinocchio (1940) +405 Mission: Impossible (1996) +406 Thinner (1996) +407 Spy Hard (1996) +408 Close Shave, A (1995) +409 Jack (1996) +410 Kingpin (1996) +411 Nutty Professor, The (1996) +412 Very Brady Sequel, A (1996) +413 Tales from the Crypt Presents: Bordello of Blood (1996) +414 My Favorite Year (1982) +415 Apple Dumpling Gang, The (1975) +416 Old Yeller (1957) +417 Parent Trap, The (1961) +418 Cinderella (1950) +419 Mary Poppins (1964) +420 Alice in Wonderland (1951) +421 William Shakespeare's Romeo and Juliet (1996) +422 Aladdin and the King of Thieves (1996) +423 E.T. 
the Extra-Terrestrial (1982) +424 Children of the Corn: The Gathering (1996) +425 Bob Roberts (1992) +426 Transformers: The Movie, The (1986) +427 To Kill a Mockingbird (1962) +428 Harold and Maude (1971) +429 Day the Earth Stood Still, The (1951) +430 Duck Soup (1933) +431 Highlander (1986) +432 Fantasia (1940) +433 Heathers (1989) +434 Forbidden Planet (1956) +435 Butch Cassidy and the Sundance Kid (1969) +436 American Werewolf in London, An (1981) +437 Amityville 1992: It's About Time (1992) +438 Amityville 3-D (1983) +439 Amityville: A New Generation (1993) +440 Amityville II: The Possession (1982) +441 Amityville Horror, The (1979) +442 Amityville Curse, The (1990) +443 Birds, The (1963) +444 Blob, The (1958) +445 Body Snatcher, The (1945) +446 Burnt Offerings (1976) +447 Carrie (1976) +448 Omen, The (1976) +449 Star Trek: The Motion Picture (1979) +450 Star Trek V: The Final Frontier (1989) +451 Grease (1978) +452 Jaws 2 (1978) +453 Jaws 3-D (1983) +454 Bastard Out of Carolina (1996) +455 Jackie Chan's First Strike (1996) +456 Beverly Hills Ninja (1997) +457 Free Willy 3: The Rescue (1997) +458 Nixon (1995) +459 Cry, the Beloved Country (1995) +460 Crossing Guard, The (1995) +461 Smoke (1995) +462 Like Water For Chocolate (Como agua para chocolate) (1992) +463 Secret of Roan Inish, The (1994) +464 Vanya on 42nd Street (1994) +465 Jungle Book, The (1994) +466 Red Rock West (1992) +467 Bronx Tale, A (1993) +468 Rudy (1993) +469 Short Cuts (1993) +470 Tombstone (1993) +471 Courage Under Fire (1996) +472 Dragonheart (1996) +473 James and the Giant Peach (1996) +474 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) +475 Trainspotting (1996) +476 First Wives Club, The (1996) +477 Matilda (1996) +478 Philadelphia Story, The (1940) +479 Vertigo (1958) +480 North by Northwest (1959) +481 Apartment, The (1960) +482 Some Like It Hot (1959) +483 Casablanca (1942) +484 Maltese Falcon, The (1941) +485 My Fair Lady (1964) +486 Sabrina (1954) +487 Roman Holiday (1953) +488 Sunset Blvd. (1950) +489 Notorious (1946) +490 To Catch a Thief (1955) +491 Adventures of Robin Hood, The (1938) +492 East of Eden (1955) +493 Thin Man, The (1934) +494 His Girl Friday (1940) +495 Around the World in 80 Days (1956) +496 It's a Wonderful Life (1946) +497 Bringing Up Baby (1938) +498 African Queen, The (1951) +499 Cat on a Hot Tin Roof (1958) +500 Fly Away Home (1996) +501 Dumbo (1941) +502 Bananas (1971) +503 Candidate, The (1972) +504 Bonnie and Clyde (1967) +505 Dial M for Murder (1954) +506 Rebel Without a Cause (1955) +507 Streetcar Named Desire, A (1951) +508 People vs. 
Larry Flynt, The (1996) +509 My Left Foot (1989) +510 Magnificent Seven, The (1954) +511 Lawrence of Arabia (1962) +512 Wings of Desire (1987) +513 Third Man, The (1949) +514 Annie Hall (1977) +515 Boot, Das (1981) +516 Local Hero (1983) +517 Manhattan (1979) +518 Miller's Crossing (1990) +519 Treasure of the Sierra Madre, The (1948) +520 Great Escape, The (1963) +521 Deer Hunter, The (1978) +522 Down by Law (1986) +523 Cool Hand Luke (1967) +524 Great Dictator, The (1940) +525 Big Sleep, The (1946) +526 Ben-Hur (1959) +527 Gandhi (1982) +528 Killing Fields, The (1984) +529 My Life as a Dog (Mitt liv som hund) (1985) +530 Man Who Would Be King, The (1975) +531 Shine (1996) +532 Kama Sutra: A Tale of Love (1996) +533 Daytrippers, The (1996) +534 Traveller (1997) +535 Addicted to Love (1997) +536 Ponette (1996) +537 My Own Private Idaho (1991) +538 Anastasia (1997) +539 Mouse Hunt (1997) +540 Money Train (1995) +541 Mortal Kombat (1995) +542 Pocahontas (1995) +543 Misérables, Les (1995) +544 Things to Do in Denver when You're Dead (1995) +545 Vampire in Brooklyn (1995) +546 Broken Arrow (1996) +547 Young Poisoner's Handbook, The (1995) +548 NeverEnding Story III, The (1994) +549 Rob Roy (1995) +550 Die Hard: With a Vengeance (1995) +551 Lord of Illusions (1995) +552 Species (1995) +553 Walk in the Clouds, A (1995) +554 Waterworld (1995) +555 White Man's Burden (1995) +556 Wild Bill (1995) +557 Farinelli: il castrato (1994) +558 Heavenly Creatures (1994) +559 Interview with the Vampire (1994) +560 Kid in King Arthur's Court, A (1995) +561 Mary Shelley's Frankenstein (1994) +562 Quick and the Dead, The (1995) +563 Stephen King's The Langoliers (1995) +564 Tales from the Hood (1995) +565 Village of the Damned (1995) +566 Clear and Present Danger (1994) +567 Wes Craven's New Nightmare (1994) +568 Speed (1994) +569 Wolf (1994) +570 Wyatt Earp (1994) +571 Another Stakeout (1993) +572 Blown Away (1994) +573 Body Snatchers (1993) +574 Boxing Helena (1993) +575 City Slickers II: The Legend of Curly's Gold (1994) +576 Cliffhanger (1993) +577 Coneheads (1993) +578 Demolition Man (1993) +579 Fatal Instinct (1993) +580 Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995) +581 Kalifornia (1993) +582 Piano, The (1993) +583 Romeo Is Bleeding (1993) +584 Secret Garden, The (1993) +585 Son in Law (1993) +586 Terminal Velocity (1994) +587 Hour of the Pig, The (1993) +588 Beauty and the Beast (1991) +589 Wild Bunch, The (1969) +590 Hellraiser: Bloodline (1996) +591 Primal Fear (1996) +592 True Crime (1995) +593 Stalingrad (1993) +594 Heavy (1995) +595 Fan, The (1996) +596 Hunchback of Notre Dame, The (1996) +597 Eraser (1996) +598 Big Squeeze, The (1996) +599 Police Story 4: Project S (Chao ji ji hua) (1993) +600 Daniel Defoe's Robinson Crusoe (1996) +601 For Whom the Bell Tolls (1943) +602 American in Paris, An (1951) +603 Rear Window (1954) +604 It Happened One Night (1934) +605 Meet Me in St. 
Louis (1944) +606 All About Eve (1950) +607 Rebecca (1940) +608 Spellbound (1945) +609 Father of the Bride (1950) +610 Gigi (1958) +611 Laura (1944) +612 Lost Horizon (1937) +613 My Man Godfrey (1936) +614 Giant (1956) +615 39 Steps, The (1935) +616 Night of the Living Dead (1968) +617 Blue Angel, The (Blaue Engel, Der) (1930) +618 Picnic (1955) +619 Extreme Measures (1996) +620 Chamber, The (1996) +621 Davy Crockett, King of the Wild Frontier (1955) +622 Swiss Family Robinson (1960) +623 Angels in the Outfield (1994) +624 Three Caballeros, The (1945) +625 Sword in the Stone, The (1963) +626 So Dear to My Heart (1949) +627 Robin Hood: Prince of Thieves (1991) +628 Sleepers (1996) +629 Victor/Victoria (1982) +630 Great Race, The (1965) +631 Crying Game, The (1992) +632 Sophie's Choice (1982) +633 Christmas Carol, A (1938) +634 Microcosmos: Le peuple de l'herbe (1996) +635 Fog, The (1980) +636 Escape from New York (1981) +637 Howling, The (1981) +638 Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982) +639 Tin Drum, The (Blechtrommel, Die) (1979) +640 Cook the Thief His Wife & Her Lover, The (1989) +641 Paths of Glory (1957) +642 Grifters, The (1990) +643 The Innocent (1994) +644 Thin Blue Line, The (1988) +645 Paris Is Burning (1990) +646 Once Upon a Time in the West (1969) +647 Ran (1985) +648 Quiet Man, The (1952) +649 Once Upon a Time in America (1984) +650 Seventh Seal, The (Sjunde inseglet, Det) (1957) +651 Glory (1989) +652 Rosencrantz and Guildenstern Are Dead (1990) +653 Touch of Evil (1958) +654 Chinatown (1974) +655 Stand by Me (1986) +656 M (1931) +657 Manchurian Candidate, The (1962) +658 Pump Up the Volume (1990) +659 Arsenic and Old Lace (1944) +660 Fried Green Tomatoes (1991) +661 High Noon (1952) +662 Somewhere in Time (1980) +663 Being There (1979) +664 Paris, Texas (1984) +665 Alien 3 (1992) +666 Blood For Dracula (Andy Warhol's Dracula) (1974) +667 Audrey Rose (1977) +668 Blood Beach (1981) +669 Body Parts (1991) +670 Body Snatchers (1993) +671 Bride of Frankenstein (1935) +672 Candyman (1992) +673 Cape Fear (1962) +674 Cat People (1982) +675 Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922) +676 Crucible, The (1996) +677 Fire on the Mountain (1996) +678 Volcano (1997) +679 Conan the Barbarian (1981) +680 Kull the Conqueror (1997) +681 Wishmaster (1997) +682 I Know What You Did Last Summer (1997) +683 Rocket Man (1997) +684 In the Line of Fire (1993) +685 Executive Decision (1996) +686 Perfect World, A (1993) +687 McHale's Navy (1997) +688 Leave It to Beaver (1997) +689 Jackal, The (1997) +690 Seven Years in Tibet (1997) +691 Dark City (1998) +692 American President, The (1995) +693 Casino (1995) +694 Persuasion (1995) +695 Kicking and Screaming (1995) +696 City Hall (1996) +697 Basketball Diaries, The (1995) +698 Browning Version, The (1994) +699 Little Women (1994) +700 Miami Rhapsody (1995) +701 Wonderful, Horrible Life of Leni Riefenstahl, The (1993) +702 Barcelona (1994) +703 Widows' Peak (1994) +704 House of the Spirits, The (1993) +705 Singin' in the Rain (1952) +706 Bad Moon (1996) +707 Enchanted April (1991) +708 Sex, Lies, and Videotape (1989) +709 Strictly Ballroom (1992) +710 Better Off Dead... 
(1985) +711 Substance of Fire, The (1996) +712 Tin Men (1987) +713 Othello (1995) +714 Carrington (1995) +715 To Die For (1995) +716 Home for the Holidays (1995) +717 Juror, The (1996) +718 In the Bleak Midwinter (1995) +719 Canadian Bacon (1994) +720 First Knight (1995) +721 Mallrats (1995) +722 Nine Months (1995) +723 Boys on the Side (1995) +724 Circle of Friends (1995) +725 Exit to Eden (1994) +726 Fluke (1995) +727 Immortal Beloved (1994) +728 Junior (1994) +729 Nell (1994) +730 Queen Margot (Reine Margot, La) (1994) +731 Corrina, Corrina (1994) +732 Dave (1993) +733 Go Fish (1994) +734 Made in America (1993) +735 Philadelphia (1993) +736 Shadowlands (1993) +737 Sirens (1994) +738 Threesome (1994) +739 Pretty Woman (1990) +740 Jane Eyre (1996) +741 Last Supper, The (1995) +742 Ransom (1996) +743 Crow: City of Angels, The (1996) +744 Michael Collins (1996) +745 Ruling Class, The (1972) +746 Real Genius (1985) +747 Benny & Joon (1993) +748 Saint, The (1997) +749 MatchMaker, The (1997) +750 Amistad (1997) +751 Tomorrow Never Dies (1997) +752 Replacement Killers, The (1998) +753 Burnt By the Sun (1994) +754 Red Corner (1997) +755 Jumanji (1995) +756 Father of the Bride Part II (1995) +757 Across the Sea of Time (1995) +758 Lawnmower Man 2: Beyond Cyberspace (1996) +759 Fair Game (1995) +760 Screamers (1995) +761 Nick of Time (1995) +762 Beautiful Girls (1996) +763 Happy Gilmore (1996) +764 If Lucy Fell (1996) +765 Boomerang (1992) +766 Man of the Year (1995) +767 Addiction, The (1995) +768 Casper (1995) +769 Congo (1995) +770 Devil in a Blue Dress (1995) +771 Johnny Mnemonic (1995) +772 Kids (1995) +773 Mute Witness (1994) +774 Prophecy, The (1995) +775 Something to Talk About (1995) +776 Three Wishes (1995) +777 Castle Freak (1995) +778 Don Juan DeMarco (1995) +779 Drop Zone (1994) +780 Dumb & Dumber (1994) +781 French Kiss (1995) +782 Little Odessa (1994) +783 Milk Money (1994) +784 Beyond Bedlam (1993) +785 Only You (1994) +786 Perez Family, The (1995) +787 Roommates (1995) +788 Relative Fear (1994) +789 Swimming with Sharks (1995) +790 Tommy Boy (1995) +791 Baby-Sitters Club, The (1995) +792 Bullets Over Broadway (1994) +793 Crooklyn (1994) +794 It Could Happen to You (1994) +795 Richie Rich (1994) +796 Speechless (1994) +797 Timecop (1994) +798 Bad Company (1995) +799 Boys Life (1995) +800 In the Mouth of Madness (1995) +801 Air Up There, The (1994) +802 Hard Target (1993) +803 Heaven & Earth (1993) +804 Jimmy Hollywood (1994) +805 Manhattan Murder Mystery (1993) +806 Menace II Society (1993) +807 Poetic Justice (1993) +808 Program, The (1993) +809 Rising Sun (1993) +810 Shadow, The (1994) +811 Thirty-Two Short Films About Glenn Gould (1993) +812 Andre (1994) +813 Celluloid Closet, The (1995) +814 Great Day in Harlem, A (1994) +815 One Fine Day (1996) +816 Candyman: Farewell to the Flesh (1995) +817 Frisk (1995) +818 Girl 6 (1996) +819 Eddie (1996) +820 Space Jam (1996) +821 Mrs. Winterbourne (1996) +822 Faces (1968) +823 Mulholland Falls (1996) +824 Great White Hype, The (1996) +825 Arrival, The (1996) +826 Phantom, The (1996) +827 Daylight (1996) +828 Alaska (1996) +829 Fled (1996) +830 Power 98 (1995) +831 Escape from L.A. 
(1996) +832 Bogus (1996) +833 Bulletproof (1996) +834 Halloween: The Curse of Michael Myers (1995) +835 Gay Divorcee, The (1934) +836 Ninotchka (1939) +837 Meet John Doe (1941) +838 In the Line of Duty 2 (1987) +839 Loch Ness (1995) +840 Last Man Standing (1996) +841 Glimmer Man, The (1996) +842 Pollyanna (1960) +843 Shaggy Dog, The (1959) +844 Freeway (1996) +845 That Thing You Do! (1996) +846 To Gillian on Her 37th Birthday (1996) +847 Looking for Richard (1996) +848 Murder, My Sweet (1944) +849 Days of Thunder (1990) +850 Perfect Candidate, A (1996) +851 Two or Three Things I Know About Her (1966) +852 Bloody Child, The (1996) +853 Braindead (1992) +854 Bad Taste (1987) +855 Diva (1981) +856 Night on Earth (1991) +857 Paris Was a Woman (1995) +858 Amityville: Dollhouse (1996) +859 April Fool's Day (1986) +860 Believers, The (1987) +861 Nosferatu a Venezia (1986) +862 Jingle All the Way (1996) +863 Garden of Finzi-Contini, The (Giardino dei Finzi-Contini, Il) (1970) +864 My Fellow Americans (1996) +865 Ice Storm, The (1997) +866 Michael (1996) +867 Whole Wide World, The (1996) +868 Hearts and Minds (1996) +869 Fools Rush In (1997) +870 Touch (1997) +871 Vegas Vacation (1997) +872 Love Jones (1997) +873 Picture Perfect (1997) +874 Career Girls (1997) +875 She's So Lovely (1997) +876 Money Talks (1997) +877 Excess Baggage (1997) +878 That Darn Cat! (1997) +879 Peacemaker, The (1997) +880 Soul Food (1997) +881 Money Talks (1997) +882 Washington Square (1997) +883 Telling Lies in America (1997) +884 Year of the Horse (1997) +885 Phantoms (1998) +886 Life Less Ordinary, A (1997) +887 Eve's Bayou (1997) +888 One Night Stand (1997) +889 Tango Lesson, The (1997) +890 Mortal Kombat: Annihilation (1997) +891 Bent (1997) +892 Flubber (1997) +893 For Richer or Poorer (1997) +894 Home Alone 3 (1997) +895 Scream 2 (1997) +896 Sweet Hereafter, The (1997) +897 Time Tracers (1995) +898 Postman, The (1997) +899 Winter Guest, The (1997) +900 Kundun (1997) +901 Mr. Magoo (1997) +902 Big Lebowski, The (1998) +903 Afterglow (1997) +904 Ma vie en rose (My Life in Pink) (1997) +905 Great Expectations (1998) +906 Oscar & Lucinda (1997) +907 Vermin (1998) +908 Half Baked (1998) +909 Dangerous Beauty (1998) +910 Nil By Mouth (1997) +911 Twilight (1998) +912 U.S. Marshalls (1998) +913 Love and Death on Long Island (1997) +914 Wild Things (1998) +915 Primary Colors (1998) +916 Lost in Space (1998) +917 Mercury Rising (1998) +918 City of Angels (1998) +919 City of Lost Children, The (1995) +920 Two Bits (1995) +921 Farewell My Concubine (1993) +922 Dead Man (1995) +923 Raise the Red Lantern (1991) +924 White Squall (1996) +925 Unforgettable (1996) +926 Down Periscope (1996) +927 Flower of My Secret, The (Flor de mi secreto, La) (1995) +928 Craft, The (1996) +929 Harriet the Spy (1996) +930 Chain Reaction (1996) +931 Island of Dr. 
Moreau, The (1996) +932 First Kid (1996) +933 Funeral, The (1996) +934 Preacher's Wife, The (1996) +935 Paradise Road (1997) +936 Brassed Off (1996) +937 Thousand Acres, A (1997) +938 Smile Like Yours, A (1997) +939 Murder in the First (1995) +940 Airheads (1994) +941 With Honors (1994) +942 What's Love Got to Do with It (1993) +943 Killing Zoe (1994) +944 Renaissance Man (1994) +945 Charade (1963) +946 Fox and the Hound, The (1981) +947 Big Blue, The (Grand bleu, Le) (1988) +948 Booty Call (1997) +949 How to Make an American Quilt (1995) +950 Georgia (1995) +951 Indian in the Cupboard, The (1995) +952 Blue in the Face (1995) +953 Unstrung Heroes (1995) +954 Unzipped (1995) +955 Before Sunrise (1995) +956 Nobody's Fool (1994) +957 Pushing Hands (1992) +958 To Live (Huozhe) (1994) +959 Dazed and Confused (1993) +960 Naked (1993) +961 Orlando (1993) +962 Ruby in Paradise (1993) +963 Some Folks Call It a Sling Blade (1993) +964 Month by the Lake, A (1995) +965 Funny Face (1957) +966 Affair to Remember, An (1957) +967 Little Lord Fauntleroy (1936) +968 Inspector General, The (1949) +969 Winnie the Pooh and the Blustery Day (1968) +970 Hear My Song (1991) +971 Mediterraneo (1991) +972 Passion Fish (1992) +973 Grateful Dead (1995) +974 Eye for an Eye (1996) +975 Fear (1996) +976 Solo (1996) +977 Substitute, The (1996) +978 Heaven's Prisoners (1996) +979 Trigger Effect, The (1996) +980 Mother Night (1996) +981 Dangerous Ground (1997) +982 Maximum Risk (1996) +983 Rich Man's Wife, The (1996) +984 Shadow Conspiracy (1997) +985 Blood & Wine (1997) +986 Turbulence (1997) +987 Underworld (1997) +988 Beautician and the Beast, The (1997) +989 Cats Don't Dance (1997) +990 Anna Karenina (1997) +991 Keys to Tulsa (1997) +992 Head Above Water (1996) +993 Hercules (1997) +994 Last Time I Committed Suicide, The (1997) +995 Kiss Me, Guido (1997) +996 Big Green, The (1995) +997 Stuart Saves His Family (1995) +998 Cabin Boy (1994) +999 Clean Slate (1994) +1000 Lightning Jack (1994) +1001 Stupids, The (1996) +1002 Pest, The (1997) +1003 That Darn Cat! (1997) +1004 Geronimo: An American Legend (1993) +1005 Double vie de Véronique, La (Double Life of Veronique, The) (1991) +1006 Until the End of the World (Bis ans Ende der Welt) (1991) +1007 Waiting for Guffman (1996) +1008 I Shot Andy Warhol (1996) +1009 Stealing Beauty (1996) +1010 Basquiat (1996) +1011 2 Days in the Valley (1996) +1012 Private Parts (1997) +1013 Anaconda (1997) +1014 Romy and Michele's High School Reunion (1997) +1015 Shiloh (1997) +1016 Con Air (1997) +1017 Trees Lounge (1996) +1018 Tie Me Up! Tie Me Down! (1990) +1019 Die xue shuang xiong (Killer, The) (1989) +1020 Gaslight (1944) +1021 8 1/2 (1963) +1022 Fast, Cheap & Out of Control (1997) +1023 Fathers' Day (1997) +1024 Mrs. Dalloway (1997) +1025 Fire Down Below (1997) +1026 Lay of the Land, The (1997) +1027 Shooter, The (1995) +1028 Grumpier Old Men (1995) +1029 Jury Duty (1995) +1030 Beverly Hillbillies, The (1993) +1031 Lassie (1994) +1032 Little Big League (1994) +1033 Homeward Bound II: Lost in San Francisco (1996) +1034 Quest, The (1996) +1035 Cool Runnings (1993) +1036 Drop Dead Fred (1991) +1037 Grease 2 (1982) +1038 Switchback (1997) +1039 Hamlet (1996) +1040 Two if by Sea (1996) +1041 Forget Paris (1995) +1042 Just Cause (1995) +1043 Rent-a-Kid (1995) +1044 Paper, The (1994) +1045 Fearless (1993) +1046 Malice (1993) +1047 Multiplicity (1996) +1048 She's the One (1996) +1049 House Arrest (1996) +1050 Ghost and Mrs. 
Muir, The (1947) +1051 Associate, The (1996) +1052 Dracula: Dead and Loving It (1995) +1053 Now and Then (1995) +1054 Mr. Wrong (1996) +1055 Simple Twist of Fate, A (1994) +1056 Cronos (1992) +1057 Pallbearer, The (1996) +1058 War, The (1994) +1059 Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996) +1060 Adventures of Pinocchio, The (1996) +1061 Evening Star, The (1996) +1062 Four Days in September (1997) +1063 Little Princess, A (1995) +1064 Crossfire (1947) +1065 Koyaanisqatsi (1983) +1066 Balto (1995) +1067 Bottle Rocket (1996) +1068 Star Maker, The (Uomo delle stelle, L') (1995) +1069 Amateur (1994) +1070 Living in Oblivion (1995) +1071 Party Girl (1995) +1072 Pyromaniac's Love Story, A (1995) +1073 Shallow Grave (1994) +1074 Reality Bites (1994) +1075 Man of No Importance, A (1994) +1076 Pagemaster, The (1994) +1077 Love and a .45 (1994) +1078 Oliver & Company (1988) +1079 Joe's Apartment (1996) +1080 Celestial Clockwork (1994) +1081 Curdled (1996) +1082 Female Perversions (1996) +1083 Albino Alligator (1996) +1084 Anne Frank Remembered (1995) +1085 Carried Away (1996) +1086 It's My Party (1995) +1087 Bloodsport 2 (1995) +1088 Double Team (1997) +1089 Speed 2: Cruise Control (1997) +1090 Sliver (1993) +1091 Pete's Dragon (1977) +1092 Dear God (1996) +1093 Live Nude Girls (1995) +1094 Thin Line Between Love and Hate, A (1996) +1095 High School High (1996) +1096 Commandments (1997) +1097 Hate (Haine, La) (1995) +1098 Flirting With Disaster (1996) +1099 Red Firecracker, Green Firecracker (1994) +1100 What Happened Was... (1994) +1101 Six Degrees of Separation (1993) +1102 Two Much (1996) +1103 Trust (1990) +1104 C'est arrivé près de chez vous (1992) +1105 Firestorm (1998) +1106 Newton Boys, The (1998) +1107 Beyond Rangoon (1995) +1108 Feast of July (1995) +1109 Death and the Maiden (1994) +1110 Tank Girl (1995) +1111 Double Happiness (1994) +1112 Cobb (1994) +1113 Mrs. Parker and the Vicious Circle (1994) +1114 Faithful (1996) +1115 Twelfth Night (1996) +1116 Mark of Zorro, The (1940) +1117 Surviving Picasso (1996) +1118 Up in Smoke (1978) +1119 Some Kind of Wonderful (1987) +1120 I'm Not Rappaport (1996) +1121 Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964) +1122 They Made Me a Criminal (1939) +1123 Last Time I Saw Paris, The (1954) +1124 Farewell to Arms, A (1932) +1125 Innocents, The (1961) +1126 Old Man and the Sea, The (1958) +1127 Truman Show, The (1998) +1128 Heidi Fleiss: Hollywood Madam (1995) +1129 Chungking Express (1994) +1130 Jupiter's Wife (1994) +1131 Safe (1995) +1132 Feeling Minnesota (1996) +1133 Escape to Witch Mountain (1975) +1134 Get on the Bus (1996) +1135 Doors, The (1991) +1136 Ghosts of Mississippi (1996) +1137 Beautiful Thing (1996) +1138 Best Men (1997) +1139 Hackers (1995) +1140 Road to Wellville, The (1994) +1141 War Room, The (1993) +1142 When We Were Kings (1996) +1143 Hard Eight (1996) +1144 Quiet Room, The (1996) +1145 Blue Chips (1994) +1146 Calendar Girl (1993) +1147 My Family (1995) +1148 Tom & Viv (1994) +1149 Walkabout (1971) +1150 Last Dance (1996) +1151 Original Gangstas (1996) +1152 In Love and War (1996) +1153 Backbeat (1993) +1154 Alphaville (1965) +1155 Rendezvous in Paris (Rendez-vous de Paris, Les) (1995) +1156 Cyclo (1995) +1157 Relic, The (1997) +1158 Fille seule, La (A Single Girl) (1995) +1159 Stalker (1979) +1160 Love! Valour! Compassion! 
(1997) +1161 Palookaville (1996) +1162 Phat Beach (1996) +1163 Portrait of a Lady, The (1996) +1164 Zeus and Roxanne (1997) +1165 Big Bully (1996) +1166 Love & Human Remains (1993) +1167 Sum of Us, The (1994) +1168 Little Buddha (1993) +1169 Fresh (1994) +1170 Spanking the Monkey (1994) +1171 Wild Reeds (1994) +1172 Women, The (1939) +1173 Bliss (1997) +1174 Caught (1996) +1175 Hugo Pool (1997) +1176 Welcome To Sarajevo (1997) +1177 Dunston Checks In (1996) +1178 Major Payne (1994) +1179 Man of the House (1995) +1180 I Love Trouble (1994) +1181 Low Down Dirty Shame, A (1994) +1182 Cops and Robbersons (1994) +1183 Cowboy Way, The (1994) +1184 Endless Summer 2, The (1994) +1185 In the Army Now (1994) +1186 Inkwell, The (1994) +1187 Switchblade Sisters (1975) +1188 Young Guns II (1990) +1189 Prefontaine (1997) +1190 That Old Feeling (1997) +1191 Letter From Death Row, A (1998) +1192 Boys of St. Vincent, The (1993) +1193 Before the Rain (Pred dozhdot) (1994) +1194 Once Were Warriors (1994) +1195 Strawberry and Chocolate (Fresa y chocolate) (1993) +1196 Savage Nights (Nuits fauves, Les) (1992) +1197 Family Thing, A (1996) +1198 Purple Noon (1960) +1199 Cemetery Man (Dellamorte Dellamore) (1994) +1200 Kim (1950) +1201 Marlene Dietrich: Shadow and Light (1996) +1202 Maybe, Maybe Not (Bewegte Mann, Der) (1994) +1203 Top Hat (1935) +1204 To Be or Not to Be (1942) +1205 Secret Agent, The (1996) +1206 Amos & Andrew (1993) +1207 Jade (1995) +1208 Kiss of Death (1995) +1209 Mixed Nuts (1994) +1210 Virtuosity (1995) +1211 Blue Sky (1994) +1212 Flesh and Bone (1993) +1213 Guilty as Sin (1993) +1214 In the Realm of the Senses (Ai no corrida) (1976) +1215 Barb Wire (1996) +1216 Kissed (1996) +1217 Assassins (1995) +1218 Friday (1995) +1219 Goofy Movie, A (1995) +1220 Higher Learning (1995) +1221 When a Man Loves a Woman (1994) +1222 Judgment Night (1993) +1223 King of the Hill (1993) +1224 Scout, The (1994) +1225 Angus (1995) +1226 Night Falls on Manhattan (1997) +1227 Awfully Big Adventure, An (1995) +1228 Under Siege 2: Dark Territory (1995) +1229 Poison Ivy II (1995) +1230 Ready to Wear (Pret-A-Porter) (1994) +1231 Marked for Death (1990) +1232 Madonna: Truth or Dare (1991) +1233 Nénette et Boni (1996) +1234 Chairman of the Board (1998) +1235 Big Bang Theory, The (1994) +1236 Other Voices, Other Rooms (1997) +1237 Twisted (1996) +1238 Full Speed (1996) +1239 Cutthroat Island (1995) +1240 Ghost in the Shell (Kokaku kidotai) (1995) +1241 Van, The (1996) +1242 Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991) +1243 Night Flier (1997) +1244 Metro (1997) +1245 Gridlock'd (1997) +1246 Bushwhacked (1995) +1247 Bad Girls (1994) +1248 Blink (1994) +1249 For Love or Money (1993) +1250 Best of the Best 3: No Turning Back (1995) +1251 A Chef in Love (1996) +1252 Contempt (Mépris, Le) (1963) +1253 Tie That Binds, The (1995) +1254 Gone Fishin' (1997) +1255 Broken English (1996) +1256 Designated Mourner, The (1997) +1257 Designated Mourner, The (1997) +1258 Trial and Error (1997) +1259 Pie in the Sky (1995) +1260 Total Eclipse (1995) +1261 Run of the Country, The (1995) +1262 Walking and Talking (1996) +1263 Foxfire (1996) +1264 Nothing to Lose (1994) +1265 Star Maps (1997) +1266 Bread and Chocolate (Pane e cioccolata) (1973) +1267 Clockers (1995) +1268 Bitter Moon (1992) +1269 Love in the Afternoon (1957) +1270 Life with Mikey (1993) +1271 North (1994) +1272 Talking About Sex (1994) +1273 Color of Night (1994) +1274 Robocop 3 (1993) +1275 Killer (Bulletproof Heart) (1994) +1276 Sunset 
Park (1996) +1277 Set It Off (1996) +1278 Selena (1997) +1279 Wild America (1997) +1280 Gang Related (1997) +1281 Manny & Lo (1996) +1282 Grass Harp, The (1995) +1283 Out to Sea (1997) +1284 Before and After (1996) +1285 Princess Caraboo (1994) +1286 Shall We Dance? (1937) +1287 Ed (1996) +1288 Denise Calls Up (1995) +1289 Jack and Sarah (1995) +1290 Country Life (1994) +1291 Celtic Pride (1996) +1292 Simple Wish, A (1997) +1293 Star Kid (1997) +1294 Ayn Rand: A Sense of Life (1997) +1295 Kicked in the Head (1997) +1296 Indian Summer (1996) +1297 Love Affair (1994) +1298 Band Wagon, The (1953) +1299 Penny Serenade (1941) +1300 'Til There Was You (1997) +1301 Stripes (1981) +1302 Late Bloomers (1996) +1303 Getaway, The (1994) +1304 New York Cop (1996) +1305 National Lampoon's Senior Trip (1995) +1306 Delta of Venus (1994) +1307 Carmen Miranda: Bananas Is My Business (1994) +1308 Babyfever (1994) +1309 Very Natural Thing, A (1974) +1310 Walk in the Sun, A (1945) +1311 Waiting to Exhale (1995) +1312 Pompatus of Love, The (1996) +1313 Palmetto (1998) +1314 Surviving the Game (1994) +1315 Inventing the Abbotts (1997) +1316 Horse Whisperer, The (1998) +1317 Journey of August King, The (1995) +1318 Catwalk (1995) +1319 Neon Bible, The (1995) +1320 Homage (1995) +1321 Open Season (1996) +1322 Metisse (Café au Lait) (1993) +1323 Wooden Man's Bride, The (Wu Kui) (1994) +1324 Loaded (1994) +1325 August (1996) +1326 Boys (1996) +1327 Captives (1994) +1328 Of Love and Shadows (1994) +1329 Low Life, The (1994) +1330 An Unforgettable Summer (1994) +1331 Last Klezmer: Leopold Kozlowski, His Life and Music, The (1995) +1332 My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993) +1333 Midnight Dancers (Sibak) (1994) +1334 Somebody to Love (1994) +1335 American Buffalo (1996) +1336 Kazaam (1996) +1337 Larger Than Life (1996) +1338 Two Deaths (1995) +1339 Stefano Quantestorie (1993) +1340 Crude Oasis, The (1995) +1341 Hedd Wyn (1992) +1342 Convent, The (Convento, O) (1995) +1343 Lotto Land (1995) +1344 Story of Xinghua, The (1993) +1345 Day the Sun Turned Cold, The (Tianguo niezi) (1994) +1346 Dingo (1992) +1347 Ballad of Narayama, The (Narayama Bushiko) (1958) +1348 Every Other Weekend (1990) +1349 Mille bolle blu (1993) +1350 Crows and Sparrows (1949) +1351 Lover's Knot (1996) +1352 Shadow of Angels (Schatten der Engel) (1976) +1353 1-900 (1994) +1354 Venice/Venice (1992) +1355 Infinity (1996) +1356 Ed's Next Move (1996) +1357 For the Moment (1994) +1358 The Deadly Cure (1996) +1359 Boys in Venice (1996) +1360 Sexual Life of the Belgians, The (1994) +1361 Search for One-eye Jimmy, The (1996) +1362 American Strays (1996) +1363 Leopard Son, The (1996) +1364 Bird of Prey (1996) +1365 Johnny 100 Pesos (1993) +1366 JLG/JLG - autoportrait de décembre (1994) +1367 Faust (1994) +1368 Mina Tannenbaum (1994) +1369 Forbidden Christ, The (Cristo proibito, Il) (1950) +1370 I Can't Sleep (J'ai pas sommeil) (1994) +1371 Machine, The (1994) +1372 Stranger, The (1994) +1373 Good Morning (1971) +1374 Falling in Love Again (1980) +1375 Cement Garden, The (1993) +1376 Meet Wally Sparks (1997) +1377 Hotel de Love (1996) +1378 Rhyme & Reason (1997) +1379 Love and Other Catastrophes (1996) +1380 Hollow Reed (1996) +1381 Losing Chase (1996) +1382 Bonheur, Le (1965) +1383 Second Jungle Book: Mowgli & Baloo, The (1997) +1384 Squeeze (1996) +1385 Roseanna's Grave (For Roseanna) (1997) +1386 Tetsuo II: Body Hammer (1992) +1387 Fall (1997) +1388 Gabbeh (1996) +1389 Mondo (1996) +1390 Innocent Sleep, The (1995) 
+1391 For Ever Mozart (1996) +1392 Locusts, The (1997) +1393 Stag (1997) +1394 Swept from the Sea (1997) +1395 Hurricane Streets (1998) +1396 Stonewall (1995) +1397 Of Human Bondage (1934) +1398 Anna (1996) +1399 Stranger in the House (1997) +1400 Picture Bride (1995) +1401 M. Butterfly (1993) +1402 Ciao, Professore! (1993) +1403 Caro Diario (Dear Diary) (1994) +1404 Withnail and I (1987) +1405 Boy's Life 2 (1997) +1406 When Night Is Falling (1995) +1407 Specialist, The (1994) +1408 Gordy (1995) +1409 Swan Princess, The (1994) +1410 Harlem (1993) +1411 Barbarella (1968) +1412 Land Before Time III: The Time of the Great Giving (1995) (V) +1413 Street Fighter (1994) +1414 Coldblooded (1995) +1415 Next Karate Kid, The (1994) +1416 No Escape (1994) +1417 Turning, The (1992) +1418 Joy Luck Club, The (1993) +1419 Highlander III: The Sorcerer (1994) +1420 Gilligan's Island: The Movie (1998) +1421 My Crazy Life (Mi vida loca) (1993) +1422 Suture (1993) +1423 Walking Dead, The (1995) +1424 I Like It Like That (1994) +1425 I'll Do Anything (1994) +1426 Grace of My Heart (1996) +1427 Drunks (1995) +1428 SubUrbia (1997) +1429 Sliding Doors (1998) +1430 Ill Gotten Gains (1997) +1431 Legal Deceit (1997) +1432 Mighty, The (1998) +1433 Men of Means (1998) +1434 Shooting Fish (1997) +1435 Steal Big, Steal Little (1995) +1436 Mr. Jones (1993) +1437 House Party 3 (1994) +1438 Panther (1995) +1439 Jason's Lyric (1994) +1440 Above the Rim (1994) +1441 Moonlight and Valentino (1995) +1442 Scarlet Letter, The (1995) +1443 8 Seconds (1994) +1444 That Darn Cat! (1965) +1445 Ladybird Ladybird (1994) +1446 Bye Bye, Love (1995) +1447 Century (1993) +1448 My Favorite Season (1993) +1449 Pather Panchali (1955) +1450 Golden Earrings (1947) +1451 Foreign Correspondent (1940) +1452 Lady of Burlesque (1943) +1453 Angel on My Shoulder (1946) +1454 Angel and the Badman (1947) +1455 Outlaw, The (1943) +1456 Beat the Devil (1954) +1457 Love Is All There Is (1996) +1458 Damsel in Distress, A (1937) +1459 Madame Butterfly (1995) +1460 Sleepover (1995) +1461 Here Comes Cookie (1935) +1462 Thieves (Voleurs, Les) (1996) +1463 Boys, Les (1997) +1464 Stars Fell on Henrietta, The (1995) +1465 Last Summer in the Hamptons (1995) +1466 Margaret's Museum (1995) +1467 Saint of Fort Washington, The (1993) +1468 Cure, The (1995) +1469 Tom and Huck (1995) +1470 Gumby: The Movie (1995) +1471 Hideaway (1995) +1472 Visitors, The (Visiteurs, Les) (1993) +1473 Little Princess, The (1939) +1474 Nina Takes a Lover (1994) +1475 Bhaji on the Beach (1993) +1476 Raw Deal (1948) +1477 Nightwatch (1997) +1478 Dead Presidents (1995) +1479 Reckless (1995) +1480 Herbie Rides Again (1974) +1481 S.F.W. 
(1994) +1482 Gate of Heavenly Peace, The (1995) +1483 Man in the Iron Mask, The (1998) +1484 Jerky Boys, The (1994) +1485 Colonel Chabert, Le (1994) +1486 Girl in the Cadillac (1995) +1487 Even Cowgirls Get the Blues (1993) +1488 Germinal (1993) +1489 Chasers (1994) +1490 Fausto (1993) +1491 Tough and Deadly (1995) +1492 Window to Paris (1994) +1493 Modern Affair, A (1995) +1494 Mostro, Il (1994) +1495 Flirt (1995) +1496 Carpool (1996) +1497 Line King: Al Hirschfeld, The (1996) +1498 Farmer & Chase (1995) +1499 Grosse Fatigue (1994) +1500 Santa with Muscles (1996) +1501 Prisoner of the Mountains (Kavkazsky Plennik) (1996) +1502 Naked in New York (1994) +1503 Gold Diggers: The Secret of Bear Mountain (1995) +1504 Bewegte Mann, Der (1994) +1505 Killer: A Journal of Murder (1995) +1506 Nelly & Monsieur Arnaud (1995) +1507 Three Lives and Only One Death (1996) +1508 Babysitter, The (1995) +1509 Getting Even with Dad (1994) +1510 Mad Dog Time (1996) +1511 Children of the Revolution (1996) +1512 World of Apu, The (Apur Sansar) (1959) +1513 Sprung (1997) +1514 Dream With the Fishes (1997) +1515 Wings of Courage (1995) +1516 Wedding Gift, The (1994) +1517 Race the Sun (1996) +1518 Losing Isaiah (1995) +1519 New Jersey Drive (1995) +1520 Fear, The (1995) +1521 Mr. Wonderful (1993) +1522 Trial by Jury (1994) +1523 Good Man in Africa, A (1994) +1524 Kaspar Hauser (1993) +1525 Object of My Affection, The (1998) +1526 Witness (1985) +1527 Senseless (1998) +1528 Nowhere (1997) +1529 Underground (1995) +1530 Jefferson in Paris (1995) +1531 Far From Home: The Adventures of Yellow Dog (1995) +1532 Foreign Student (1994) +1533 I Don't Want to Talk About It (De eso no se habla) (1993) +1534 Twin Town (1997) +1535 Enfer, L' (1994) +1536 Aiqing wansui (1994) +1537 Cosi (1996) +1538 All Over Me (1997) +1539 Being Human (1993) +1540 Amazing Panda Adventure, The (1995) +1541 Beans of Egypt, Maine, The (1994) +1542 Scarlet Letter, The (1926) +1543 Johns (1996) +1544 It Takes Two (1995) +1545 Frankie Starlight (1995) +1546 Shadows (Cienie) (1988) +1547 Show, The (1995) +1548 The Courtyard (1995) +1549 Dream Man (1995) +1550 Destiny Turns on the Radio (1995) +1551 Glass Shield, The (1994) +1552 Hunted, The (1995) +1553 Underneath, The (1995) +1554 Safe Passage (1994) +1555 Secret Adventures of Tom Thumb, The (1993) +1556 Condition Red (1995) +1557 Yankee Zulu (1994) +1558 Aparajito (1956) +1559 Hostile Intentions (1994) +1560 Clean Slate (Coup de Torchon) (1981) +1561 Tigrero: A Film That Was Never Made (1994) +1562 Eye of Vichy, The (Oeil de Vichy, L') (1993) +1563 Promise, The (Versprechen, Das) (1994) +1564 To Cross the Rubicon (1991) +1565 Daens (1992) +1566 Man from Down Under, The (1943) +1567 Careful (1992) +1568 Vermont Is For Lovers (1992) +1569 Vie est belle, La (Life is Rosey) (1987) +1570 Quartier Mozart (1992) +1571 Touki Bouki (Journey of the Hyena) (1973) +1572 Wend Kuuni (God's Gift) (1982) +1573 Spirits of the Dead (Tre passi nel delirio) (1968) +1574 Pharaoh's Army (1995) +1575 I, Worst of All (Yo, la peor de todas) (1990) +1576 Hungarian Fairy Tale, A (1987) +1577 Death in the Garden (Mort en ce jardin, La) (1956) +1578 Collectionneuse, La (1967) +1579 Baton Rouge (1988) +1580 Liebelei (1933) +1581 Woman in Question, The (1950) +1582 T-Men (1947) +1583 Invitation, The (Zaproszenie) (1986) +1584 Symphonie pastorale, La (1946) +1585 American Dream (1990) +1586 Lashou shentan (1992) +1587 Terror in a Texas Town (1958) +1588 Salut cousin! 
(1996) +1589 Schizopolis (1996) +1590 To Have, or Not (1995) +1591 Duoluo tianshi (1995) +1592 Magic Hour, The (1998) +1593 Death in Brunswick (1991) +1594 Everest (1998) +1595 Shopping (1994) +1596 Nemesis 2: Nebula (1995) +1597 Romper Stomper (1992) +1598 City of Industry (1997) +1599 Someone Else's America (1995) +1600 Guantanamera (1994) +1601 Office Killer (1997) +1602 Price Above Rubies, A (1998) +1603 Angela (1995) +1604 He Walked by Night (1948) +1605 Love Serenade (1996) +1606 Deceiver (1997) +1607 Hurricane Streets (1998) +1608 Buddy (1997) +1609 B*A*P*S (1997) +1610 Truth or Consequences, N.M. (1997) +1611 Intimate Relations (1996) +1612 Leading Man, The (1996) +1613 Tokyo Fist (1995) +1614 Reluctant Debutante, The (1958) +1615 Warriors of Virtue (1997) +1616 Desert Winds (1995) +1617 Hugo Pool (1997) +1618 King of New York (1990) +1619 All Things Fair (1996) +1620 Sixth Man, The (1997) +1621 Butterfly Kiss (1995) +1622 Paris, France (1993) +1623 Cérémonie, La (1995) +1624 Hush (1998) +1625 Nightwatch (1997) +1626 Nobody Loves Me (Keiner liebt mich) (1994) +1627 Wife, The (1995) +1628 Lamerica (1994) +1629 Nico Icon (1995) +1630 Silence of the Palace, The (Saimt el Qusur) (1994) +1631 Slingshot, The (1993) +1632 Land and Freedom (Tierra y libertad) (1995) +1633 Á köldum klaka (Cold Fever) (1994) +1634 Etz Hadomim Tafus (Under the Domin Tree) (1994) +1635 Two Friends (1986) +1636 Brothers in Trouble (1995) +1637 Girls Town (1996) +1638 Normal Life (1996) +1639 Bitter Sugar (Azucar Amargo) (1996) +1640 Eighth Day, The (1996) +1641 Dadetown (1995) +1642 Some Mother's Son (1996) +1643 Angel Baby (1995) +1644 Sudden Manhattan (1996) +1645 Butcher Boy, The (1998) +1646 Men With Guns (1997) +1647 Hana-bi (1997) +1648 Niagara, Niagara (1997) +1649 Big One, The (1997) +1650 Butcher Boy, The (1998) +1651 Spanish Prisoner, The (1997) +1652 Temptress Moon (Feng Yue) (1996) +1653 Entertaining Angels: The Dorothy Day Story (1996) +1654 Chairman of the Board (1998) +1655 Favor, The (1994) +1656 Little City (1998) +1657 Target (1995) +1658 Substance of Fire, The (1996) +1659 Getting Away With Murder (1996) +1660 Small Faces (1995) +1661 New Age, The (1994) +1662 Rough Magic (1995) +1663 Nothing Personal (1995) +1664 8 Heads in a Duffel Bag (1997) +1665 Brother's Kiss, A (1997) +1666 Ripe (1996) +1667 Next Step, The (1995) +1668 Wedding Bell Blues (1996) +1669 MURDER and murder (1996) +1670 Tainted (1998) +1671 Further Gesture, A (1996) +1672 Kika (1993) +1673 Mirage (1995) +1674 Mamma Roma (1962) +1675 Sunchaser, The (1996) +1676 War at Home, The (1996) +1677 Sweet Nothing (1995) +1678 Mat' i syn (1997) +1679 B. Monkey (1998) +1680 Sliding Doors (1998) +1681 You So Crazy (1994) +1682 Scream of Stone (Schrei aus Stein) (1991) diff --git a/ex8/multivariateGaussian.py b/ex8/multivariateGaussian.py new file mode 100644 index 0000000..d2c1944 --- /dev/null +++ b/ex8/multivariateGaussian.py @@ -0,0 +1,21 @@ +import numpy as np + + +def multivariateGaussian(X, mu, Sigma2): + """Computes the probability + density function of the examples X under the multivariate gaussian + distribution with parameters mu and Sigma2. If Sigma2 is a matrix, it is + treated as the covariance matrix. 
If Sigma2 is a vector, it is treated + as the \sigma^2 values of the variances in each dimension (a diagonal + covariance matrix) + """ + k = len(mu) + + if Sigma2.ndim == 1: + Sigma2 = np.diag(Sigma2) + + X = X - mu + p = (2 * np.pi) ** (- k / 2.0) * np.linalg.det(Sigma2) ** (-0.5) * \ + np.exp(-0.5 * np.sum(X.dot(np.linalg.pinv(Sigma2))*X, axis=1)) + + return p \ No newline at end of file diff --git a/ex8/normalizeRatings.py b/ex8/normalizeRatings.py new file mode 100644 index 0000000..2bab007 --- /dev/null +++ b/ex8/normalizeRatings.py @@ -0,0 +1,22 @@ +import numpy as np + + +def normalizeRatings(Y, R): + """normalizes Y so that each movie has a rating of 0 on average, + and returns the mean rating in Ymean. + """ + + m, n = Y.shape + Ymean = np.zeros(m) + Ynorm = np.zeros(Y.shape) + + for i in range(m): + idx = (R[i,:]==1).nonzero()[0] + if len(idx): + Ymean[i] = np.mean(Y[i, idx]) + Ynorm[i, idx] = Y[i, idx] - Ymean[i] + else: + Ymean[i] = 0.0 + Ynorm[i,idx] = 0.0 + + return Ynorm, Ymean diff --git a/ex8/selectThreshold.py b/ex8/selectThreshold.py new file mode 100644 index 0000000..1ab72ab --- /dev/null +++ b/ex8/selectThreshold.py @@ -0,0 +1,40 @@ +import numpy as np +import math + +def selectThreshold(yval, pval): + """ + finds the best + threshold to use for selecting outliers based on the results from a + validation set (pval) and the ground truth (yval). + """ + + bestEpsilon = 0 + bestF1 = 0 + + stepsize = (np.max(pval) - np.min(pval)) / 1000.0 + for epsilon in np.arange(np.min(pval),np.max(pval), stepsize): + + # ====================== YOUR CODE HERE ====================== + # Instructions: Compute the F1 score of choosing epsilon as the + # threshold and place the value in F1. The code at the + # end of the loop will compare the F1 score for this + # choice of epsilon and set it to be the best epsilon if + # it is better than the current choice of epsilon. 
+ # + # Note: You can use predictions = (pval < epsilon) to get a binary vector + # of 0's and 1's of the outlier predictions + + + # ============================================================= + + if F1 > bestF1: + bestF1 = F1 + bestEpsilon = epsilon + + return bestEpsilon, bestF1 + + + + + + diff --git a/ex8/submit.py b/ex8/submit.py new file mode 100644 index 0000000..dd67f92 --- /dev/null +++ b/ex8/submit.py @@ -0,0 +1,64 @@ +import numpy as np + +from Submission import Submission +from Submission import sprintf + +homework = 'anomaly-detection-and-recommender-systems' + +part_names = [ + 'Estimate Gaussian Parameters', + 'Select Threshold', + 'Collaborative Filtering Cost', + 'Collaborative Filtering Gradient', + 'Regularized Cost', + 'Regularized Gradient', + ] + +srcs = [ + 'estimateGaussian.py', + 'selectThreshold.py', + 'cofiCostFunc.py', + 'cofiCostFunc.py', + 'cofiCostFunc.py', + 'cofiCostFunc.py', + ] + + +def output(part_id): + # Random Test Cases + n_u = 3 + n_m = 4 + n = 5 + X = np.sin(np.arange(1,n_m*n+1)).reshape((n, n_m)).T + Theta = np.cos(np.arange(1,n_u*n+1)).reshape((n, n_u)).T + Y = np.sin(np.arange(1,2.0*n_m*n_u,2.0)).reshape((n_u, n_m)).T + R = Y > 0.5 + pval = np.hstack((np.abs(Y.T.flatten()), 0.001, 1.0)) + yval = np.hstack((R.T.flatten(), 1.0, 0.0)).astype('bool') + params = np.hstack((X.T.flatten(), Theta.T.flatten())) + + fname = srcs[part_id-1].rsplit('.',1)[0] + mod = __import__(fname, fromlist=[fname], level=1) + func = getattr(mod, fname) + + if part_id == 1: + mu, sigma2 = func(X) + return sprintf('%0.5f ', np.hstack((mu.T.flatten(), sigma2.T.flatten()))) + elif part_id == 2: + bestEpsilon, bestF1 = func(yval, pval) + return sprintf('%0.5f ', np.hstack((bestEpsilon, bestF1))) + elif part_id == 3: + J, grad = func(params, Y, R, n_u, n_m, n, 0.0) + return sprintf('%0.5f ', J) + elif part_id == 4: + J, grad = func(params, Y, R, n_u, n_m, n, 0.0) + return sprintf('%0.5f ', grad.T.flatten()) + elif part_id == 5: + J, grad = func(params, Y, R, n_u, n_m, n, 1.5) + return sprintf('%0.5f ', J) + elif part_id == 6: + J, grad = func(params, Y, R, n_u, n_m, n, 1.5) + return sprintf('%0.5f ', grad.T.flatten()) + +s = Submission(homework, part_names, srcs, output) +s.submit() diff --git a/ex8/visualizeFit.py b/ex8/visualizeFit.py new file mode 100644 index 0000000..fd4e290 --- /dev/null +++ b/ex8/visualizeFit.py @@ -0,0 +1,24 @@ +import matplotlib.pyplot as plt +import numpy as np +from math import isinf +from multivariateGaussian import multivariateGaussian + +from show import show + + +def visualizeFit(X, mu, sigma2): + """ + This visualization shows you the + probability density function of the Gaussian distribution. Each example + has a location (x1, x2) that depends on its feature values. + """ + n = np.linspace(0,35,71) + X1 = np.meshgrid(n,n) + Z = multivariateGaussian(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten())),mu,sigma2) + Z = Z.reshape(X1[0].shape) + + plt.plot(X[:, 0], X[:, 1],'bx') + # Do not plot if there are infinities + if not isinf(np.sum(Z)): + plt.contour(X1[0], X1[1], Z, 10.0**np.arange(-20, 0, 3).T) + show() diff --git a/show.py b/show.py new file mode 100644 index 0000000..0a801e8 --- /dev/null +++ b/show.py @@ -0,0 +1,9 @@ +from matplotlib import use +use('TkAgg') +import matplotlib.pyplot as plt + +def show(): + wm = plt.get_current_fig_manager() + wm.window.wm_geometry("+0+0") + plt.show(block=False) + wm.window.attributes('-topmost', 1)
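A minimal usage sketch for ex8/multivariateGaussian.py above, assuming mu and Sigma2 are the per-feature sample means and variances (which is what ex8's estimateGaussian.py is expected to return); the toy data and variable names below are illustrative only, not part of the assignment:

import numpy as np
from multivariateGaussian import multivariateGaussian

# toy data: 50 examples, 2 features (illustrative only)
X = np.random.randn(50, 2) * [1.0, 2.0] + [3.0, -1.0]

mu = X.mean(axis=0)        # per-feature mean
sigma2 = X.var(axis=0)     # per-feature variance, treated as a diagonal covariance

p = multivariateGaussian(X, mu, sigma2)
print(p.shape)             # (50,)  one density value per example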
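The YOUR CODE HERE block in ex8/selectThreshold.py is intentionally left blank for the exercise. Purely as a hedged sketch of the F1 computation the instruction comments describe (the helper name f1_for_epsilon is illustrative and not part of the assignment), one way to fill in F1 inside the loop is:

import numpy as np

def f1_for_epsilon(pval, yval, epsilon):
    # flag an example as an anomaly when its density is below the threshold
    predictions = (pval < epsilon)
    tp = np.sum((predictions == 1) & (yval == 1))   # true positives
    fp = np.sum((predictions == 1) & (yval == 0))   # false positives
    fn = np.sum((predictions == 0) & (yval == 1))   # false negatives
    if tp + fp == 0 or tp + fn == 0:
        return 0.0
    prec = float(tp) / (tp + fp)
    rec = float(tp) / (tp + fn)
    if prec + rec == 0:
        return 0.0
    return 2.0 * prec * rec / (prec + rec)

Inside selectThreshold's loop, F1 = f1_for_epsilon(pval, yval, epsilon) would then feed the existing bestF1/bestEpsilon bookkeeping unchanged.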
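A quick sanity check for ex8/normalizeRatings.py as corrected above (the loop runs over the m movies, i.e. the rows of Y): only entries with R == 1 are shifted by the per-movie mean, and unrated entries stay at zero. The toy matrices are illustrative.

import numpy as np
from normalizeRatings import normalizeRatings

Y = np.array([[5.0, 0.0, 4.0],     # ratings: 2 movies x 3 users
              [0.0, 0.0, 0.0]])
R = np.array([[1, 0, 1],           # 1 where a rating exists
              [0, 0, 0]])

Ynorm, Ymean = normalizeRatings(Y, R)
print(Ymean)   # [ 4.5  0. ]   mean of the rated entries per movie
print(Ynorm)   # [[ 0.5  0.  -0.5]
               #  [ 0.   0.   0. ]]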
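ex8/submit.py calls cofiCostFunc(params, Y, R, n_u, n_m, n, lambda) and expects a (J, grad) pair back; the graded cofiCostFunc.py is the student's to write and is not shown here. Purely as a sketch of the standard regularized collaborative-filtering cost and gradient, with a parameter-unrolling convention assumed to mirror how submit.py packs params (X.T.flatten() followed by Theta.T.flatten()); the names cofi_cost_sketch, lam, and err are illustrative:

import numpy as np

def cofi_cost_sketch(params, Y, R, num_users, num_movies, num_features, lam):
    # unroll params into X (num_movies x num_features) and Theta (num_users x num_features)
    X = params[:num_movies * num_features].reshape(num_features, num_movies).T
    Theta = params[num_movies * num_features:].reshape(num_features, num_users).T

    err = (X.dot(Theta.T) - Y) * R                 # prediction error, only where R == 1
    J = 0.5 * np.sum(err ** 2) + lam / 2.0 * (np.sum(Theta ** 2) + np.sum(X ** 2))

    X_grad = err.dot(Theta) + lam * X              # dJ/dX
    Theta_grad = err.T.dot(X) + lam * Theta        # dJ/dTheta
    grad = np.hstack((X_grad.T.flatten(), Theta_grad.T.flatten()))
    return J, grad

This sketch only illustrates the usual cost/gradient formulas; it does not claim to match the grader's expected unrolling or outputs.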