Skip to content

Commit

Permalink
Update ex6
Browse files Browse the repository at this point in the history
  • Loading branch information
jtlowery committed Apr 16, 2017
1 parent ad702fd commit df95336
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 103 deletions.
25 changes: 13 additions & 12 deletions ex6/dataset3Params.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@ def dataset3Params(X, y, Xval, yval):
C = 1
sigma = 0.3

# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return the optimal C and sigma
# learning parameters found using the cross validation set.
# You can use svmPredict to predict the labels on the cross
# validation set. For example,
# predictions = svmPredict(model, Xval)
# will return the predictions on the cross validation set.
#
# Note: You can compute the prediction error using
# mean(double(predictions ~= yval))
#
# =========================================================================
# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return the optimal C and sigma
# learning parameters found using the cross validation set.
# You can use svmPredict to predict the labels on the cross
# validation set. For example,
# predictions = svmPredict(model, Xval)
# will return the predictions on the cross validation set.
#
# Note: You can compute the prediction error using
# mean(double(predictions ~= yval))
#
# =========================================================================

return C, sigma
77 changes: 39 additions & 38 deletions ex6/emailFeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,43 +11,44 @@ def emailFeatures(word_indices):

# You need to return the following variables correctly.
x = np.zeros(n)
# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return a feature vector for the
# given email (word_indices). To help make it easier to
# process the emails, we have already pre-processed each
# email and converted each word in the email into an index in
# a fixed dictionary (of 1899 words). The variable
# word_indices contains the list of indices of the words
# which occur in one email.
#
# Concretely, if an email has the text:
#
# The quick brown fox jumped over the lazy dog.
#
# Then, the word_indices vector for this text might look
# like:
#
# 60 100 33 44 10 53 60 58 5
#
# where, we have mapped each word onto a number, for example:
#
# the -- 60
# quick -- 100
# ...
#
# (note: the above numbers are just an example and are not the
# actual mappings).
#
# Your task is to take one such word_indices vector and construct
# a binary feature vector that indicates whether a particular
# word occurs in the email. That is, x(i) = 1 when word i
# is present in the email. Concretely, if the word 'the' (say,
# index 60) appears in the email, then x(60) = 1. The feature
# vector should look like:
#
# x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]
#
#
# =========================================================================

# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return a feature vector for the
# given email (word_indices). To help make it easier to
# process the emails, we have already pre-processed each
# email and converted each word in the email into an index in
# a fixed dictionary (of 1899 words). The variable
# word_indices contains the list of indices of the words
# which occur in one email.
#
# Concretely, if an email has the text:
#
# The quick brown fox jumped over the lazy dog.
#
# Then, the word_indices vector for this text might look
# like:
#
# 60 100 33 44 10 53 60 58 5
#
# where, we have mapped each word onto a number, for example:
#
# the -- 60
# quick -- 100
# ...
#
# (note: the above numbers are just an example and are not the
# actual mappings).
#
# Your task is to take one such word_indices vector and construct
# a binary feature vector that indicates whether a particular
# word occurs in the email. That is, x(i) = 1 when word i
# is present in the email. Concretely, if the word 'the' (say,
# index 60) appears in the email, then x(60) = 1. The feature
# vector should look like:
#
# x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]
#
#
# =========================================================================

return x
29 changes: 11 additions & 18 deletions ex6/ex6.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Machine Learning Online Class
# Machine Learning Online Class
# Exercise 6 | Support Vector Machines
#
# Instructions
Expand All @@ -14,7 +14,7 @@
#
# For this exercise, you will not need to change any code in this file,
# or any other files other than those mentioned above.
#

from matplotlib import use, cm
use('TkAgg')
import numpy as np
Expand All @@ -25,11 +25,10 @@
from visualizeBoundary import visualizeBoundary
from visualizeBoundaryLinear import visualizeBoundaryLinear

## =============== Part 1: Loading and Visualizing Data ================
# =============== Part 1: Loading and Visualizing Data ================
# We start the exercise by first loading and visualizing the dataset.
# The following code will load the dataset into your environment and plot
# the data.
#

print('Loading and Visualizing Data ...')

Expand All @@ -44,10 +43,9 @@

input('Program paused. Press Enter to continue...')

## ==================== Part 2: Training Linear SVM ====================
# ==================== Part 2: Training Linear SVM ====================
# The following code will train a linear SVM on the dataset and plot the
# decision boundary learned.
#

# Load from ex6data1:
# You will have X, y in your environment
Expand All @@ -67,10 +65,10 @@

input('Program paused. Press Enter to continue...')

## =============== Part 3: Implementing Gaussian Kernel ===============
# =============== Part 3: Implementing Gaussian Kernel ===============
# You will now implement the Gaussian kernel to use
# with the SVM. You should complete the code in gaussianKernel.py
#

print('Evaluating the Gaussian Kernel ...')

x1 = np.array([1, 2, 1])
Expand All @@ -83,10 +81,9 @@

input('Program paused. Press Enter to continue...')

## =============== Part 4: Visualizing Dataset 2 ================
# =============== Part 4: Visualizing Dataset 2 ================
# The following code will load the next dataset into your environment and
# plot the data.
#

print('Loading and Visualizing Data ...')

Expand All @@ -101,7 +98,7 @@

input('Program paused. Press Enter to continue...')

## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
# ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
# After you have implemented the kernel, we can now use it to train the
# SVM classifier.
#
Expand All @@ -128,10 +125,9 @@

input('Program paused. Press Enter to continue...')

## =============== Part 6: Visualizing Dataset 3 ================
# =============== Part 6: Visualizing Dataset 3 ================
# The following code will load the next dataset into your environment and
# plot the data.
#

print('Loading and Visualizing Data ...')

Expand All @@ -146,11 +142,9 @@

input('Program paused. Press Enter to continue...')

## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========

# ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
# This is a different dataset that you can use to experiment with. Try
# different values of C and sigma here.
#

# Load from ex6data3:
# You will have X, y in your environment
Expand All @@ -161,11 +155,10 @@
# Try different SVM Parameters here
C, sigma = dataset3Params(X, y, Xval, yval)
gamma = 1.0 / (2.0 * sigma ** 2)
# Train the SVM

# Train the SVM
clf = svm.SVC(C=C, kernel='rbf', tol=1e-3, max_iter=200, gamma=gamma)
model = clf.fit(X, y)
visualizeBoundary(X, y, model)

input('Program paused. Press Enter to continue...')

24 changes: 11 additions & 13 deletions ex6/ex6_spam.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Machine Learning Online Class
# Machine Learning Online Class
# Exercise 6 | Spam Classification with SVMs
#
# Instructions
Expand All @@ -14,7 +14,7 @@
#
# For this exercise, you will not need to change any code in this file,
# or any other files other than those mentioned above.
#

import numpy as np
import scipy.io
from sklearn import svm
Expand All @@ -24,7 +24,7 @@
from emailFeatures import emailFeatures
from getVocabList import getVocabList

## ==================== Part 1: Email Preprocessing ====================
# ==================== Part 1: Email Preprocessing ====================
# To use an SVM to classify emails into Spam vs. Non-Spam, you first need
# to convert each email into a vector of features. In this part, you will
# implement the preprocessing steps for each email. You should
Expand All @@ -36,15 +36,15 @@
# Extract Features
file = open('emailSample1.txt', 'r')
file_contents = file.readlines()
word_indices = processEmail(''.join(file_contents))
word_indices = processEmail(''.join(file_contents))

# Print Stats
print('Word Indices: ')
print(word_indices)

input('Program paused. Press Enter to continue...')

## ==================== Part 2: Feature Extraction ====================
# ==================== Part 2: Feature Extraction ====================
# Now, you will convert each email into a vector of features in R^n.
# You should complete the code in emailFeatures.py to produce a feature
# vector for a given email.
Expand All @@ -58,12 +58,12 @@
features = emailFeatures(word_indices)

# Print Stats
print('Length of feature vector: %d'% features.size)
print('Number of non-zero entries: %d'% sum(features > 0))
print('Length of feature vector: %d' % features.size)
print('Number of non-zero entries: %d' % sum(features > 0))

input('Program paused. Press Enter to continue...')

## =========== Part 3: Train Linear SVM for Spam Classification ========
# =========== Part 3: Train Linear SVM for Spam Classification ========
# In this section, you will train a linear classifier to determine if an
# email is Spam or Not-Spam.

Expand All @@ -84,7 +84,7 @@

print('Training Accuracy: %f', np.mean(np.double(p == y)) * 100)

## =================== Part 4: Test Spam Classification ================
# =================== Part 4: Test Spam Classification ================
# After training the classifier, we can evaluate it on a test set. We have
# included a test set in spamTest.mat

Expand All @@ -100,14 +100,12 @@

print('Test Accuracy: %f', np. mean(np.double(p == ytest)) * 100)


## ================= Part 5: Top Predictors of Spam ====================
# ================= Part 5: Top Predictors of Spam ====================
# Since the model we are training is a linear SVM, we can inspect the
# weights learned by the model to understand better how it is determining
# whether an email is spam or not. The following code finds the words with
# the highest weights in the classifier. Informally, the classifier
# 'thinks' that these words are the most likely indicators of spam.
#

# Sort the weights and obtain the vocabulary list

Expand All @@ -123,7 +121,7 @@

print('Program paused. Press enter to continue.')

## =================== Part 6: Try Your Own Emails =====================
# =================== Part 6: Try Your Own Emails =====================
# Now that you've trained the spam classifier, you can use it on your own
# emails! In the starter code, we have included spamSample1.txt,
# spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
Expand Down
13 changes: 6 additions & 7 deletions ex6/gaussianKernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ def gaussianKernel(x1, x2, sigma):
# You need to return the following variables correctly.
sim = 0

# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return the similarity between x1
# and x2 computed using a Gaussian kernel with bandwidth
# sigma
#
#
# =============================================================
# ====================== YOUR CODE HERE ======================
# Instructions: Fill in this function to return the similarity between x1
# and x2 computed using a Gaussian kernel with bandwidth
# sigma
#
# =============================================================
return sim
8 changes: 4 additions & 4 deletions ex6/getVocabList.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ def getVocabList():
and returns a cell array of the words in vocabList.
"""

## Read the fixed vocabulary list
# Read the fixed vocabulary list
with open('vocab.txt') as f:

# Store all dictionary words in cell array vocab{}
# Store all dictionary words in cell array vocab{}

# For ease of implementation, we use a struct to map the strings => integers
# In practice, you'll want to use some form of hashmap
# For ease of implementation, we use a struct to map the strings => integers
# In practice, you'll want to use some form of hashmap
vocabList = []
for line in f:
idx, w = line.split()
Expand Down
4 changes: 2 additions & 2 deletions ex6/plotData.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def plotData(X, y):
neg = np.where(y == 0, True, False).flatten()

# Plot Examples
plt.plot(X[pos,0], X[pos, 1], 'k+', linewidth=1, markersize=7)
plt.plot(X[neg,0], X[neg, 1], 'ko', color='y', markersize=7)
plt.plot(X[pos, 0], X[pos, 1], 'k+', linewidth=1, markersize=7)
plt.plot(X[neg, 0], X[neg, 1], 'ko', color='y', markersize=7)
show()

Loading

0 comments on commit df95336

Please sign in to comment.