Update ex6

johnmcdowell · Apr 16, 2017 · df95336 · df95336
1 parent ad702fd
commit df95336
Show file tree

Hide file tree

Showing 9 changed files with 96 additions and 103 deletions.
diff --git a/ex6/dataset3Params.py b/ex6/dataset3Params.py
@@ -12,16 +12,17 @@ def dataset3Params(X, y, Xval, yval):
     C = 1
     sigma = 0.3
 
-# ====================== YOUR CODE HERE ======================
-# Instructions: Fill in this function to return the optimal C and sigma
-#               learning parameters found using the cross validation set.
-#               You can use svmPredict to predict the labels on the cross
-#               validation set. For example, 
-#                   predictions = svmPredict(model, Xval)
-#               will return the predictions on the cross validation set.
-#
-#  Note: You can compute the prediction error using 
-#        mean(double(predictions ~= yval))
-#
-# =========================================================================
+    # ====================== YOUR CODE HERE ======================
+    # Instructions: Fill in this function to return the optimal C and sigma
+    #               learning parameters found using the cross validation set.
+    #               You can use svmPredict to predict the labels on the cross
+    #               validation set. For example,
+    #                   predictions = svmPredict(model, Xval)
+    #               will return the predictions on the cross validation set.
+    #
+    #  Note: You can compute the prediction error using
+    #        mean(double(predictions ~= yval))
+    #
+    # =========================================================================
+
     return C, sigma
diff --git a/ex6/emailFeatures.py b/ex6/emailFeatures.py
@@ -11,43 +11,44 @@ def emailFeatures(word_indices):
 
     # You need to return the following variables correctly.
     x = np.zeros(n)
-# ====================== YOUR CODE HERE ======================
-# Instructions: Fill in this function to return a feature vector for the
-#               given email (word_indices). To help make it easier to 
-#               process the emails, we have have already pre-processed each
-#               email and converted each word in the email into an index in
-#               a fixed dictionary (of 1899 words). The variable
-#               word_indices contains the list of indices of the words
-#               which occur in one email.
-# 
-#               Concretely, if an email has the text:
-#
-#                  The quick brown fox jumped over the lazy dog.
-#
-#               Then, the word_indices vector for this text might look 
-#               like:
-#               
-#                   60  100   33   44   10     53  60  58   5
-#
-#               where, we have mapped each word onto a number, for example:
-#
-#                   the   -- 60
-#                   quick -- 100
-#                   ...
-#
-#              (note: the above numbers are just an example and are not the
-#               actual mappings).
-#
-#              Your task is take one such word_indices vector and construct
-#              a binary feature vector that indicates whether a particular
-#              word occurs in the email. That is, x(i) = 1 when word i
-#              is present in the email. Concretely, if the word 'the' (say,
-#              index 60) appears in the email, then x(60) = 1. The feature
-#              vector should look like:
-#
-#              x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]
-#
-#
-# =========================================================================
+
+    # ====================== YOUR CODE HERE ======================
+    # Instructions: Fill in this function to return a feature vector for the
+    #               given email (word_indices). To help make it easier to
+    #               process the emails, we have already pre-processed each
+    #               email and converted each word in the email into an index in
+    #               a fixed dictionary (of 1899 words). The variable
+    #               word_indices contains the list of indices of the words
+    #               which occur in one email.
+    #
+    #               Concretely, if an email has the text:
+    #
+    #                  The quick brown fox jumped over the lazy dog.
+    #
+    #               Then, the word_indices vector for this text might look
+    #               like:
+    #
+    #                   60  100   33   44   10     53  60  58   5
+    #
+    #               where, we have mapped each word onto a number, for example:
+    #
+    #                   the   -- 60
+    #                   quick -- 100
+    #                   ...
+    #
+    #              (note: the above numbers are just an example and are not the
+    #               actual mappings).
+    #
+    #              Your task is to take one such word_indices vector and construct
+    #              a binary feature vector that indicates whether a particular
+    #              word occurs in the email. That is, x(i) = 1 when word i
+    #              is present in the email. Concretely, if the word 'the' (say,
+    #              index 60) appears in the email, then x(60) = 1. The feature
+    #              vector should look like:
+    #
+    #              x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]
+    #
+    #
+    # =========================================================================
 
     return x
diff --git a/ex6/ex6.py b/ex6/ex6.py
@@ -1,4 +1,4 @@
-## Machine Learning Online Class
+#  Machine Learning Online Class
 #  Exercise 6 | Support Vector Machines
 #
 #  Instructions
@@ -14,7 +14,7 @@
 #
 #  For this exercise, you will not need to change any code in this file,
 #  or any other files other than those mentioned above.
-#
+
 from matplotlib import use, cm
 use('TkAgg')
 import numpy as np
@@ -25,11 +25,10 @@
 from visualizeBoundary import visualizeBoundary
 from visualizeBoundaryLinear import visualizeBoundaryLinear
 
-## =============== Part 1: Loading and Visualizing Data ================
+#  =============== Part 1: Loading and Visualizing Data ================
 #  We start the exercise by first loading and visualizing the dataset. 
 #  The following code will load the dataset into your environment and plot
 #  the data.
-#
 
 print('Loading and Visualizing Data ...')
 
@@ -44,10 +43,9 @@
 
 input('Program paused. Press Enter to continue...')
 
-## ==================== Part 2: Training Linear SVM ====================
+#  ==================== Part 2: Training Linear SVM ====================
 #  The following code will train a linear SVM on the dataset and plot the
 #  decision boundary learned.
-#
 
 # Load from ex6data1:
 # You will have X, y in your environment
@@ -67,10 +65,10 @@
 
 input('Program paused. Press Enter to continue...')
 
-## =============== Part 3: Implementing Gaussian Kernel ===============
+#  =============== Part 3: Implementing Gaussian Kernel ===============
 #  You will now implement the Gaussian kernel to use
 #  with the SVM. You should complete the code in gaussianKernel.m
-#
+
 print('Evaluating the Gaussian Kernel ...')
 
 x1 = np.array([1, 2, 1])
@@ -83,10 +81,9 @@
 
 input('Program paused. Press Enter to continue...')
 
-## =============== Part 4: Visualizing Dataset 2 ================
+#  =============== Part 4: Visualizing Dataset 2 ================
 #  The following code will load the next dataset into your environment and
 #  plot the data.
-#
 
 print('Loading and Visualizing Data ...')
 
@@ -101,7 +98,7 @@
 
 input('Program paused. Press Enter to continue...')
 
-## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
+#  ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
 #  After you have implemented the kernel, we can now use it to train the
 #  SVM classifier.
 #
@@ -128,10 +125,9 @@
 
 input('Program paused. Press Enter to continue...')
 
-## =============== Part 6: Visualizing Dataset 3 ================
+#  =============== Part 6: Visualizing Dataset 3 ================
 #  The following code will load the next dataset into your environment and
 #  plot the data.
-#
 
 print('Loading and Visualizing Data ...')
 
@@ -146,11 +142,9 @@
 
 input('Program paused. Press Enter to continue...')
 
-## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
-
+#  ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
 #  This is a different dataset that you can use to experiment with. Try
 #  different values of C and sigma here.
-#
 
 # Load from ex6data3:
 # You will have X, y in your environment
@@ -161,11 +155,10 @@
 # Try different SVM Parameters here
 C, sigma = dataset3Params(X, y, Xval, yval)
 gamma = 1.0 / (2.0 * sigma ** 2)
-# Train the SVM
 
+# Train the SVM
 clf = svm.SVC(C=C, kernel='rbf', tol=1e-3, max_iter=200, gamma=gamma)
 model = clf.fit(X, y)
 visualizeBoundary(X, y, model)
 
 input('Program paused. Press Enter to continue...')
-
diff --git a/ex6/ex6_spam.py b/ex6/ex6_spam.py
@@ -1,4 +1,4 @@
-## Machine Learning Online Class
+#  Machine Learning Online Class
 #  Exercise 6 | Spam Classification with SVMs
 #
 #  Instructions
@@ -14,7 +14,7 @@
 #
 #  For this exercise, you will not need to change any code in this file,
 #  or any other files other than those mentioned above.
-#
+
 import numpy as np
 import scipy.io
 from sklearn import svm
@@ -24,7 +24,7 @@
 from emailFeatures import emailFeatures
 from getVocabList import getVocabList
 
-## ==================== Part 1: Email Preprocessing ====================
+#  ==================== Part 1: Email Preprocessing ====================
 #  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
 #  to convert each email into a vector of features. In this part, you will
 #  implement the preprocessing steps for each email. You should
@@ -36,15 +36,15 @@
 # Extract Features
 file = open('emailSample1.txt', 'r')
 file_contents = file.readlines()
-word_indices  = processEmail(''.join(file_contents))
+word_indices = processEmail(''.join(file_contents))
 
 # Print Stats
 print('Word Indices: ')
 print(word_indices)
 
 input('Program paused. Press Enter to continue...')
 
-## ==================== Part 2: Feature Extraction ====================
+#  ==================== Part 2: Feature Extraction ====================
 #  Now, you will convert each email into a vector of features in R^n.
 #  You should complete the code in emailFeatures.m to produce a feature
 #  vector for a given email.
@@ -58,12 +58,12 @@
 features = emailFeatures(word_indices)
 
 # Print Stats
-print('Length of feature vector: %d'% features.size)
-print('Number of non-zero entries: %d'% sum(features > 0))
+print('Length of feature vector: %d' % features.size)
+print('Number of non-zero entries: %d' % sum(features > 0))
 
 input('Program paused. Press Enter to continue...')
 
-## =========== Part 3: Train Linear SVM for Spam Classification ========
+#  =========== Part 3: Train Linear SVM for Spam Classification ========
 #  In this section, you will train a linear classifier to determine if an
 #  email is Spam or Not-Spam.
 
@@ -84,7 +84,7 @@
 
 print('Training Accuracy: %f', np.mean(np.double(p == y)) * 100)
 
-## =================== Part 4: Test Spam Classification ================
+#  =================== Part 4: Test Spam Classification ================
 #  After training the classifier, we can evaluate it on a test set. We have
 #  included a test set in spamTest.mat
 
@@ -100,14 +100,12 @@
 
 print('Test Accuracy: %f', np. mean(np.double(p == ytest)) * 100)
 
-
-## ================= Part 5: Top Predictors of Spam ====================
+#  ================= Part 5: Top Predictors of Spam ====================
 #  Since the model we are training is a linear SVM, we can inspect the
 #  weights learned by the model to understand better how it is determining
 #  whether an email is spam or not. The following code finds the words with
 #  the highest weights in the classifier. Informally, the classifier
 #  'thinks' that these words are the most likely indicators of spam.
-#
 
 # Sort the weights and obtain the vocabulary list
 
@@ -123,7 +121,7 @@
 
 print('Program paused. Press enter to continue.')
 
-## =================== Part 6: Try Your Own Emails =====================
+#  =================== Part 6: Try Your Own Emails =====================
 #  Now that you've trained the spam classifier, you can use it on your own
 #  emails! In the starter code, we have included spamSample1.txt,
 #  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.

diff --git a/ex6/gaussianKernel.py b/ex6/gaussianKernel.py
@@ -13,11 +13,10 @@ def gaussianKernel(x1, x2, sigma):
     # You need to return the following variables correctly.
     sim = 0
 
-# ====================== YOUR CODE HERE ======================
-# Instructions: Fill in this function to return the similarity between x1
-#               and x2 computed using a Gaussian kernel with bandwidth
-#               sigma
-#
-#
-# =============================================================
+    # ====================== YOUR CODE HERE ======================
+    # Instructions: Fill in this function to return the similarity between x1
+    #               and x2 computed using a Gaussian kernel with bandwidth
+    #               sigma
+    #
+    # =============================================================
     return sim
diff --git a/ex6/getVocabList.py b/ex6/getVocabList.py
@@ -7,13 +7,13 @@ def getVocabList():
     and returns a cell array of the words in vocabList.
     """
 
-    ## Read the fixed vocabulary list
+    # Read the fixed vocabulary list
     with open('vocab.txt') as f:
 
-    # Store all dictionary words in cell array vocab{}
+        # Store all dictionary words in the list vocabList
 
-    # For ease of implementation, we use a struct to map the strings => integers
-    # In practice, you'll want to use some form of hashmap
+        # For ease of implementation, we use a struct to map the strings => integers
+        # In practice, you'll want to use some form of hashmap
         vocabList = []
         for line in f:
             idx, w = line.split()

diff --git a/ex6/plotData.py b/ex6/plotData.py
@@ -16,7 +16,7 @@ def plotData(X, y):
     neg = np.where(y == 0, True, False).flatten()
 
     # Plot Examples
-    plt.plot(X[pos,0], X[pos, 1], 'k+', linewidth=1, markersize=7)
-    plt.plot(X[neg,0], X[neg, 1], 'ko', color='y', markersize=7)
+    plt.plot(X[pos, 0], X[pos, 1], 'k+', linewidth=1, markersize=7)
+    plt.plot(X[neg, 0], X[neg, 1], 'ko', color='y', markersize=7)
     show()