From 6d015f9937a506121149ad08803c689257bcb306 Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Fri, 9 Nov 2018 17:35:48 +0530
Subject: [PATCH 1/7] Update minor_project.py

---
 minor_project.py | 49 ++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)
diff --git a/minor_project.py b/minor_project.py
index 7d6f5ca..55fcfeb 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -11,13 +11,13 @@
 from collections import Counter
 from bs4 import BeautifulSoup
 #reading CSV Data
-QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1')
-TagsData=shortpanda.read_csv('Tags.csv')
+QuestionData =shortpanda.read_csv('Questions.csv (1).zip',encoding='latin-1')
+TagsData=shortpanda.read_csv('Tags.csv.zip')
 #creating a list of Tags from TagsData
 Taglist=TagsData.Tag.tolist()
 #creating a list of Tags from TagsData
 SortedTaglist=Counter(Taglist)
-print(SortedTaglist) #analyzing whether the Data is sorted or not
+print(SortedTaglist)  #analyzing whether the Data is sorted or not
 list(QuestionData)
 CreationDateList=QuestionData.CreationDate.tolist()
 print(CreationDateList)
@@ -33,7 +33,7 @@
 print(Titlelist)
 
 
-with open('TitleData.txt') as titlefile:
+with open('TempData.txt') as titlefile:
     TitleRead=titlefile.read()
     Words = re.findall(r'\w+', TitleRead)
    
@@ -47,7 +47,7 @@
     
 #for cleaning HTML headers
 with open('SampleBodyData.txt') as Datafile:
-         text=Datafile.read()
+         text=Datafile.read()  
         
    
 def HTML_ClEAN(text):
@@ -55,33 +55,30 @@ def HTML_ClEAN(text):
     soup=BeautifulSoup(text,'html.parser')
     return soup.get_text
         
+
    # for removing  unnecessary code snippets, ,links, URL...
 def remove_CodeSnippet(text):
       
-    return re.sub('<pre><code>.*?</code></pre>', '', text)
-   
-    
+    return re.sub('<pre><code>.*?</code></pre>', '', str(text))
+
     
     #replacing paragraph and next line headers with a blank string
 def remove_Para(text):
       
-    text= re.sub('</p>', '', text)
-    text=  re.sub('\\n', '', text)
-    text=  re.sub('<p>', '', text)
+    text= re.sub('</p>', '', str(text))
+    text=  re.sub('\\n', '', str(text))
+    text=  re.sub('<p>', '', str(text))
     return text
-    
       
   
 #implementing the De-noise Functions to clean the SampleData
 def De_noise(text):
-    text=  HTML_ClEAN(text)
+    text= HTML_ClEAN(text)
     text= remove_CodeSnippet(text)
     text= remove_Para(text)
     return text
+         
 
-        
-     
-        
 #Non-Ascii Words are ignored for better accuracy purpose        
 def is_Non_Ascii(ProcessedSampleBodyData):
     NewProcessedSampleBodyData = []
@@ -90,6 +87,7 @@ def is_Non_Ascii(ProcessedSampleBodyData):
        NewProcessedSampleBodyData.append(temp)
     return NewProcessedSampleBodyData
 
+
 #converting everyword to lowercase to remove redundancy for ex-is & IS
 def Case_lower(ProcessedSampleBodyData):
     NewProcessedSampleBodyData = []
@@ -98,15 +96,17 @@ def Case_lower(ProcessedSampleBodyData):
         NewProcessedSampleBodyData.append(Temp)
     return NewProcessedSampleBodyData
 
+
 #removing Punctuation like,0-;] for better data quality
 def TextClean(ProcessedSampleBodyData):
     NewProcessedSampleBodyData = []
     for word in ProcessedSampleBodyData:
-        temp = re.sub(r'[^\w\s]', '', word)
+        temp = re.sub(r'[^\w\s]', '', str(word))
         if  NewProcessedSampleBodyData != '':
              NewProcessedSampleBodyData .append(temp)
     return  NewProcessedSampleBodyData 
 
+
 #removing Numbers for better tag prediction
 def Number_Removal(ProcessedSampleBodyData):
     use = inflect.engine()
@@ -119,6 +119,8 @@ def Number_Removal(ProcessedSampleBodyData):
             NewProcessedSampleBodyData.append(word)
     return NewProcessedSampleBodyData
 
+nltk.download('stopwords')
+
 #filtering out StopWords to before processing natural data
 def StopWord_Removal(ProcessedSampleBodyData):
     
@@ -129,7 +131,6 @@ def StopWord_Removal(ProcessedSampleBodyData):
      return  NewProcessedSampleBodyData
 
 
-
 def WordProcessing(Body_word):
 
     Body_word=is_Non_Ascii(Body_word)
@@ -139,9 +140,14 @@ def WordProcessing(Body_word):
     Body_word=StopWord_Removal(Body_word)
 
     return Body_word
-    
   
-    DataText= De_noise(text)
+    detext= De_noise(text)   
+    print(detext)
+    newtext= WordProcessing(detext) 
+    print(newtext)
+    
+    nltk.download('punkt')
+    
     #Tokenising the sampledata
     #Tokenising is converting text to words
     ProcessedSampleBodyData = nltk.word_tokenize(text)
@@ -152,5 +158,4 @@ def WordProcessing(Body_word):
     ProcessedBodyWord= WordProcessing(ProcessedSampleBodyData)
 
     count=Counter( ProcessedBodyWord)
-    
-
+    print(count)

From 647045db7fa783054bcbc1d0425879e031c1b7bb Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Sat, 10 Nov 2018 23:01:08 +0530
Subject: [PATCH 2/7] Update minor_project.py

---
 minor_project.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/minor_project.py b/minor_project.py
index 55fcfeb..65ed211 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -11,8 +11,8 @@
 from collections import Counter
 from bs4 import BeautifulSoup
 #reading CSV Data
-QuestionData =shortpanda.read_csv('Questions.csv (1).zip',encoding='latin-1')
-TagsData=shortpanda.read_csv('Tags.csv.zip')
+QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1')
+TagsData=shortpanda.read_csv('Tags.csv')
 #creating a list of Tags from TagsData
 Taglist=TagsData.Tag.tolist()
 #creating a list of Tags from TagsData

From e23bcd20e805e2e0d42dd51ff93d7e3d69377a49 Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Sat, 10 Nov 2018 23:05:01 +0530
Subject: [PATCH 3/7] Update minor_project.py

---
 minor_project.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/minor_project.py b/minor_project.py
index 65ed211..0d59081 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -33,7 +33,7 @@
 print(Titlelist)
 
 
-with open('TempData.txt') as titlefile:
+with open('TitleData.txt') as titlefile:
     TitleRead=titlefile.read()
     Words = re.findall(r'\w+', TitleRead)
    

From 2b0d23586c36b4e87927d601a4e34509b494f179 Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Sat, 17 Nov 2018 03:47:24 +0530
Subject: [PATCH 4/7] Update minor_project.py

---
 minor_project.py | 372 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 266 insertions(+), 106 deletions(-)

diff --git a/minor_project.py b/minor_project.py
index 0d59081..36bcf39 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -1,134 +1,93 @@
-#importing the libraries
-import pandas as shortpanda
-import numpy as shortnum
+import pandas as pd
+from collections import Counter
+import string
 import re
+import numpy as np
 import nltk
 from nltk import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
-import datetime
 import unicodedata
 import inflect
-from collections import Counter
 from bs4 import BeautifulSoup
-#reading CSV Data
-QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1')
-TagsData=shortpanda.read_csv('Tags.csv')
-#creating a list of Tags from TagsData
-Taglist=TagsData.Tag.tolist()
-#creating a list of Tags from TagsData
-SortedTaglist=Counter(Taglist)
-print(SortedTaglist)  #analyzing whether the Data is sorted or not
-list(QuestionData)
-CreationDateList=QuestionData.CreationDate.tolist()
-print(CreationDateList)
-
-#finding the week number
-Week= datetime.date(2014,4,17).isocalendar()[1]
-Week= datetime.date(2012,6,12).isocalendar()[1]
-
-
-#counting the number of most frequent word in Title Data
-Titlelist=QuestionData.Title.tolist()
+import matplotlib.pyplot as plt
+Questions=pd.read_csv('Questions.csv',encoding='latin-1')
+TagData=pd.read_csv('Tags.csv',encoding='latin-1')
 
-print(Titlelist)
-
-
-with open('TitleData.txt') as titlefile:
-    TitleRead=titlefile.read()
-    Words = re.findall(r'\w+', TitleRead)
-   
-    WordCount = Counter(Words)
-    
 
-#Cleaning Data by removing HTML Tags ,Links and Code Snippets
-   
-    Bodylist=QuestionData.Body.tolist() # list of unproceesed question data
-#Using BeautifulSoup for Noise Removal
-    
-#for cleaning HTML headers
-with open('SampleBodyData.txt') as Datafile:
-         text=Datafile.read()  
-        
+def text_clean(text):
+    global Blank
+    Blank=''
+    if not isinstance(text,str):
+        return text  # non-string values are handed back unchanged
+    text=re.sub('<pre><code>.*?</code></pre>',Blank,text,flags=re.DOTALL)  # whole code blocks, newlines included
+    def clean_link(match):  # keep a link's text unless that text is itself a URL
+        return Blank if re.match('[a-z]+://',match.group(1)) else match.group(1)
+    # non-greedy group so each <a>...</a> pair is rewritten on its own
+    text = re.sub('<a[^>]+>(.*?)</a>',clean_link,text)
+    return re.sub('<[^>]+>',Blank,text)
 def HTML_ClEAN(text):
         
     soup=BeautifulSoup(text,'html.parser')
     return soup.get_text
         
-
    # for removing  unnecessary code snippets, ,links, URL...
 def remove_CodeSnippet(text):
       
     return re.sub('<pre><code>.*?</code></pre>', '', str(text))
-
-    
+   
     #replacing paragraph and next line headers with a blank string
-def remove_Para(text):
-      
-    text= re.sub('</p>', '', str(text))
-    text=  re.sub('\\n', '', str(text))
-    text=  re.sub('<p>', '', str(text))
-    return text
-      
-  
 #implementing the De-noise Functions to clean the SampleData
 def De_noise(text):
-    text= HTML_ClEAN(text)
+    text=  HTML_ClEAN(text)
     text= remove_CodeSnippet(text)
-    text= remove_Para(text)
     return text
-         
 
-#Non-Ascii Words are ignored for better accuracy purpose        
-def is_Non_Ascii(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+        #Non-Ascii Words are ignored for better accuracy purpose        
+def is_Non_Ascii(ProcessedBodyData):
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
        temp = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-       NewProcessedSampleBodyData.append(temp)
-    return NewProcessedSampleBodyData
-
+       NewProcessedBodyData.append(temp)
+    return NewProcessedBodyData
 
 #converting everyword to lowercase to remove redundancy for ex-is & IS
-def Case_lower(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+def Case_lower(ProcessedBodyData):
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
         Temp = word.lower()
-        NewProcessedSampleBodyData.append(Temp)
-    return NewProcessedSampleBodyData
-
+        NewProcessedBodyData.append(Temp)
+    return NewProcessedBodyData
 
 #removing Punctuation like,0-;] for better data quality
-def TextClean(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+def TextClean(ProcessedBodyData):
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
         temp = re.sub(r'[^\w\s]', '', str(word))
-        if  NewProcessedSampleBodyData != '':
-             NewProcessedSampleBodyData .append(temp)
-    return  NewProcessedSampleBodyData 
-
+        if temp != '':  # keep the token only if something remains after stripping punctuation
+            NewProcessedBodyData.append(temp)
+    return  NewProcessedBodyData 
 
 #removing Numbers for better tag prediction
-def Number_Removal(ProcessedSampleBodyData):
+def Number_Removal(ProcessedBodyData):
     use = inflect.engine()
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
         if word.isdigit():
           temp  = use.number_to_words(word)
-          NewProcessedSampleBodyData.append(temp)
+          NewProcessedBodyData.append(temp)
         else:
-            NewProcessedSampleBodyData.append(word)
-    return NewProcessedSampleBodyData
-
-nltk.download('stopwords')
+            NewProcessedBodyData.append(word)
+    return NewProcessedBodyData
 
 #filtering out StopWords to before processing natural data
-def StopWord_Removal(ProcessedSampleBodyData):
+def StopWord_Removal(ProcessedBodyData):
     
-     NewProcessedSampleBodyData = []
-     for word in ProcessedSampleBodyData:
+     NewProcessedBodyData = []
+     for word in ProcessedBodyData:
         if word not in stopwords.words('english'):
-           NewProcessedSampleBodyData.append(word)
-     return  NewProcessedSampleBodyData
+           NewProcessedBodyData.append(word)
+     return  NewProcessedBodyData
+
 
 
 def WordProcessing(Body_word):
@@ -140,22 +99,223 @@ def WordProcessing(Body_word):
     Body_word=StopWord_Removal(Body_word)
 
     return Body_word
-  
-    detext= De_noise(text)   
-    print(detext)
-    newtext= WordProcessing(detext) 
-    print(newtext)
-    
-    nltk.download('punkt')
-    
-    #Tokenising the sampledata
-    #Tokenising is converting text to words
-    ProcessedSampleBodyData = nltk.word_tokenize(text)
-    print( ProcessedSampleBodyData)
+
+Questions['Text']=Questions['Body'].apply(text_clean).str.lower()
+Questions['Text']=Questions['Text'].apply(De_noise)
+Questions['Text']=Questions['Text'].apply(WordProcessing)
+Questions.Text=Questions.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t",""))
+Questions.Text[0]
+TagData.Tag.nunique()
+MostCommonTagCount=Counter(list(TagData.Tag)).most_common(40)
+print(MostCommonTagCount)
+
+TagData = TagData[(TagData.Tag == 'javascript') | (TagData.Tag == 'java') | (TagData.Tag == 'c#') | (TagData.Tag =='php') | (TagData.Tag =='android') | (TagData.Tag == 'jquery') | (TagData.Tag == 'python') | (TagData.Tag == 'html') | (TagData.Tag == 'c++') | (TagData.Tag == 'windows')|  (TagData.Tag == 'ios')]
+
+TagData.head()
+TextandTags=TagData.merge(Questions,on='Id')
+TextandTags.Tag
+TextandTags.Text
+
+#TextandTags.to_csv("output.csv", index=False)
+UnnecessaryColumns=['Id','OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body']
+TextandTags=TextandTags.drop( UnnecessaryColumns,axis=1,inplace=False)
+TextandTags.Tag
+Categories = TextandTags['Tag'].unique()
+
+fig = plt.figure(figsize=(11,6))
+BalTextandTags.groupby('Tag').Text.count().plot.bar(ylim=0)
+plt.show()
+
+TextandTags = pd.DataFrame(TextandTags)
+BalTextandTags = TextandTags.groupby('Tag')
+BalTextandTags = pd.DataFrame(BalTextandTags.apply(lambda x: x.sample(BalTextandTags.size().min()).reset_index(drop=True)))
+BalTextandTags.head()
+BalTextandTags.Tag
+
+
+MostCommonTagCount=Counter(list(BalTextandTags.Tag)).most_common(11)
+print(MostCommonTagCount)
+
+
+
+from sklearn.model_selection import train_test_split
+X_train,X_test,Y_train,Y_test=train_test_split(BalTextandTags['Text'],BalTextandTags['Tag'],random_state=42,
+                                               test_size=0.2,shuffle=True)
+
+def Convert_to_MB(Dataset):
+    Result=sum(len(s.encode('utf-8'))for s in Dataset)/ 1e6
+    return Result
+Train_MB_size=Convert_to_MB(X_train)
+Test_MB_size=Convert_to_MB(X_test)
+print("%d documents - %0.3fMB (training set)" % (
+    len(X_train), Train_MB_size))
+print("%d documents - %0.3fMB (test set)" % (
+    len(X_test),Test_MB_size))
+print("%d Categories" % len(Categories))
+print()
+
+from optparse import OptionParser
+options = OptionParser()
+options.add_option("--use_hashing",
+              action="store_true",
+              help="Use a hashing vectorizer.")
+options.add_option("--n_features",
+              action="store", type=int, default=2 ** 16,
+              help="n_features when using the hashing vectorizer.")
+
+import sys
+def is_interactive():
+    return not hasattr(sys.modules['__main__'], '__file__')
+
+# work-around for Jupyter notebook and IPython console
+argv = [] if is_interactive() else sys.argv[1:]
+(opts, args) = options.parse_args(argv)
+if len(args) > 0:
+    options.error("this script takes no arguments.")
+    sys.exit(1)
+
+print(__doc__)
+options.print_help()
+print()
+
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import HashingVectorizer
+from time import time
+#feature extraction using sparse vectorizer
+tnought = time()
+if opts.use_hashing:
+    Vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
+                                   n_features=opts.n_features)
+    X_train_new = Vectorizer.transform(X_train)
+else:
+    Vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
+                                 stop_words='english')
+    X_train_new = Vectorizer.fit_transform(X_train)
+TimeTakenTrain = time()-tnought
+print("done in %fs at %0.3fMB/s" % (TimeTakenTrain, Train_MB_size / TimeTakenTrain))
+print("n_samples: %d, n_features: %d" % X_train_new.shape)
+print()
+#using the vectoriser for the test data now
+tnought=time()
+X_test_new=Vectorizer.transform(X_test)
+TimeTakenTest= time()-tnought
+print("done in %fs at %0.3fMB/s" % (TimeTakenTest, Test_MB_size / TimeTakenTest))
+print("n_samples: %d, n_features: %d" % X_test_new.shape)
+print()
+#chi square test for conversion of Integer feature name to 
+#original String Token Name
+from sklearn.feature_selection import SelectKBest, chi2
+if options.use_hashing:
+    feature_names=None
+else:
+    feature_names=Vectorizer.get_feature_names()
  
-    #BodyWordCount=Counter(ProcessedSampleBodyData)
-       
-    ProcessedBodyWord= WordProcessing(ProcessedSampleBodyData)
+if opts.select_chi2:
+    print("Extracting %d bestfeatures from chi-squared test"
+          % opts.select_chi2)
+    
+    options.add_option("--chi2_select",
+              action="store", type="int", dest="select_chi2",
+              help="Select some number of features using a chi-squared test")
+    tnought=time()
+    ch2=SelectKBest(chi2,k=opts.select_chi2)
+    X_train_new=ch2.fit_transform(X_train_new,Y_train)
+    X_test_new=ch2.transform(X_test_new)
+    if feature_names:
+        feature_names= [feature_names[i] for i
+                        in ch2.get_support(indices=True)]
+          
+    print("done in %fs" % (time() - tnought))
+    print()
+if feature_names:
+    feature_names=np.asarray(feature_names)
+
+TargetClasses=Categories  
+
+print(X_train_new)
+
+
+
+from sklearn.svm import LinearSVC
+classifier=LinearSVC(multi_class='ovr',random_state=0)
+classifier.fit(X_train_new,Y_train)
+
+Y_pred=classifier.predict(X_test_new)
+
+from sklearn.metrics import confusion_matrix
+cm=confusion_matrix(Y_test,Y_pred)
+
+from sklearn import metrics
+
+score =metrics.accuracy_score(Y_test, Y_pred)
+print("accuracy:   %0.3f" % score)
+
+from sklearn.metrics import classification_report
+
+print(classification_report(Y_test,Y_pred,target_names=Categories))
+
+
+
+
+# Visualising the Training set results
+from matplotlib.colors import ListedColormap
+X_set, y_set = X_train_new, Y_train
+X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
+                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
+plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
+             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
+plt.xlim(X1.min(), X1.max())
+plt.ylim(X2.min(), X2.max())
+for i, j in enumerate(np.unique(y_set)):
+    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
+                c = ListedColormap(('red', 'green'))(i), label = j)
+plt.title('SVM (Training set)')
+plt.xlabel('Questions')
+plt.ylabel('Tag')
+plt.legend()
+plt.show()
+
+indices = np.arange(len())
+
+ = [[x[i] for x in results] for i in range(4)]
+
+clf_names, score, training_time, test_time = 
+training_time = np.array(training_time) / np.max(training_time)
+test_time = np.array(test_time) / np.max(test_time)
+
+plt.figure(figsize=(12, 8))
+plt.title("Score")
+plt.barh(indices, score, .2, label="score", color='navy')
+plt.barh(indices + .3, training_time, .2, label="training time",
+         color='c')
+plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
+plt.yticks(())
+plt.legend(loc='best')
+plt.subplots_adjust(left=.25)
+plt.subplots_adjust(top=.95)
+plt.subplots_adjust(bottom=.05)
+
+for i, c in zip(indices, clf_names):
+    plt.text(-.3, i, c)
+
+plt.show()
 
-    count=Counter( ProcessedBodyWord)
-    print(count)
+# Visualising the Test set results
+from matplotlib.colors import ListedColormap
+X_set, y_set = X_test_new, Y_test
+X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
+                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
+plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
+             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
+plt.xlim(X1.min(), X1.max())
+plt.ylim(X2.min(), X2.max())
+for i, j in enumerate(np.unique(y_set)):
+    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
+                c = ListedColormap(('red', 'green'))(i), label = j)
+plt.title('SVM (Test set)')
+plt.xlabel('Questions')
+plt.ylabel('Tag')
+plt.legend()
+plt.show()

From c419e120c4c0ab1e1f60ad7969d28df898457b9b Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Wed, 21 Nov 2018 00:02:05 +0530
Subject: [PATCH 5/7] Update minor_project.py

---
 minor_project.py | 174 ++++++++++++++++++++++++++---------------------
 1 file changed, 97 insertions(+), 77 deletions(-)

diff --git a/minor_project.py b/minor_project.py
index 36bcf39..f8a991f 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -1,3 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Nov 17 14:57:53 2018
+
+@author: Naman Kalra
+"""
+
 import pandas as pd
 from collections import Counter
 import string
@@ -101,10 +108,7 @@ def WordProcessing(Body_word):
     return Body_word
 
 Questions['Text']=Questions['Body'].apply(text_clean).str.lower()
-Questions['Text']=Questions['Text'].apply(De_noise)
-Questions['Text']=Questions['Text'].apply(WordProcessing)
-Questions.Text=Questions.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t",""))
-Questions.Text[0]
+
 TagData.Tag.nunique()
 MostCommonTagCount=Counter(list(TagData.Tag)).most_common(40)
 print(MostCommonTagCount)
@@ -116,14 +120,13 @@ def WordProcessing(Body_word):
 TextandTags.Tag
 TextandTags.Text
 
-#TextandTags.to_csv("output.csv", index=False)
 UnnecessaryColumns=['Id','OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body']
 TextandTags=TextandTags.drop( UnnecessaryColumns,axis=1,inplace=False)
-TextandTags.Tag
 Categories = TextandTags['Tag'].unique()
+print(Categories)
 
-fig = plt.figure(figsize=(11,6))
-BalTextandTags.groupby('Tag').Text.count().plot.bar(ylim=0)
+graph = plt.figure(figsize=(11,6))
+TextandTags.groupby('Tag').Text.count().plot.bar(ylim=0)
 plt.show()
 
 TextandTags = pd.DataFrame(TextandTags)
@@ -132,16 +135,22 @@ def WordProcessing(Body_word):
 BalTextandTags.head()
 BalTextandTags.Tag
 
+BalTextandTags.to_csv("output.csv", index=False)
+
+#BalTextandTags['Text']=BalTextandTags['Text'].apply(De_noise)
+#BalTextandTags['Text']=BalTextandTags['Text'].apply(WordProcessing)
+BalTextandTags.Text=BalTextandTags.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t",""))
+BalTextandTags.Text[0]
 
 MostCommonTagCount=Counter(list(BalTextandTags.Tag)).most_common(11)
 print(MostCommonTagCount)
 
-
-
 from sklearn.model_selection import train_test_split
 X_train,X_test,Y_train,Y_test=train_test_split(BalTextandTags['Text'],BalTextandTags['Tag'],random_state=42,
                                                test_size=0.2,shuffle=True)
 
+
+
 def Convert_to_MB(Dataset):
     Result=sum(len(s.encode('utf-8'))for s in Dataset)/ 1e6
     return Result
@@ -152,7 +161,6 @@ def Convert_to_MB(Dataset):
 print("%d documents - %0.3fMB (test set)" % (
     len(X_test),Test_MB_size))
 print("%d Categories" % len(Categories))
-print()
 
 from optparse import OptionParser
 options = OptionParser()
@@ -196,68 +204,105 @@ def is_interactive():
 TimeTakenTrain = time()-tnought
 print("done in %fs at %0.3fMB/s" % (TimeTakenTrain, Train_MB_size / TimeTakenTrain))
 print("n_samples: %d, n_features: %d" % X_train_new.shape)
-print()
+
 #using the vectoriser for the test data now
 tnought=time()
 X_test_new=Vectorizer.transform(X_test)
 TimeTakenTest= time()-tnought
 print("done in %fs at %0.3fMB/s" % (TimeTakenTest, Test_MB_size / TimeTakenTest))
 print("n_samples: %d, n_features: %d" % X_test_new.shape)
-print()
-#chi square test for conversion of Integer feature name to 
-#original String Token Name
-from sklearn.feature_selection import SelectKBest, chi2
-if options.use_hashing:
-    feature_names=None
-else:
-    feature_names=Vectorizer.get_feature_names()
- 
-if opts.select_chi2:
-    print("Extracting %d bestfeatures from chi-squared test"
-          % opts.select_chi2)
-    
-    options.add_option("--chi2_select",
-              action="store", type="int", dest="select_chi2",
-              help="Select some number of features using a chi-squared test")
-    tnought=time()
-    ch2=SelectKBest(chi2,k=opts.select_chi2)
-    X_train_new=ch2.fit_transform(X_train_new,Y_train)
-    X_test_new=ch2.transform(X_test_new)
-    if feature_names:
-        feature_names= [feature_names[i] for i
-                        in ch2.get_support(indices=True)]
-          
-    print("done in %fs" % (time() - tnought))
-    print()
-if feature_names:
-    feature_names=np.asarray(feature_names)
 
-TargetClasses=Categories  
 
-print(X_train_new)
+#SVM
+from sklearn.svm import LinearSVC
+classifiersvm=LinearSVC(multi_class='ovr',random_state=0)
+classifiersvm.fit(X_train_new,Y_train)
+
+Y_predsvm=classifiersvm.predict(X_test_new)
 
+from sklearn.metrics import confusion_matrix
+cmsvm=confusion_matrix(Y_test,Y_predsvm)
 
+from sklearn import metrics
+scoresvm=metrics.accuracy_score(Y_test, Y_predsvm)
+print("accuracy:   %0.3f" % scoresvm)
 
-from sklearn.svm import LinearSVC
-classifier=LinearSVC(multi_class='ovr',random_state=0)
-classifier.fit(X_train_new,Y_train)
+from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_predsvm,target_names=Categories))
 
-Y_pred=classifier.predict(X_test_new)
+#RandomForest
+from sklearn.ensemble import RandomForestClassifier
+classifierrf=RandomForestClassifier(n_estimators=10 , criterion='entropy' ,random_state=0)
+classifierrf.fit(X_train_new,Y_train)
+
+Y_predrf=classifierrf.predict(X_test_new)
 
 from sklearn.metrics import confusion_matrix
-cm=confusion_matrix(Y_test,Y_pred)
+cmrf=confusion_matrix(Y_test,Y_predrf)
 
 from sklearn import metrics
-
-score =metrics.accuracy_score(Y_test, Y_pred)
-print("accuracy:   %0.3f" % score)
+scorerf=metrics.accuracy_score(Y_test, Y_predrf)
+print("accuracy:   %0.3f" % scorerf)
 
 from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_predrf,target_names=Categories))
+
+def make_meshgrid(x, y, h=.02):
+    """Create a mesh of points to plot in
+
+    Parameters
+    ----------
+    x: data to base x-axis meshgrid on
+    y: data to base y-axis meshgrid on
+    h: stepsize for meshgrid, optional
+
+    Returns
+    -------
+    xx, yy : ndarray
+    """
+    x_min, x_max = x.min() - 1, x.max() + 1
+    y_min, y_max = y.min() - 1, y.max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                         np.arange(y_min, y_max, h))
+    return xx, yy
+
+
+def plot_contours(ax, clf, xx, yy, **params):
+    """Plot the decision boundaries for a classifier.
+
+    Parameters
+    ----------
+    ax: matplotlib axes object
+    clf: a classifier
+    xx: meshgrid ndarray
+    yy: meshgrid ndarray
+    params: dictionary of params to pass to contourf, optional
+    """
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+    Z = Z.reshape(xx.shape)
+    out = ax.contourf(xx, yy, Z, **params)
+    return out
+
+titles = ('SVC with linear kernel')
 
-print(classification_report(Y_test,Y_pred,target_names=Categories))
-
+fig = plt.figure(figsize=(11,6))
 
+X0, X1 = X_train_new[:, 0], X_train_new[:, 1]
+xx, yy = make_meshgrid(X0, X1)
+
+for clf, title, ax in zip(classifier, titles, sub.flatten()):
+    plot_contours(ax, clf, xx, yy,
+                  cmap=plt.cm.coolwarm, alpha=0.8)
+    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xlabel('Sepal length')
+    ax.set_ylabel('Sepal width')
+    ax.set_xticks(())
+    ax.set_yticks(())
+    ax.set_title(title)
 
+plt.show()
 
 # Visualising the Training set results
 from matplotlib.colors import ListedColormap
@@ -277,31 +322,6 @@ def is_interactive():
 plt.legend()
 plt.show()
 
-indices = np.arange(len())
-
- = [[x[i] for x in results] for i in range(4)]
-
-clf_names, score, training_time, test_time = 
-training_time = np.array(training_time) / np.max(training_time)
-test_time = np.array(test_time) / np.max(test_time)
-
-plt.figure(figsize=(12, 8))
-plt.title("Score")
-plt.barh(indices, score, .2, label="score", color='navy')
-plt.barh(indices + .3, training_time, .2, label="training time",
-         color='c')
-plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
-plt.yticks(())
-plt.legend(loc='best')
-plt.subplots_adjust(left=.25)
-plt.subplots_adjust(top=.95)
-plt.subplots_adjust(bottom=.05)
-
-for i, c in zip(indices, clf_names):
-    plt.text(-.3, i, c)
-
-plt.show()
-
 # Visualising the Test set results
 from matplotlib.colors import ListedColormap
 X_set, y_set = X_test_new, Y_test

From ed7b373dac09a77a337f4b29c4fa7321b33a019b Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Wed, 21 Nov 2018 13:04:49 +0530
Subject: [PATCH 6/7] Update minor_project.py

---
 minor_project.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/minor_project.py b/minor_project.py
index f8a991f..1693127 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -247,6 +247,24 @@ def is_interactive():
 from sklearn.metrics import classification_report
 print(classification_report(Y_test,Y_predrf,target_names=Categories))
 
+#NaiveBayes
+# MultinomialNB accepts the sparse matrix produced by the vectorizer; GaussianNB requires dense input
+from sklearn.naive_bayes import MultinomialNB as nb
+classifiernb=nb()
+classifiernb.fit(X_train_new,Y_train)
+Y_prednb=classifiernb.predict(X_test_new)  # predict with the naive Bayes model, not the random forest
+
+from sklearn.metrics import confusion_matrix
+cmnb=confusion_matrix(Y_test,Y_prednb)
+
+from sklearn import metrics
+scorenb=metrics.accuracy_score(Y_test, Y_prednb)
+print("accuracy:   %0.3f" % scorenb)
+
+from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_prednb,target_names=Categories))
+
+
 def make_meshgrid(x, y, h=.02):
     """Create a mesh of points to plot in
 

From ff122528f7217a0451fad97847bd2035518399a0 Mon Sep 17 00:00:00 2001
From: Naman Kalra <43729631+namankalra@users.noreply.github.com>
Date: Wed, 21 Nov 2018 14:53:03 +0530
Subject: [PATCH 7/7] Update minor_project.py

---
 minor_project.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/minor_project.py b/minor_project.py
index 1693127..a486cd2 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -20,6 +20,25 @@
 Questions=pd.read_csv('Questions.csv',encoding='latin-1')
 TagData=pd.read_csv('Tags.csv',encoding='latin-1')
 
+def plot_tags(tagCount):
+    """Plot each tag's question count as one marker sized by that count.
+
+    tagCount: non-empty sequence of (tag, count) pairs, e.g. Counter.most_common().
+    """
+
+    tags, counts = zip(*tagCount)
+    plt.figure(figsize=(10,6))
+    plt.ylabel("Number of question associations")
+    for position, (tag, count) in enumerate(zip(tags, counts)):
+        # marker size is count/4000, so more frequent tags draw larger markers
+        plt.plot(position, count, marker='o', linestyle='', ms=count/4000, label=tag)
+
+    plt.legend(numpoints=1)
+    plt.show()
+import collections
+tagCount =  collections.Counter(list(TagData['Tag'])).most_common(10)
+print(tagCount)
+plot_tags(tagCount)
 
 def text_clean(text):
     global Blank