diff --git a/minor_project.py b/minor_project.py
index 7d6f5ca..a486cd2 100644
--- a/minor_project.py
+++ b/minor_project.py
@@ -1,55 +1,56 @@
-#importing the libraries
-import pandas as shortpanda
-import numpy as shortnum
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Nov 17 14:57:53 2018
+
+@author: Naman Kalra
+"""
+
+import pandas as pd
+from collections import Counter
+import string
 import re
+import numpy as np
 import nltk
 from nltk import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
-import datetime
 import unicodedata
 import inflect
-from collections import Counter
 from bs4 import BeautifulSoup
-#reading CSV Data
-QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1')
-TagsData=shortpanda.read_csv('Tags.csv')
-#creating a list of Tags from TagsData
-Taglist=TagsData.Tag.tolist()
-#creating a list of Tags from TagsData
-SortedTaglist=Counter(Taglist)
-print(SortedTaglist) #analyzing whether the Data is sorted or not
-list(QuestionData)
-CreationDateList=QuestionData.CreationDate.tolist()
-print(CreationDateList)
-
-#finding the week number
-Week= datetime.date(2014,4,17).isocalendar()[1]
-Week= datetime.date(2012,6,12).isocalendar()[1]
+import matplotlib.pyplot as plt
+Questions=pd.read_csv('Questions.csv',encoding='latin-1')  # question table; its 'Id' and 'Body' columns are used further down
+TagData=pd.read_csv('Tags.csv',encoding='latin-1')  # tag table; its 'Id' and 'Tag' columns are used further down
 
+def plot_tags(tagCount):
+    """Plot one marker per (tag, count) pair; size follows the count and each tag gets its own colour."""
+    x,y = zip(*tagCount)
 
-#counting the number of most frequent word in Title Data
-Titlelist=QuestionData.Title.tolist()
+    colormap = plt.cm.gist_ncar  # alternatives: nipy_spectral, Set1, Paired
+    colors = [colormap(i) for i in np.linspace(0, 0.8, len(y))]  # one colour per plotted tag
 
-print(Titlelist)
+    area = [count/4000 for count in y]   # marker size in points, scaled down from the raw count
+    plt.figure(figsize=(10,6))
+    plt.ylabel("Number of question associations")
+    for i, (tag, count) in enumerate(zip(x, y)):
+        plt.plot(i, count, marker='o', linestyle='', ms=area[i], color=colors[i], label=tag)
 
+    plt.legend(numpoints=1)
+    plt.show()
+import collections
+tagCount =  collections.Counter(list(TagData['Tag'])).most_common(10)  # ten most frequent tags as (tag, count) pairs
+print(tagCount)
+plot_tags(tagCount)  # opens a matplotlib window via plt.show()
 
-with open('TitleData.txt') as titlefile:
-    TitleRead=titlefile.read()
-    Words = re.findall(r'\w+', TitleRead)
-   
-    WordCount = Counter(Words)
-    
-
-#Cleaning Data by removing HTML Tags ,Links and Code Snippets
-   
-    Bodylist=QuestionData.Body.tolist() # list of unproceesed question data
-#Using BeautifulSoup for Noise Removal
-    
-#for cleaning HTML headers
-with open('SampleBodyData.txt') as Datafile:
-         text=Datafile.read()
-        
+def text_clean(text):
+    """Strip code blocks, URL-only links and all remaining HTML tags from text.
+
+    Non-str input (for example NaN from pandas) is returned unchanged.
+    """
+    Blank=''
+    text=re.sub('<pre><code>.*?</code></pre>',Blank,text,flags=re.DOTALL)  # DOTALL: code blocks span several lines
+    def clean_link(match):  # keep the link text unless it is itself a bare URL
+        return Blank if re.match('[a-z]+://',match.group(1)) else match.group(1)
    
+    text = re.sub('<a[^>]+>(.*?)</a>',clean_link,text,flags=re.DOTALL) if isinstance(text,str) else text  # non-greedy: one link per match
+    return re.sub('<[^>]+>',Blank,text) if isinstance(text,str) else text
 def HTML_ClEAN(text):
         
     soup=BeautifulSoup(text,'html.parser')
@@ -58,75 +59,60 @@ def HTML_ClEAN(text):
    # for removing  unnecessary code snippets, ,links, URL...
 def remove_CodeSnippet(text):
       
-    return re.sub('<pre><code>.*?</code></pre>', '', text)
+    return re.sub('<pre><code>.*?</code></pre>', '', str(text), flags=re.DOTALL)  # DOTALL lets '.' cross the newlines inside a code block
    
-    
-    
     #replacing paragraph and next line headers with a blank string
-def remove_Para(text):
-      
-    text= re.sub('</p>', '', text)
-    text=  re.sub('\\n', '', text)
-    text=  re.sub('<p>', '', text)
-    return text
-    
-      
-  
 #implementing the De-noise Functions to clean the SampleData
 def De_noise(text):
     text=  HTML_ClEAN(text)
     text= remove_CodeSnippet(text)
-    text= remove_Para(text)
     return text
 
-        
-     
-        
-#Non-Ascii Words are ignored for better accuracy purpose        
-def is_Non_Ascii(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+        #Non-Ascii Words are ignored for better accuracy purpose        
+def is_Non_Ascii(ProcessedBodyData):  # returns a new list; each token is NFKD-normalised and its non-ASCII characters are dropped
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
        temp = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-       NewProcessedSampleBodyData.append(temp)
-    return NewProcessedSampleBodyData
+       NewProcessedBodyData.append(temp)  # a token made only of non-ASCII characters becomes '' and is still appended
+    return NewProcessedBodyData
 
 #converting everyword to lowercase to remove redundancy for ex-is & IS
-def Case_lower(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+def Case_lower(ProcessedBodyData):  # returns a new list with every token lower-cased; the input list is not modified
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
         Temp = word.lower()
-        NewProcessedSampleBodyData.append(Temp)
-    return NewProcessedSampleBodyData
+        NewProcessedBodyData.append(Temp)
+    return NewProcessedBodyData
 
 #removing Punctuation like,0-;] for better data quality
-def TextClean(ProcessedSampleBodyData):
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
-        temp = re.sub(r'[^\w\s]', '', word)
-        if  NewProcessedSampleBodyData != '':
-             NewProcessedSampleBodyData .append(temp)
-    return  NewProcessedSampleBodyData 
+def TextClean(ProcessedBodyData):
+    """Strip punctuation from every token and drop tokens that end up empty.
+
+    Tokens are coerced with str() first; a new list is returned.
+    """
+    stripped = (re.sub(r'[^\w\s]', '', str(word)) for word in ProcessedBodyData)
+    return [word for word in stripped if word != '']
 
 #removing Numbers for better tag prediction
-def Number_Removal(ProcessedSampleBodyData):
+def Number_Removal(ProcessedBodyData):  # digit-only tokens are spelled out in words via inflect rather than dropped
     use = inflect.engine()
-    NewProcessedSampleBodyData = []
-    for word in ProcessedSampleBodyData:
+    NewProcessedBodyData = []
+    for word in ProcessedBodyData:
         if word.isdigit():
           temp  = use.number_to_words(word)
-          NewProcessedSampleBodyData.append(temp)
+          NewProcessedBodyData.append(temp)
         else:
-            NewProcessedSampleBodyData.append(word)
-    return NewProcessedSampleBodyData
+            NewProcessedBodyData.append(word)  # every other token is passed through unchanged
+    return NewProcessedBodyData
 
 #filtering out StopWords to before processing natural data
-def StopWord_Removal(ProcessedSampleBodyData):
+def StopWord_Removal(ProcessedBodyData):  # returns the tokens that are not NLTK English stop words
-    
+     StopWords = set(stopwords.words('english'))  # build the lookup once instead of once per token
-     NewProcessedSampleBodyData = []
-     for word in ProcessedSampleBodyData:
+     NewProcessedBodyData = []
+     for word in ProcessedBodyData:
-        if word not in stopwords.words('english'):
+        if word not in StopWords:
-           NewProcessedSampleBodyData.append(word)
-     return  NewProcessedSampleBodyData
+           NewProcessedBodyData.append(word)
+     return  NewProcessedBodyData
 
 
 
@@ -139,18 +125,254 @@ def WordProcessing(Body_word):
     Body_word=StopWord_Removal(Body_word)
 
     return Body_word
-    
-  
-    DataText= De_noise(text)
-    #Tokenising the sampledata
-    #Tokenising is converting text to words
-    ProcessedSampleBodyData = nltk.word_tokenize(text)
-    print( ProcessedSampleBodyData)
- 
-    #BodyWordCount=Counter(ProcessedSampleBodyData)
-       
-    ProcessedBodyWord= WordProcessing(ProcessedSampleBodyData)
-
-    count=Counter( ProcessedBodyWord)
-    
 
+Questions['Text']=Questions['Body'].apply(text_clean).str.lower()  # cleaned, lower-cased copy of the question body
+# Bare expressions in this section (nunique, head, column access) have no effect when run as a script.
+TagData.Tag.nunique()
+MostCommonTagCount=Counter(list(TagData.Tag)).most_common(40)
+print(MostCommonTagCount)
+# Keep only the rows whose tag is one of the eleven tags listed below.
+TagData = TagData[(TagData.Tag == 'javascript') | (TagData.Tag == 'java') | (TagData.Tag == 'c#') | (TagData.Tag =='php') | (TagData.Tag =='android') | (TagData.Tag == 'jquery') | (TagData.Tag == 'python') | (TagData.Tag == 'html') | (TagData.Tag == 'c++') | (TagData.Tag == 'windows')|  (TagData.Tag == 'ios')]
+
+TagData.head()
+TextandTags=TagData.merge(Questions,on='Id')  # attach the question columns to each remaining tag row via Id
+TextandTags.Tag
+TextandTags.Text
+# Drop the columns that are not used by the modelling code below.
+UnnecessaryColumns=['Id','OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body']
+TextandTags=TextandTags.drop( UnnecessaryColumns,axis=1,inplace=False)
+Categories = TextandTags['Tag'].unique()  # distinct tags in order of first appearance (not sorted)
+print(Categories)
+# Bar chart of the number of non-null Text entries per tag.
+graph = plt.figure(figsize=(11,6))
+TextandTags.groupby('Tag').Text.count().plot.bar(ylim=0)
+plt.show()
+# Balance the classes: sample every tag group down to the size of the smallest group.
+TextandTags = pd.DataFrame(TextandTags)
+BalTextandTags = TextandTags.groupby('Tag')
+BalTextandTags = pd.DataFrame(BalTextandTags.apply(lambda x: x.sample(BalTextandTags.size().min()).reset_index(drop=True)))  # inside the lambda the name still refers to the groupby object
+BalTextandTags.head()
+BalTextandTags.Tag
+# NOTE: the CSV is written before the quote/newline/tab stripping below, so output.csv holds the unstripped text.
+BalTextandTags.to_csv("output.csv", index=False)
+
+#BalTextandTags['Text']=BalTextandTags['Text'].apply(De_noise)
+#BalTextandTags['Text']=BalTextandTags['Text'].apply(WordProcessing)
+BalTextandTags.Text=BalTextandTags.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t",""))  # assumes every Text value is a str — TODO confirm no NaN bodies
+BalTextandTags.Text[0]
+
+MostCommonTagCount=Counter(list(BalTextandTags.Tag)).most_common(11)  # per-tag row counts after balancing
+print(MostCommonTagCount)
+
+from sklearn.model_selection import train_test_split
+X_train,X_test,Y_train,Y_test=train_test_split(BalTextandTags['Text'],BalTextandTags['Tag'],random_state=42,  # fixed seed so the split is repeatable
+                                               test_size=0.2,shuffle=True)  # 20% of the rows are held out for testing
+
+
+
+def Convert_to_MB(Dataset):
+    """Return the combined UTF-8 size of the strings in Dataset, in units of 1e6 bytes."""
+    return sum(map(len, (text.encode('utf-8') for text in Dataset))) / 1e6
+Train_MB_size=Convert_to_MB(X_train)  # UTF-8 size of the training text, reused for the throughput print-out later
+Test_MB_size=Convert_to_MB(X_test)  # UTF-8 size of the test text
+print("%d documents - %0.3fMB (training set)" % (
+    len(X_train), Train_MB_size))
+print("%d documents - %0.3fMB (test set)" % (
+    len(X_test),Test_MB_size))
+print("%d Categories" % len(Categories))  # Categories was computed before balancing
+
+from optparse import OptionParser
+options = OptionParser()  # command-line switches that select the vectorizer used further down
+options.add_option("--use_hashing",
+              action="store_true",
+              help="Use a hashing vectorizer.")
+options.add_option("--n_features",
+              action="store", type=int, default=2 ** 16,
+              help="n_features when using the hashing vectorizer.")
+
+import sys
+def is_interactive():  # True when the __main__ module has no __file__ attribute
+    return not hasattr(sys.modules['__main__'], '__file__')
+
+# Work-around for Jupyter notebook and IPython console: ignore sys.argv when running interactively
+argv = [] if is_interactive() else sys.argv[1:]
+(opts, args) = options.parse_args(argv)
+if len(args) > 0:  # any positional argument is rejected
+    options.error("this script takes no arguments.")
+    sys.exit(1)  # NOTE(review): OptionParser.error() is documented to exit on its own, so this line looks unreachable — confirm
+
+print(__doc__)  # prints the module docstring
+options.print_help()
+print()
+
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import HashingVectorizer
+from time import time
+# Feature extraction using a sparse vectorizer; which one depends on --use_hashing
+tnought = time()  # start of the timed section
+if opts.use_hashing:
+    Vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
+                                   n_features=opts.n_features)
+    X_train_new = Vectorizer.transform(X_train)  # hashing path: transform only, no fit call
+else:
+    Vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
+                                 stop_words='english')
+    X_train_new = Vectorizer.fit_transform(X_train)  # TF-IDF path: fitted on the training text only
+TimeTakenTrain = time()-tnought
+print("done in %fs at %0.3fMB/s" % (TimeTakenTrain, Train_MB_size / TimeTakenTrain))
+print("n_samples: %d, n_features: %d" % X_train_new.shape)
+
+# Apply the same Vectorizer object to the test data (transform only)
+tnought=time()
+X_test_new=Vectorizer.transform(X_test)
+TimeTakenTest= time()-tnought
+print("done in %fs at %0.3fMB/s" % (TimeTakenTest, Test_MB_size / TimeTakenTest))
+print("n_samples: %d, n_features: %d" % X_test_new.shape)
+
+
+#SVM
+from sklearn.svm import LinearSVC
+classifiersvm=LinearSVC(multi_class='ovr',random_state=0)
+classifiersvm.fit(X_train_new,Y_train)
+
+Y_predsvm=classifiersvm.predict(X_test_new)
+
+from sklearn.metrics import confusion_matrix
+cmsvm=confusion_matrix(Y_test,Y_predsvm)  # computed but not printed
+
+from sklearn import metrics
+scoresvm=metrics.accuracy_score(Y_test, Y_predsvm)
+print("accuracy:   %0.3f" % scoresvm)
+# target_names is omitted: Categories is in first-appearance order, not the sorted label order the report uses.
+from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_predsvm))  # rows are labelled with the tag strings themselves
+
+#RandomForest
+from sklearn.ensemble import RandomForestClassifier
+classifierrf=RandomForestClassifier(n_estimators=10 , criterion='entropy' ,random_state=0)
+classifierrf.fit(X_train_new,Y_train)
+
+Y_predrf=classifierrf.predict(X_test_new)
+
+from sklearn.metrics import confusion_matrix
+cmrf=confusion_matrix(Y_test,Y_predrf)  # computed but not printed
+
+from sklearn import metrics
+scorerf=metrics.accuracy_score(Y_test, Y_predrf)
+print("accuracy:   %0.3f" % scorerf)
+# target_names is omitted: Categories is in first-appearance order, not the sorted label order the report uses.
+from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_predrf))  # rows are labelled with the tag strings themselves
+
+#NaiveBayes
+# MultinomialNB accepts the sparse, non-negative vectorizer output directly; GaussianNB requires dense input.
+from sklearn.naive_bayes import MultinomialNB as nb
+classifiernb=nb()
+classifiernb.fit(X_train_new,Y_train)
+Y_prednb=classifiernb.predict(X_test_new)  # predictions of the Naive Bayes model, used by the metrics below
+
+from sklearn.metrics import confusion_matrix
+cmnb=confusion_matrix(Y_test,Y_prednb)  # computed but not printed
+
+from sklearn import metrics
+scorenb=metrics.accuracy_score(Y_test, Y_prednb)
+print("accuracy:   %0.3f" % scorenb)
+# target_names is omitted: Categories is in first-appearance order, not the sorted label order the report uses.
+from sklearn.metrics import classification_report
+print(classification_report(Y_test,Y_prednb))  # rows are labelled with the tag strings themselves
+
+
+def make_meshgrid(x, y, h=.02):
+    """Build a rectangular grid of points covering the span of both inputs.
+
+    Parameters
+    ----------
+    x: values fixing the horizontal extent (needs .min() and .max())
+    y: values fixing the vertical extent (needs .min() and .max())
+    h: spacing between neighbouring grid points, optional
+
+    Returns
+    -------
+    xx, yy : ndarray
+        Coordinate matrices from np.meshgrid; the extent is padded by 1 on each side.
+    """
+    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
+    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
+    xx, yy = np.meshgrid(x_axis, y_axis)
+    return xx, yy
+
+
+def plot_contours(ax, clf, xx, yy, **params):
+    """Draw a classifier's predictions over a grid as filled contours.
+
+    Parameters
+    ----------
+    ax: matplotlib axes object to draw on
+    clf: a classifier exposing predict()
+    xx: meshgrid ndarray of x coordinates
+    yy: meshgrid ndarray of y coordinates
+    params: keyword arguments forwarded to ax.contourf, optional
+    """
+    grid_points = np.c_[xx.ravel(), yy.ravel()]
+    predictions = clf.predict(grid_points)
+    predictions = predictions.reshape(xx.shape)
+    return ax.contourf(xx, yy, predictions, **params)
+
+# Decision regions can only be drawn in two dimensions, while the classifiers
+# above are fitted on the full feature matrix.  The features are therefore
+# projected onto two SVD components and a separate LinearSVC is fitted on
+# that projection, purely for plotting.
+from matplotlib.colors import ListedColormap
+from sklearn.decomposition import TruncatedSVD
+from sklearn.svm import LinearSVC
+
+titles = ('SVM (Training set)', 'SVM (Test set)')
+
+Projector = TruncatedSVD(n_components=2, random_state=0)
+X_train_2d = Projector.fit_transform(X_train_new)
+X_test_2d = Projector.transform(X_test_new)
+
+# contourf needs numeric class values, so map each tag to an integer code.
+TagNames = np.unique(np.concatenate([Y_train, Y_test]))
+TagCode = {tag: code for code, tag in enumerate(TagNames)}
+Y_train_codes = np.array([TagCode[tag] for tag in Y_train])
+Y_test_codes = np.array([TagCode[tag] for tag in Y_test])
+
+classifier = LinearSVC(multi_class='ovr', random_state=0)
+classifier.fit(X_train_2d, Y_train_codes)
+
+# One colour per tag, shared by the filled regions and the scatter points.
+TagColors = ListedColormap(plt.cm.gist_ncar(np.linspace(0, 0.8, len(TagNames))))
+# Level boundaries at code +/- 0.5 so every integer code gets its own band.
+RegionLevels = np.arange(len(TagNames) + 1) - 0.5
+
+
+def plot_decision_regions(ax, X_set, y_set, title):
+    """Draw the 2-D classifier's regions on ax and overlay the points of X_set.
+
+    X_set is an (n, 2) array of projected samples and y_set the matching
+    integer tag codes; title becomes the axes title.
+    """
+    xx, yy = make_meshgrid(X_set[:, 0], X_set[:, 1])
+    plot_contours(ax, classifier, xx, yy,
+                  cmap=TagColors, levels=RegionLevels, alpha=0.5)
+    for code, tag in enumerate(TagNames):
+        members = y_set == code
+        ax.scatter(X_set[members, 0], X_set[members, 1],
+                   color=TagColors(code), s=20, edgecolors='k', label=tag)
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xlabel('SVD component 1')
+    ax.set_ylabel('SVD component 2')
+    ax.set_xticks(())
+    ax.set_yticks(())
+    ax.set_title(title)
+    ax.legend()
+
+
+fig, sub = plt.subplots(1, 2, figsize=(11,6))
+for X_set, y_set, title, ax in zip((X_train_2d, X_test_2d), (Y_train_codes, Y_test_codes), titles, sub.flatten()):
+    plot_decision_regions(ax, X_set, y_set, title)
+plt.show()