From 6d015f9937a506121149ad08803c689257bcb306 Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Fri, 9 Nov 2018 17:35:48 +0530 Subject: [PATCH 1/7] Update minor_project.py --- minor_project.py | 49 ++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/minor_project.py b/minor_project.py index 7d6f5ca..55fcfeb 100644 --- a/minor_project.py +++ b/minor_project.py @@ -11,13 +11,13 @@ from collections import Counter from bs4 import BeautifulSoup #reading CSV Data -QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1') -TagsData=shortpanda.read_csv('Tags.csv') +QuestionData =shortpanda.read_csv('Questions.csv (1).zip',encoding='latin-1') +TagsData=shortpanda.read_csv('Tags.csv.zip') #creating a list of Tags from TagsData Taglist=TagsData.Tag.tolist() #creating a list of Tags from TagsData SortedTaglist=Counter(Taglist) -print(SortedTaglist) #analyzing whether the Data is sorted or not +print(SortedTaglist) #analyzing whether the Data is sorted or not list(QuestionData) CreationDateList=QuestionData.CreationDate.tolist() print(CreationDateList) @@ -33,7 +33,7 @@ print(Titlelist) -with open('TitleData.txt') as titlefile: +with open('TempData.txt') as titlefile: TitleRead=titlefile.read() Words = re.findall(r'\w+', TitleRead) @@ -47,7 +47,7 @@ #for cleaning HTML headers with open('SampleBodyData.txt') as Datafile: - text=Datafile.read() + text=Datafile.read() def HTML_ClEAN(text): @@ -55,33 +55,30 @@ def HTML_ClEAN(text): soup=BeautifulSoup(text,'html.parser') return soup.get_text + # for removing unnecessary code snippets, ,links, URL... def remove_CodeSnippet(text): - return re.sub('
.*?', '', text)
-
-
+ return re.sub('.*?', '', str(text))
+
#replacing paragraph and next line headers with a blank string
def remove_Para(text):
- text= re.sub('', '', text)
- text= re.sub('\\n', '', text)
- text= re.sub('', '', text) + text= re.sub('
', '', str(text)) + text= re.sub('\\n', '', str(text)) + text= re.sub('', '', str(text)) return text - #implementing the De-noise Functions to clean the SampleData def De_noise(text): - text= HTML_ClEAN(text) + text= HTML_ClEAN(text) text= remove_CodeSnippet(text) text= remove_Para(text) return text + - - - #Non-Ascii Words are ignored for better accuracy purpose def is_Non_Ascii(ProcessedSampleBodyData): NewProcessedSampleBodyData = [] @@ -90,6 +87,7 @@ def is_Non_Ascii(ProcessedSampleBodyData): NewProcessedSampleBodyData.append(temp) return NewProcessedSampleBodyData + #converting everyword to lowercase to remove redundancy for ex-is & IS def Case_lower(ProcessedSampleBodyData): NewProcessedSampleBodyData = [] @@ -98,15 +96,17 @@ def Case_lower(ProcessedSampleBodyData): NewProcessedSampleBodyData.append(Temp) return NewProcessedSampleBodyData + #removing Punctuation like,0-;] for better data quality def TextClean(ProcessedSampleBodyData): NewProcessedSampleBodyData = [] for word in ProcessedSampleBodyData: - temp = re.sub(r'[^\w\s]', '', word) + temp = re.sub(r'[^\w\s]', '', str(word)) if NewProcessedSampleBodyData != '': NewProcessedSampleBodyData .append(temp) return NewProcessedSampleBodyData + #removing Numbers for better tag prediction def Number_Removal(ProcessedSampleBodyData): use = inflect.engine() @@ -119,6 +119,8 @@ def Number_Removal(ProcessedSampleBodyData): NewProcessedSampleBodyData.append(word) return NewProcessedSampleBodyData +nltk.download('stopwords') + #filtering out StopWords to before processing natural data def StopWord_Removal(ProcessedSampleBodyData): @@ -129,7 +131,6 @@ def StopWord_Removal(ProcessedSampleBodyData): return NewProcessedSampleBodyData - def WordProcessing(Body_word): Body_word=is_Non_Ascii(Body_word) @@ -139,9 +140,14 @@ def WordProcessing(Body_word): Body_word=StopWord_Removal(Body_word) return Body_word - - DataText= De_noise(text) + detext= De_noise(text) + print(detext) + newtext= WordProcessing(detext) + print(newtext) + + nltk.download('punkt') + #Tokenising the sampledata #Tokenising is converting text to words ProcessedSampleBodyData = nltk.word_tokenize(text) @@ -152,5 +158,4 @@ def WordProcessing(Body_word): ProcessedBodyWord= WordProcessing(ProcessedSampleBodyData) count=Counter( ProcessedBodyWord) - - + print(count) From 647045db7fa783054bcbc1d0425879e031c1b7bb Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Sat, 10 Nov 2018 23:01:08 +0530 Subject: [PATCH 2/7] Update minor_project.py --- minor_project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/minor_project.py b/minor_project.py index 55fcfeb..65ed211 100644 --- a/minor_project.py +++ b/minor_project.py @@ -11,8 +11,8 @@ from collections import Counter from bs4 import BeautifulSoup #reading CSV Data -QuestionData =shortpanda.read_csv('Questions.csv (1).zip',encoding='latin-1') -TagsData=shortpanda.read_csv('Tags.csv.zip') +QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1') +TagsData=shortpanda.read_csv('Tags.csv') #creating a list of Tags from TagsData Taglist=TagsData.Tag.tolist() #creating a list of Tags from TagsData From e23bcd20e805e2e0d42dd51ff93d7e3d69377a49 Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Sat, 10 Nov 2018 23:05:01 +0530 Subject: [PATCH 3/7] Update minor_project.py --- minor_project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minor_project.py b/minor_project.py index 65ed211..0d59081 100644 --- a/minor_project.py +++ b/minor_project.py @@ -33,7 +33,7 @@ print(Titlelist) -with open('TempData.txt') as titlefile: +with open('TitleData.txt') as titlefile: TitleRead=titlefile.read() Words = re.findall(r'\w+', TitleRead) From 2b0d23586c36b4e87927d601a4e34509b494f179 Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Sat, 17 Nov 2018 03:47:24 +0530 Subject: [PATCH 4/7] Update minor_project.py --- minor_project.py | 372 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 266 insertions(+), 106 deletions(-) diff --git a/minor_project.py b/minor_project.py index 0d59081..36bcf39 100644 --- a/minor_project.py +++ b/minor_project.py @@ -1,134 +1,93 @@ -#importing the libraries -import pandas as shortpanda -import numpy as shortnum +import pandas as pd +from collections import Counter +import string import re +import numpy as np import nltk from nltk import word_tokenize, sent_tokenize from nltk.corpus import stopwords -import datetime import unicodedata import inflect -from collections import Counter from bs4 import BeautifulSoup -#reading CSV Data -QuestionData =shortpanda.read_csv('Questions.csv',encoding='latin-1') -TagsData=shortpanda.read_csv('Tags.csv') -#creating a list of Tags from TagsData -Taglist=TagsData.Tag.tolist() -#creating a list of Tags from TagsData -SortedTaglist=Counter(Taglist) -print(SortedTaglist) #analyzing whether the Data is sorted or not -list(QuestionData) -CreationDateList=QuestionData.CreationDate.tolist() -print(CreationDateList) - -#finding the week number -Week= datetime.date(2014,4,17).isocalendar()[1] -Week= datetime.date(2012,6,12).isocalendar()[1] - - -#counting the number of most frequent word in Title Data -Titlelist=QuestionData.Title.tolist() +import matplotlib.pyplot as plt +Questions=pd.read_csv('Questions.csv',encoding='latin-1') +TagData=pd.read_csv('Tags.csv',encoding='latin-1') -print(Titlelist) - - -with open('TitleData.txt') as titlefile: - TitleRead=titlefile.read() - Words = re.findall(r'\w+', TitleRead) - - WordCount = Counter(Words) - -#Cleaning Data by removing HTML Tags ,Links and Code Snippets - - Bodylist=QuestionData.Body.tolist() # list of unproceesed question data -#Using BeautifulSoup for Noise Removal - -#for cleaning HTML headers -with open('SampleBodyData.txt') as Datafile: - text=Datafile.read() - +def text_clean(text): + global Blank + Blank='' + if not isinstance(text,str): + return text + text=re.sub('
.*?',Blank,str(text))
+ def clean_link(match):
+ return Blank if re.match('[a-z]+://',match.group(1)) else match.group(1)
+ text = re.sub(']+>(.*)',clean_link,str(text))
+ return re.sub('<[^>]+>',Blank,str(text))
def HTML_ClEAN(text):
soup=BeautifulSoup(text,'html.parser')
return soup.get_text
-
# for removing unnecessary code snippets, ,links, URL...
def remove_CodeSnippet(text):
return re.sub('.*?', '', str(text))
-
-
+
#replacing paragraph and next line headers with a blank string
-def remove_Para(text):
-
- text= re.sub('', '', str(text))
- text= re.sub('\\n', '', str(text))
- text= re.sub('', '', str(text)) - return text - - #implementing the De-noise Functions to clean the SampleData def De_noise(text): - text= HTML_ClEAN(text) + text= HTML_ClEAN(text) text= remove_CodeSnippet(text) - text= remove_Para(text) return text - -#Non-Ascii Words are ignored for better accuracy purpose -def is_Non_Ascii(ProcessedSampleBodyData): - NewProcessedSampleBodyData = [] - for word in ProcessedSampleBodyData: + #Non-Ascii Words are ignored for better accuracy purpose +def is_Non_Ascii(ProcessedBodyData): + NewProcessedBodyData = [] + for word in ProcessedBodyData: temp = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore') - NewProcessedSampleBodyData.append(temp) - return NewProcessedSampleBodyData - + NewProcessedBodyData.append(temp) + return NewProcessedBodyData #converting everyword to lowercase to remove redundancy for ex-is & IS -def Case_lower(ProcessedSampleBodyData): - NewProcessedSampleBodyData = [] - for word in ProcessedSampleBodyData: +def Case_lower(ProcessedBodyData): + NewProcessedBodyData = [] + for word in ProcessedBodyData: Temp = word.lower() - NewProcessedSampleBodyData.append(Temp) - return NewProcessedSampleBodyData - + NewProcessedBodyData.append(Temp) + return NewProcessedBodyData #removing Punctuation like,0-;] for better data quality -def TextClean(ProcessedSampleBodyData): - NewProcessedSampleBodyData = [] - for word in ProcessedSampleBodyData: +def TextClean(ProcessedBodyData): + NewProcessedBodyData = [] + for word in ProcessedBodyData: temp = re.sub(r'[^\w\s]', '', str(word)) - if NewProcessedSampleBodyData != '': - NewProcessedSampleBodyData .append(temp) - return NewProcessedSampleBodyData - + if NewProcessedBodyData != '': + NewProcessedBodyData .append(temp) + return NewProcessedBodyData #removing Numbers for better tag prediction -def Number_Removal(ProcessedSampleBodyData): +def Number_Removal(ProcessedBodyData): use = inflect.engine() - NewProcessedSampleBodyData = [] - for word in ProcessedSampleBodyData: + NewProcessedBodyData = [] + for word in ProcessedBodyData: if word.isdigit(): temp = use.number_to_words(word) - NewProcessedSampleBodyData.append(temp) + NewProcessedBodyData.append(temp) else: - NewProcessedSampleBodyData.append(word) - return NewProcessedSampleBodyData - -nltk.download('stopwords') + NewProcessedBodyData.append(word) + return NewProcessedBodyData #filtering out StopWords to before processing natural data -def StopWord_Removal(ProcessedSampleBodyData): +def StopWord_Removal(ProcessedBodyData): - NewProcessedSampleBodyData = [] - for word in ProcessedSampleBodyData: + NewProcessedBodyData = [] + for word in ProcessedBodyData: if word not in stopwords.words('english'): - NewProcessedSampleBodyData.append(word) - return NewProcessedSampleBodyData + NewProcessedBodyData.append(word) + return NewProcessedBodyData + def WordProcessing(Body_word): @@ -140,22 +99,223 @@ def WordProcessing(Body_word): Body_word=StopWord_Removal(Body_word) return Body_word - - detext= De_noise(text) - print(detext) - newtext= WordProcessing(detext) - print(newtext) - - nltk.download('punkt') - - #Tokenising the sampledata - #Tokenising is converting text to words - ProcessedSampleBodyData = nltk.word_tokenize(text) - print( ProcessedSampleBodyData) + +Questions['Text']=Questions['Body'].apply(text_clean).str.lower() +Questions['Text']=Questions['Text'].apply(De_noise) +Questions['Text']=Questions['Text'].apply(WordProcessing) +Questions.Text=Questions.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t","")) +Questions.Text[0] +TagData.Tag.nunique() +MostCommonTagCount=Counter(list(TagData.Tag)).most_common(40) +print(MostCommonTagCount) + +TagData = TagData[(TagData.Tag == 'javascript') | (TagData.Tag == 'java') | (TagData.Tag == 'c#') | (TagData.Tag =='php') | (TagData.Tag =='android') | (TagData.Tag == 'jquery') | (TagData.Tag == 'python') | (TagData.Tag == 'html') | (TagData.Tag == 'c++') | (TagData.Tag == 'windows')| (TagData.Tag == 'ios')] + +TagData.head() +TextandTags=TagData.merge(Questions,on='Id') +TextandTags.Tag +TextandTags.Text + +#TextandTags.to_csv("output.csv", index=False) +UnnecessaryColumns=['Id','OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body'] +TextandTags=TextandTags.drop( UnnecessaryColumns,axis=1,inplace=False) +TextandTags.Tag +Categories = TextandTags['Tag'].unique() + +fig = plt.figure(figsize=(11,6)) +BalTextandTags.groupby('Tag').Text.count().plot.bar(ylim=0) +plt.show() + +TextandTags = pd.DataFrame(TextandTags) +BalTextandTags = TextandTags.groupby('Tag') +BalTextandTags = pd.DataFrame(BalTextandTags.apply(lambda x: x.sample(BalTextandTags.size().min()).reset_index(drop=True))) +BalTextandTags.head() +BalTextandTags.Tag + + +MostCommonTagCount=Counter(list(BalTextandTags.Tag)).most_common(11) +print(MostCommonTagCount) + + + +from sklearn.model_selection import train_test_split +X_train,X_test,Y_train,Y_test=train_test_split(BalTextandTags['Text'],BalTextandTags['Tag'],random_state=42, + test_size=0.2,shuffle=True) + +def Convert_to_MB(Dataset): + Result=sum(len(s.encode('utf-8'))for s in Dataset)/ 1e6 + return Result +Train_MB_size=Convert_to_MB(X_train) +Test_MB_size=Convert_to_MB(X_test) +print("%d documents - %0.3fMB (training set)" % ( + len(X_train), Train_MB_size)) +print("%d documents - %0.3fMB (test set)" % ( + len(X_test),Test_MB_size)) +print("%d Categories" % len(Categories)) +print() + +from optparse import OptionParser +options = OptionParser() +options.add_option("--use_hashing", + action="store_true", + help="Use a hashing vectorizer.") +options.add_option("--n_features", + action="store", type=int, default=2 ** 16, + help="n_features when using the hashing vectorizer.") + +import sys +def is_interactive(): + return not hasattr(sys.modules['__main__'], '__file__') + +# work-around for Jupyter notebook and IPython console +argv = [] if is_interactive() else sys.argv[1:] +(opts, args) = options.parse_args(argv) +if len(args) > 0: + options.error("this script takes no arguments.") + sys.exit(1) + +print(__doc__) +options.print_help() +print() + + + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.feature_extraction.text import HashingVectorizer +from time import time +#feature extraction using sparse vectorizer +tnought = time() +if opts.use_hashing: + Vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, + n_features=opts.n_features) + X_train_new = Vectorizer.transform(X_train) +else: + Vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, + stop_words='english') + X_train_new = Vectorizer.fit_transform(X_train) +TimeTakenTrain = time()-tnought +print("done in %fs at %0.3fMB/s" % (TimeTakenTrain, Train_MB_size / TimeTakenTrain)) +print("n_samples: %d, n_features: %d" % X_train_new.shape) +print() +#using the vectoriser for the test data now +tnought=time() +X_test_new=Vectorizer.transform(X_test) +TimeTakenTest= time()-tnought +print("done in %fs at %0.3fMB/s" % (TimeTakenTest, Test_MB_size / TimeTakenTest)) +print("n_samples: %d, n_features: %d" % X_test_new.shape) +print() +#chi square test for conversion of Integer feature name to +#original String Token Name +from sklearn.feature_selection import SelectKBest, chi2 +if options.use_hashing: + feature_names=None +else: + feature_names=Vectorizer.get_feature_names() - #BodyWordCount=Counter(ProcessedSampleBodyData) - - ProcessedBodyWord= WordProcessing(ProcessedSampleBodyData) +if opts.select_chi2: + print("Extracting %d bestfeatures from chi-squared test" + % opts.select_chi2) + + options.add_option("--chi2_select", + action="store", type="int", dest="select_chi2", + help="Select some number of features using a chi-squared test") + tnought=time() + ch2=SelectKBest(chi2,k=opts.select_chi2) + X_train_new=ch2.fit_transform(X_train_new,Y_train) + X_test_new=ch2.transform(X_test_new) + if feature_names: + feature_names= [feature_names[i] for i + in ch2.get_support(indices=True)] + + print("done in %fs" % (time() - tnought)) + print() +if feature_names: + feature_names=np.asarray(feature_names) + +TargetClasses=Categories + +print(X_train_new) + + + +from sklearn.svm import LinearSVC +classifier=LinearSVC(multi_class='ovr',random_state=0) +classifier.fit(X_train_new,Y_train) + +Y_pred=classifier.predict(X_test_new) + +from sklearn.metrics import confusion_matrix +cm=confusion_matrix(Y_test,Y_pred) + +from sklearn import metrics + +score =metrics.accuracy_score(Y_test, Y_pred) +print("accuracy: %0.3f" % score) + +from sklearn.metrics import classification_report + +print(classification_report(Y_test,Y_pred,target_names=Categories)) + + + + +# Visualising the Training set results +from matplotlib.colors import ListedColormap +X_set, y_set = X_train_new, Y_train +X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), + np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01)) +plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape), + alpha = 0.75, cmap = ListedColormap(('red', 'green'))) +plt.xlim(X1.min(), X1.max()) +plt.ylim(X2.min(), X2.max()) +for i, j in enumerate(np.unique(y_set)): + plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], + c = ListedColormap(('red', 'green'))(i), label = j) +plt.title('SVM (Training set)') +plt.xlabel('Questions') +plt.ylabel('Tag') +plt.legend() +plt.show() + +indices = np.arange(len()) + + = [[x[i] for x in results] for i in range(4)] + +clf_names, score, training_time, test_time = +training_time = np.array(training_time) / np.max(training_time) +test_time = np.array(test_time) / np.max(test_time) + +plt.figure(figsize=(12, 8)) +plt.title("Score") +plt.barh(indices, score, .2, label="score", color='navy') +plt.barh(indices + .3, training_time, .2, label="training time", + color='c') +plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') +plt.yticks(()) +plt.legend(loc='best') +plt.subplots_adjust(left=.25) +plt.subplots_adjust(top=.95) +plt.subplots_adjust(bottom=.05) + +for i, c in zip(indices, clf_names): + plt.text(-.3, i, c) + +plt.show() - count=Counter( ProcessedBodyWord) - print(count) +# Visualising the Test set results +from matplotlib.colors import ListedColormap +X_set, y_set = X_test_new, Y_test +X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), + np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01)) +plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape), + alpha = 0.75, cmap = ListedColormap(('red', 'green'))) +plt.xlim(X1.min(), X1.max()) +plt.ylim(X2.min(), X2.max()) +for i, j in enumerate(np.unique(y_set)): + plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], + c = ListedColormap(('red', 'green'))(i), label = j) +plt.title('SVM (Test set)') +plt.xlabel('Questions') +plt.ylabel('Tag') +plt.legend() +plt.show() From c419e120c4c0ab1e1f60ad7969d28df898457b9b Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Wed, 21 Nov 2018 00:02:05 +0530 Subject: [PATCH 5/7] Update minor_project.py --- minor_project.py | 174 ++++++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 77 deletions(-) diff --git a/minor_project.py b/minor_project.py index 36bcf39..f8a991f 100644 --- a/minor_project.py +++ b/minor_project.py @@ -1,3 +1,10 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Nov 17 14:57:53 2018 + +@author: Naman Kalra +""" + import pandas as pd from collections import Counter import string @@ -101,10 +108,7 @@ def WordProcessing(Body_word): return Body_word Questions['Text']=Questions['Body'].apply(text_clean).str.lower() -Questions['Text']=Questions['Text'].apply(De_noise) -Questions['Text']=Questions['Text'].apply(WordProcessing) -Questions.Text=Questions.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t","")) -Questions.Text[0] + TagData.Tag.nunique() MostCommonTagCount=Counter(list(TagData.Tag)).most_common(40) print(MostCommonTagCount) @@ -116,14 +120,13 @@ def WordProcessing(Body_word): TextandTags.Tag TextandTags.Text -#TextandTags.to_csv("output.csv", index=False) UnnecessaryColumns=['Id','OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body'] TextandTags=TextandTags.drop( UnnecessaryColumns,axis=1,inplace=False) -TextandTags.Tag Categories = TextandTags['Tag'].unique() +print(Categories) -fig = plt.figure(figsize=(11,6)) -BalTextandTags.groupby('Tag').Text.count().plot.bar(ylim=0) +graph = plt.figure(figsize=(11,6)) +TextandTags.groupby('Tag').Text.count().plot.bar(ylim=0) plt.show() TextandTags = pd.DataFrame(TextandTags) @@ -132,16 +135,22 @@ def WordProcessing(Body_word): BalTextandTags.head() BalTextandTags.Tag +BalTextandTags.to_csv("output.csv", index=False) + +#BalTextandTags['Text']=BalTextandTags['Text'].apply(De_noise) +#BalTextandTags['Text']=BalTextandTags['Text'].apply(WordProcessing) +BalTextandTags.Text=BalTextandTags.Text.apply(lambda x:x.replace('"','').replace("\n","").replace("\t","")) +BalTextandTags.Text[0] MostCommonTagCount=Counter(list(BalTextandTags.Tag)).most_common(11) print(MostCommonTagCount) - - from sklearn.model_selection import train_test_split X_train,X_test,Y_train,Y_test=train_test_split(BalTextandTags['Text'],BalTextandTags['Tag'],random_state=42, test_size=0.2,shuffle=True) + + def Convert_to_MB(Dataset): Result=sum(len(s.encode('utf-8'))for s in Dataset)/ 1e6 return Result @@ -152,7 +161,6 @@ def Convert_to_MB(Dataset): print("%d documents - %0.3fMB (test set)" % ( len(X_test),Test_MB_size)) print("%d Categories" % len(Categories)) -print() from optparse import OptionParser options = OptionParser() @@ -196,68 +204,105 @@ def is_interactive(): TimeTakenTrain = time()-tnought print("done in %fs at %0.3fMB/s" % (TimeTakenTrain, Train_MB_size / TimeTakenTrain)) print("n_samples: %d, n_features: %d" % X_train_new.shape) -print() + #using the vectoriser for the test data now tnought=time() X_test_new=Vectorizer.transform(X_test) TimeTakenTest= time()-tnought print("done in %fs at %0.3fMB/s" % (TimeTakenTest, Test_MB_size / TimeTakenTest)) print("n_samples: %d, n_features: %d" % X_test_new.shape) -print() -#chi square test for conversion of Integer feature name to -#original String Token Name -from sklearn.feature_selection import SelectKBest, chi2 -if options.use_hashing: - feature_names=None -else: - feature_names=Vectorizer.get_feature_names() - -if opts.select_chi2: - print("Extracting %d bestfeatures from chi-squared test" - % opts.select_chi2) - - options.add_option("--chi2_select", - action="store", type="int", dest="select_chi2", - help="Select some number of features using a chi-squared test") - tnought=time() - ch2=SelectKBest(chi2,k=opts.select_chi2) - X_train_new=ch2.fit_transform(X_train_new,Y_train) - X_test_new=ch2.transform(X_test_new) - if feature_names: - feature_names= [feature_names[i] for i - in ch2.get_support(indices=True)] - - print("done in %fs" % (time() - tnought)) - print() -if feature_names: - feature_names=np.asarray(feature_names) -TargetClasses=Categories -print(X_train_new) +#SVM +from sklearn.svm import LinearSVC +classifiersvm=LinearSVC(multi_class='ovr',random_state=0) +classifiersvm.fit(X_train_new,Y_train) + +Y_predsvm=classifiersvm.predict(X_test_new) +from sklearn.metrics import confusion_matrix +cmsvm=confusion_matrix(Y_test,Y_predsvm) +from sklearn import metrics +scoresvm=metrics.accuracy_score(Y_test, Y_predsvm) +print("accuracy: %0.3f" % scoresvm) -from sklearn.svm import LinearSVC -classifier=LinearSVC(multi_class='ovr',random_state=0) -classifier.fit(X_train_new,Y_train) +from sklearn.metrics import classification_report +print(classification_report(Y_test,Y_predsvm,target_names=Categories)) -Y_pred=classifier.predict(X_test_new) +#RandomForest +from sklearn.ensemble import RandomForestClassifier +classifierrf=RandomForestClassifier(n_estimators=10 , criterion='entropy' ,random_state=0) +classifierrf.fit(X_train_new,Y_train) + +Y_predrf=classifierrf.predict(X_test_new) from sklearn.metrics import confusion_matrix -cm=confusion_matrix(Y_test,Y_pred) +cmrf=confusion_matrix(Y_test,Y_predrf) from sklearn import metrics - -score =metrics.accuracy_score(Y_test, Y_pred) -print("accuracy: %0.3f" % score) +scorerf=metrics.accuracy_score(Y_test, Y_predrf) +print("accuracy: %0.3f" % scorerf) from sklearn.metrics import classification_report +print(classification_report(Y_test,Y_predrf,target_names=Categories)) + +def make_meshgrid(x, y, h=.02): + """Create a mesh of points to plot in + + Parameters + ---------- + x: data to base x-axis meshgrid on + y: data to base y-axis meshgrid on + h: stepsize for meshgrid, optional + + Returns + ------- + xx, yy : ndarray + """ + x_min, x_max = x.min() - 1, x.max() + 1 + y_min, y_max = y.min() - 1, y.max() + 1 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + return xx, yy + + +def plot_contours(ax, clf, xx, yy, **params): + """Plot the decision boundaries for a classifier. + + Parameters + ---------- + ax: matplotlib axes object + clf: a classifier + xx: meshgrid ndarray + yy: meshgrid ndarray + params: dictionary of params to pass to contourf, optional + """ + Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) + Z = Z.reshape(xx.shape) + out = ax.contourf(xx, yy, Z, **params) + return out + +titles = ('SVC with linear kernel') -print(classification_report(Y_test,Y_pred,target_names=Categories)) - +fig = plt.figure(figsize=(11,6)) +X0, X1 = X_train_new[:, 0], X_train_new[:, 1] +xx, yy = make_meshgrid(X0, X1) + +for clf, title, ax in zip(classifier, titles, sub.flatten()): + plot_contours(ax, clf, xx, yy, + cmap=plt.cm.coolwarm, alpha=0.8) + ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k') + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xlabel('Sepal length') + ax.set_ylabel('Sepal width') + ax.set_xticks(()) + ax.set_yticks(()) + ax.set_title(title) +plt.show() # Visualising the Training set results from matplotlib.colors import ListedColormap @@ -277,31 +322,6 @@ def is_interactive(): plt.legend() plt.show() -indices = np.arange(len()) - - = [[x[i] for x in results] for i in range(4)] - -clf_names, score, training_time, test_time = -training_time = np.array(training_time) / np.max(training_time) -test_time = np.array(test_time) / np.max(test_time) - -plt.figure(figsize=(12, 8)) -plt.title("Score") -plt.barh(indices, score, .2, label="score", color='navy') -plt.barh(indices + .3, training_time, .2, label="training time", - color='c') -plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') -plt.yticks(()) -plt.legend(loc='best') -plt.subplots_adjust(left=.25) -plt.subplots_adjust(top=.95) -plt.subplots_adjust(bottom=.05) - -for i, c in zip(indices, clf_names): - plt.text(-.3, i, c) - -plt.show() - # Visualising the Test set results from matplotlib.colors import ListedColormap X_set, y_set = X_test_new, Y_test From ed7b373dac09a77a337f4b29c4fa7321b33a019b Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Wed, 21 Nov 2018 13:04:49 +0530 Subject: [PATCH 6/7] Update minor_project.py --- minor_project.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/minor_project.py b/minor_project.py index f8a991f..1693127 100644 --- a/minor_project.py +++ b/minor_project.py @@ -247,6 +247,24 @@ def is_interactive(): from sklearn.metrics import classification_report print(classification_report(Y_test,Y_predrf,target_names=Categories)) +#NaiveBayes + +from sklearn.naive_bayes import GaussianNB as nb +classifiernb=nb() +classifiernb.fit(X_train_new,Y_train) +Y_predrf=classifierrf.predict(X_test_new) + +from sklearn.metrics import confusion_matrix +cmnb=confusion_matrix(Y_test,Y_prednb) + +from sklearn import metrics +scorenb=metrics.accuracy_score(Y_test, Y_prednb) +print("accuracy: %0.3f" % scorenb) + +from sklearn.metrics import classification_report +print(classification_report(Y_test,Y_prednb,target_names=Categories)) + + def make_meshgrid(x, y, h=.02): """Create a mesh of points to plot in From ff122528f7217a0451fad97847bd2035518399a0 Mon Sep 17 00:00:00 2001 From: Naman Kalra <43729631+namankalra@users.noreply.github.com> Date: Wed, 21 Nov 2018 14:53:03 +0530 Subject: [PATCH 7/7] Update minor_project.py --- minor_project.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/minor_project.py b/minor_project.py index 1693127..a486cd2 100644 --- a/minor_project.py +++ b/minor_project.py @@ -20,6 +20,25 @@ Questions=pd.read_csv('Questions.csv',encoding='latin-1') TagData=pd.read_csv('Tags.csv',encoding='latin-1') +def plot_tags(tagCount): + + x,y = zip(*tagCount) + + colormap = plt.cm.gist_ncar #nipy_spectral, Set1,Paired + colors = [colormap(i) for i in np.linspace(0, 0.8,50)] + + area = [i/4000 for i in list(y)] # 0 to 15 point radiuses + plt.figure(figsize=(10,6)) + plt.ylabel("Number of question associations") + for i in range(len(y)): + plt.plot(i,y[i], marker='o', linestyle='',ms=area[i],label=x[i]) + + plt.legend(numpoints=1) + plt.show() +import collections +tagCount = collections.Counter(list(TagData['Tag'])).most_common(10) +print(tagCount) +plot_tags(tagCount) def text_clean(text): global Blank