From b1e896a67cc05ee4fd997d50fc8c90f86321e717 Mon Sep 17 00:00:00 2001 From: smitkalathiya <83771700+smitkalathiya@users.noreply.github.com> Date: Thu, 4 Jan 2024 12:23:36 +0530 Subject: [PATCH] Update TextFeatureSelection.py Added Vocabulary and ngram_range parameter in TFIDF. Updated : _getPopulationAndMatrix , _computeFitness ,_get_parents. --- build/lib/TextFeatureSelection.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/build/lib/TextFeatureSelection.py b/build/lib/TextFeatureSelection.py index 9b2f181..6156834 100644 --- a/build/lib/TextFeatureSelection.py +++ b/build/lib/TextFeatureSelection.py @@ -494,7 +494,7 @@ def _cost_function_value(self,y_test,y_test_pred,cost_function,avrg): return metric - def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase): + def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range): ### create tfidf matrix for only terms which are in gene # get terms from gene and vocabulary combnation term_to_use=list(np.array(unique_words)[list(map(bool,gene))]) @@ -511,7 +511,7 @@ def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer, y_train, y_test = np.array(y)[train_index],np.array(y)[test_index] ##based on vocabulary set, create tfidf matrix for train and test data - tfidf=TfidfVectorizer(vocabulary=term_to_use,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + tfidf=TfidfVectorizer(vocabulary=term_to_use,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,ngram_range=ngram_range) tfidfvec_vectorizer=tfidf.fit(X_train) #get x train and test @@ -564,7 +564,7 @@ def 
_get_population(self,population,population_matrix,population_array): return population_matrix - def _get_parents(self,population_array,population_matrix,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase): + def _get_parents(self,population_array,population_matrix,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range): #keep space for best chromosome parents = np.empty((0,population_array.shape[0])) @@ -587,15 +587,15 @@ def _get_parents(self,population_array,population_matrix,unique_words,x,y,model, ##gene pool 1 gene_1 = population_matrix[index_run[0]] #cost of gene 1 - cost1=self._computeFitness(gene=gene_1,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + cost1=self._computeFitness(gene=gene_1,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) ##gene pool 2 gene_2 = population_matrix[index_run[1]] #cost of gene 2 - cost2=self._computeFitness(gene=gene_2,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + cost2=self._computeFitness(gene=gene_2,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) ##gene pool 3 gene_3 = population_matrix[index_run[2]] #cost of gene 3 - 
cost3=self._computeFitness(gene=gene_3,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + cost3=self._computeFitness(gene=gene_3,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) #get best chromosome from 3 and assign best chromosome. if cost1==max(cost1,cost2,cost3): @@ -681,7 +681,7 @@ def _mutation(self,child,prob_mutation): t = t+1 return mutated_child - def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase): + def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range): #get null free df temp_df=pd.DataFrame({'doc_list':doc_list,'label_list':label_list}) temp_df=temp_df[(~temp_df['doc_list'].isna()) & (~temp_df['label_list'].isna())] @@ -692,7 +692,7 @@ def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop gc.collect() #get unique tokens - tfidfvec = TfidfVectorizer(analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + tfidfvec = TfidfVectorizer(analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,vocabulary=vocabulary,ngram_range=ngram_range,lowercase=lowercase) tfidfvec_vectorizer = tfidfvec.fit(doc_list) unique_words=list(tfidfvec_vectorizer.vocabulary_.keys()) @@ -715,7 +715,7 @@ def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop return doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation - 
def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model_metric='f1',avrg='binary',analyzer='word',min_df=2,max_df=1.0,stop_words=None,tokenizer=None,token_pattern='(?u)\\b\\w\\w+\\b',lowercase=True): + def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model_metric='f1',avrg='binary',analyzer='word',min_df=2,max_df=1.0,stop_words=None,tokenizer=None,token_pattern='(?u)\\b\\w\\w+\\b',lowercase=True,vocabulary=None,ngram_range=(1,1)): ''' Data Parameters ---------- @@ -805,7 +805,7 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model avrg='binary' #get all parameters needed for GA - doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation=self._getPopulationAndMatrix(doc_list,label_list,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation=self._getPopulationAndMatrix(doc_list,label_list,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) #Execute GA for genrtn in range(self.generations): @@ -829,7 +829,7 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model # Doing it half the population size will mean getting matrix of population size equal to original matrix for family in range(int(self.population/2)): #get parents - 
parent1,parent2=self._get_parents(population_array=population_array,population_matrix=population_matrix,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) #crossover child1,child2=self._crossover(parent1=parent1,parent2=parent2,prob_crossover=self.prob_crossover) @@ -839,8 +839,8 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model mutated_child2=self._mutation(child=child2,prob_mutation=self.prob_mutation) #get cost function for 2 mutated child and print for generation, family and child - cost1=self._computeFitness(gene=mutated_child1,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) - cost2=self._computeFitness(gene=mutated_child2,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase) + cost1=self._computeFitness(gene=mutated_child1,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) + cost2=self._computeFitness(gene=mutated_child2,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range) #create 
population for next generaion new_population = np.vstack((new_population,mutated_child1,mutated_child2))