-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlemmatized_data_clean.py
68 lines (51 loc) · 2.46 KB
/
lemmatized_data_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from matplotlib import pyplot as plt
df = pd.read_csv('../web_scrape_description_final.csv', error_bad_lines=False)
## tokenize the description to split individual words into tokens
df['Web_Description_Tokenized'] = df['Web_Description'].apply(word_tokenize)
##convert text into lower case
df['Web_Description_Tokenized'] = df['Web_Description_Tokenized'].apply(lambda x: [word.lower().strip() for word in x])
##remove stopwords
stops = stopwords.words("english")
df['Web_Description_Tokenized'] = df['Web_Description_Tokenized'].apply(lambda x: [word for word in x if word not in stops])
##remove punctuations
table = str.maketrans("", "", string.punctuation)
df['Web_Description_NoPunctuation'] = df['Web_Description_Tokenized'].apply(lambda x: [word.translate(table) for word in x])
##.apply(lambda x: [word for word in x if word not in punc])
df['Web_Description_Clean'] = df['Web_Description_NoPunctuation'].apply(lambda x: [word for word in x if word])
##add pos tags (useful for lemmatization)
df['Web_Description_PosTag'] = df['Web_Description_Clean'].apply(nltk.tag.pos_tag)
## define a function to get pos tags that can be used in lemmatization
from nltk.corpus import wordnet
def get_pos_tag(tag):
if tag.startswith('J'):
return wordnet.ADJ
elif tag.startswith('V'):
return wordnet.VERB
elif tag.startswith('N'):
return wordnet.NOUN
elif tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
##call get_pos_tag function for creating (word, tag) pair to be used in lemmatization
df['Web_Description_WordnetPos'] = df['Web_Description_PosTag'].apply(lambda x: [(word, get_pos_tag(pos_tag)) for (word, pos_tag) in x])
##lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['Web_Description_Lemmatized'] = df['Web_Description_WordnetPos'].apply(lambda x: [lemmatizer.lemmatize(word, tag) for word, tag in x])
##join the lemmatized words into a single text
df['Web_Description_LemmatizedString'] = [' '.join(map(str,l)) for l in df['Web_Description_Lemmatized']]
## save the cleaned df as csv
df.to_csv('./lemmatized_data.csv', index=False)