---
title: "[Kaggle] NLP: Disaster Tweets"
categories:
  - Kaggle
tags:
  - Kaggle
  - NLP
classes: wide
typora-copy-images-to: ..\images\2021-03-24
---
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
```

```python
# Load the data
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
display(train_df.sample(10))
```
|      | id    | keyword      | location            | text                                              | target |
|------|-------|--------------|----------------------|---------------------------------------------------|--------|
| 5223 | 7463  | obliteration | NaN                  | @tiggr_ why only Squad Obliteration?              | 1 |
| 3571 | 5103  | famine       | NaN                  | Robert Conquest Famine Museum Kiev @GuidoFawke... | 0 |
| 5429 | 7748  | police       | New York, NY         | #BREAKING411 4 police officers arrested for ab... | 1 |
| 1447 | 2086  | casualty     | NaN                  | I still don't know why independence day and so... | 0 |
| 3511 | 5018  | eyewitness   | Jammu and Kashmir    | Eyewitness accounts of survivors of Hiroshima ... | 1 |
| 821  | 1195  | blizzard     | Himalayan Mountains  | #Tweet4Taiji is a dolphin worship group based ... | 1 |
| 6758 | 9683  | tornado      | San Antonio, TX      | Pizza and beer in a tornado in Austin. Windy a... | 1 |
| 2593 | 3722  | destroyed    | USA                  | Black Eye 9: A space battle occurred at Star O... | 0 |
| 7519 | 10752 | wreckage     | Mumbai               | Wreckage 'Conclusively Confirmed' as From MH37... | 1 |
| 6078 | 8684  | sinkhole     | Haddonfield, NJ      | Georgia sinkhole closes road swallows whole po... | 1 |
```python
# Check the percentage of missing values in each column
train_df.isnull().sum() / train_df.shape[0] * 100
```

```
id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64
```
```python
# Check how balanced the target labels are
x = train_df['target'].value_counts()
plt.pie(x, labels=["Not Disaster", "Disaster"], autopct='%1.1f%%',
        shadow=True, explode=(0.05, 0), startangle=60)
```

*(Pie chart: Not Disaster 57.0%, Disaster 43.0%)*

## Keyword Analysis

```python
sns.barplot(y=train_df['keyword'].value_counts()[:20].index,
            x=train_df['keyword'].value_counts()[:20], orient='h')
```

*(Horizontal bar plot of the 20 most frequent keywords)*

```python
grouped_df = train_df.groupby('keyword').agg(['count', 'sum'])
grouped_df
```
| keyword | id count | id sum | text count | text sum | target count | target sum |
|---------|----------|--------|------------|----------|--------------|------------|
| ablaze | 36 | 2534 | 36 | @bbcmtd Wholesale Markets ablaze http://t.co/l... | 36 | 13 |
| accident | 35 | 4263 | 35 | 'I can't have kids cuz I got in a bicycle acci... | 35 | 24 |
| aftershock | 34 | 5825 | 34 | @afterShock_DeLo scuf ps live and the game... ... | 34 | 0 |
| airplane%20accident | 35 | 7705 | 35 | Experts in France begin examining airplane deb... | 35 | 30 |
| ambulance | 38 | 10224 | 38 | Early wake up call from my sister begging me t... | 38 | 20 |
| ... | ... | ... | ... | ... | ... | ... |
| wounded | 37 | 392538 | 37 | Gunmen kill four in El Salvador bus attack: Su... | 37 | 26 |
| wounds | 33 | 351859 | 33 | Gunshot wound #9 is in the bicep. The only one... | 33 | 10 |
| wreck | 37 | 396215 | 37 | @Squeaver just hangin out in star buck watchin... | 37 | 7 |
| wreckage | 39 | 419629 | 39 | Wreckage 'Conclusively Confirmed' as From MH37... | 39 | 39 |
| wrecked | 39 | 421617 | 39 | Wrecked an hour on YouTube with @julian_lage @... | 39 | 3 |

221 rows × 6 columns

```python
grouped_df['proportion'] = 100 * grouped_df['target']['sum'] / grouped_df['target']['count']
grouped_df
```
| keyword | id count | id sum | text count | text sum | target count | target sum | proportion |
|---------|----------|--------|------------|----------|--------------|------------|------------|
| ablaze | 36 | 2534 | 36 | @bbcmtd Wholesale Markets ablaze http://t.co/l... | 36 | 13 | 36.111111 |
| accident | 35 | 4263 | 35 | 'I can't have kids cuz I got in a bicycle acci... | 35 | 24 | 68.571429 |
| aftershock | 34 | 5825 | 34 | @afterShock_DeLo scuf ps live and the game... ... | 34 | 0 | 0.000000 |
| airplane%20accident | 35 | 7705 | 35 | Experts in France begin examining airplane deb... | 35 | 30 | 85.714286 |
| ambulance | 38 | 10224 | 38 | Early wake up call from my sister begging me t... | 38 | 20 | 52.631579 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| wounded | 37 | 392538 | 37 | Gunmen kill four in El Salvador bus attack: Su... | 37 | 26 | 70.270270 |
| wounds | 33 | 351859 | 33 | Gunshot wound #9 is in the bicep. The only one... | 33 | 10 | 30.303030 |
| wreck | 37 | 396215 | 37 | @Squeaver just hangin out in star buck watchin... | 37 | 7 | 18.918919 |
| wreckage | 39 | 419629 | 39 | Wreckage 'Conclusively Confirmed' as From MH37... | 39 | 39 | 100.000000 |
| wrecked | 39 | 421617 | 39 | Wrecked an hour on YouTube with @julian_lage @... | 39 | 3 | 7.692308 |

221 rows × 7 columns

```python
grouped_df.loc[grouped_df['proportion'] >= 50, 'keyword truth'] = 'high'
grouped_df.loc[grouped_df['proportion'] < 50, 'keyword truth'] = 'low'
grouped_df
```
| keyword | id count | id sum | text count | text sum | target count | target sum | proportion | keyword truth |
|---------|----------|--------|------------|----------|--------------|------------|------------|---------------|
| ablaze | 36 | 2534 | 36 | @bbcmtd Wholesale Markets ablaze http://t.co/l... | 36 | 13 | 36.111111 | low |
| accident | 35 | 4263 | 35 | 'I can't have kids cuz I got in a bicycle acci... | 35 | 24 | 68.571429 | high |
| aftershock | 34 | 5825 | 34 | @afterShock_DeLo scuf ps live and the game... ... | 34 | 0 | 0.000000 | low |
| airplane%20accident | 35 | 7705 | 35 | Experts in France begin examining airplane deb... | 35 | 30 | 85.714286 | high |
| ambulance | 38 | 10224 | 38 | Early wake up call from my sister begging me t... | 38 | 20 | 52.631579 | high |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| wounded | 37 | 392538 | 37 | Gunmen kill four in El Salvador bus attack: Su... | 37 | 26 | 70.270270 | high |
| wounds | 33 | 351859 | 33 | Gunshot wound #9 is in the bicep. The only one... | 33 | 10 | 30.303030 | low |
| wreck | 37 | 396215 | 37 | @Squeaver just hangin out in star buck watchin... | 37 | 7 | 18.918919 | low |
| wreckage | 39 | 419629 | 39 | Wreckage 'Conclusively Confirmed' as From MH37... | 39 | 39 | 100.000000 | high |
| wrecked | 39 | 421617 | 39 | Wrecked an hour on YouTube with @julian_lage @... | 39 | 3 | 7.692308 | low |

221 rows × 8 columns

```python
sns.countplot(grouped_df['keyword truth'])
```

*(Count plot of keywords labeled 'high' vs 'low')*

## WordCloud (Before)

```python
# Disaster tweets
wc = WordCloud(background_color="white", stopwords=STOPWORDS, max_words=500,
               width=600, height=600, random_state=1)
wc.generate(" ".join(train_df[train_df['target'] == 1]['text'].tolist()))
plt.figure(figsize=(10, 15))
plt.imshow(wc)
```

*(Word cloud of disaster tweets before cleaning)*

```python
# Non-disaster tweets
wc = WordCloud(background_color="white", stopwords=STOPWORDS, max_words=500,
               width=600, height=600, random_state=1)
wc.generate(" ".join(train_df[train_df['target'] == 0]['text'].tolist()))
plt.figure(figsize=(10, 15))
plt.imshow(wc)
```

*(Word cloud of non-disaster tweets before cleaning)*

## Data cleaning

```python
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from collections import Counter

punctuations = string.punctuation  # punctuation marks
stopwords = stopwords.words('english')  # stopwords: very frequent, low-information words
nlp = spacy.load('en_core_web_sm')


# Remove noise: lowercase, bracketed text, URLs, HTML tags,
# punctuation, newlines, and tokens containing digits
def cleanup_text1(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


def cleanup_text2(docs):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1

        doc = nlp(doc)
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']  # lemmatize and drop personal pronouns
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]  # drop stopwords and punctuation
        tokens = ' '.join(tokens)
        texts.append(tokens)
    return pd.Series(texts)
```

```
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
```
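As a quick illustration of what `cleanup_text1` strips out, here is a made-up tweet run through it (the example string and its output are illustrative assumptions, not rows from the dataset):

```python
sample = "Forest fire near La Ronge!! See https://t.co/xyz <b>now</b> [photo] 12 homes lost"
print(cleanup_text1(sample))
# roughly: 'forest fire near la ronge see  now   homes lost'
# the URL, HTML tags, bracketed text, punctuation, and the digit token are all removed
# (extra spaces remain where pieces were deleted)
```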
```python
# Apply cleanup_text1
train_df['text'] = train_df['text'].apply(lambda x: cleanup_text1(x))
test_df['text'] = test_df['text'].apply(lambda x: cleanup_text1(x))

# Split the texts by label
disaster_text = [text for text in train_df[train_df['target'] == 1]['text']]
not_disaster_text = [text for text in train_df[train_df['target'] == 0]['text']]

# Apply cleanup_text2
disaster_clean = cleanup_text2(disaster_text)
disaster_clean = ' '.join(disaster_clean).split()
disaster_clean = [word for word in disaster_clean if word != '\'s']  # drop leftover 's tokens

not_disaster_clean = cleanup_text2(not_disaster_text)
not_disaster_clean = ' '.join(not_disaster_clean).split()
not_disaster_clean = [word for word in not_disaster_clean if word != '\'s']

# Store {word: count} dictionaries
disaster_counts = Counter(disaster_clean)
not_disaster_counts = Counter(not_disaster_clean)
```

```
Processed 1000 out of 3271 documents.
Processed 2000 out of 3271 documents.
Processed 3000 out of 3271 documents.
Processed 1000 out of 4342 documents.
Processed 2000 out of 4342 documents.
Processed 3000 out of 4342 documents.
Processed 4000 out of 4342 documents.
```
```python
# Visualize the most frequent words in disaster tweets
plt.rcParams["font.size"] = 15
plt.rcParams["figure.figsize"] = (30, 15)

disaster_common_words = [word[0] for word in disaster_counts.most_common(25)]
disaster_common_counts = [word[1] for word in disaster_counts.most_common(25)]

sns.barplot(x=disaster_common_words, y=disaster_common_counts)
plt.title('Most Common Words Used in disaster')
plt.show()
```

```python
# Visualize the most frequent words in non-disaster tweets
not_disaster_common_words = [word[0] for word in not_disaster_counts.most_common(25)]
not_disaster_common_counts = [word[1] for word in not_disaster_counts.most_common(25)]

sns.barplot(x=not_disaster_common_words, y=not_disaster_common_counts)
plt.title('Most Common Words Used in Non_disaster')
plt.show()
```

## WordCloud (After)

```python
# Disaster tweets
wc = WordCloud(background_color="white", stopwords=STOPWORDS, max_words=500,
               width=600, height=600, random_state=1)
wc.generate(" ".join(disaster_clean))
plt.figure(figsize=(10, 15))
plt.imshow(wc)
```

*(Word cloud of disaster tweets after cleaning)*

```python
# Non-disaster tweets
wc = WordCloud(background_color="white", stopwords=STOPWORDS, max_words=500,
               width=600, height=600, random_state=1)
wc.generate(" ".join(not_disaster_clean))
plt.figure(figsize=(10, 15))
plt.imshow(wc)
```

*(Word cloud of non-disaster tweets after cleaning)*

## Model

```python
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM,
                          Embedding,
                          BatchNormalization,
                          Dense,
                          TimeDistributed,
                          Dropout,
                          Bidirectional,
                          Flatten,
                          GlobalMaxPool1D)
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    accuracy_score
)
```
```python
train = train_df['text'].values
test = test_df['text'].values
label = train_df['target'].values

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train)
vocab_length = len(word_tokenizer.word_index) + 1
```
```python
def metrics(pred, y_test):
    print("F1-score: ", f1_score(pred, y_test))
    print("Precision: ", precision_score(pred, y_test))
    print("Recall: ", recall_score(pred, y_test))
    print("Accuracy: ", accuracy_score(pred, y_test))
    print("-" * 50)
    print(classification_report(pred, y_test))
```
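One thing to keep in mind: scikit-learn's metric functions take arguments in `(y_true, y_pred)` order, while this helper is called as `metrics(pred, y_test)`. F1 and accuracy are symmetric and unaffected, but the printed "Precision" and "Recall" are effectively swapped relative to the usual convention. A minimal variant in the conventional order would look like this (`metrics_conventional` is an illustrative helper, shown for comparison only and not used for the results below):

```python
# Conventional argument order: (y_true, y_pred)
def metrics_conventional(y_true, y_pred):
    print("F1-score: ", f1_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:   ", recall_score(y_true, y_pred))
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
```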
```python
def embed(corpus):
    # convert each word to its integer index, preserving word order within each sentence
    return word_tokenizer.texts_to_sequences(corpus)
```
```python
def plot(history, arr):
    fig, ax = plt.subplots(1, 2, figsize=(20, 5))
    for idx in range(2):
        ax[idx].plot(history.history[arr[idx][0]])
        ax[idx].plot(history.history[arr[idx][1]])
        ax[idx].legend([arr[idx][0], arr[idx][1]], fontsize=18)
        ax[idx].set_xlabel('A ', fontsize=16)
        ax[idx].set_ylabel('B', fontsize=16)
        ax[idx].set_title(arr[idx][0] + ' X ' + arr[idx][1], fontsize=16)
```
```python
longest_train = max(train, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))

# Pad every row to the same length (arguments: sequences, maxlen, dtype, padding, truncating, value)
# padding / truncating = 'pre' or 'post': pad or cut from the front or from the back
# value: the fill value (default 0)
padded_sentences = pad_sequences(embed(train), length_long_sentence, padding='post')
test_sentences = pad_sequences(embed(test), length_long_sentence, padding='post')
```
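To make the effect of `embed` and `pad_sequences` concrete, here is a minimal toy example (the sentences and the exact index assignments are illustrative, not taken from the dataset):

```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy = ["forest fire near houses", "fire fire everywhere"]
toy_tok = Tokenizer()
toy_tok.fit_on_texts(toy)
print(toy_tok.word_index)   # e.g. {'fire': 1, 'forest': 2, 'near': 3, 'houses': 4, 'everywhere': 5}

seqs = toy_tok.texts_to_sequences(toy)
print(seqs)                 # e.g. [[2, 1, 3, 4], [1, 1, 5]]

print(pad_sequences(seqs, maxlen=4, padding='post'))
# [[2 1 3 4]
#  [1 1 5 0]]   <- the shorter sentence is zero-padded at the end
```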
```python
# Build a {word: vector} dictionary from the GloVe file
embeddings_dictionary = dict()
embedding_dim = 100
glove_file = open('../input/glove-file/glove.6B.100d.txt')
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()
```
```python
# Link each word in the training vocabulary to its GloVe vector
# (words not found in GloVe stay as all-zero rows)
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
```
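A quick optional sanity check (not part of the original notebook) is to see how much of the tweet vocabulary actually has a GloVe vector; anything missing remains an all-zero row in `embedding_matrix`:

```python
# Count how many vocabulary words were found in the GloVe dictionary (illustrative check)
covered = sum(1 for word in word_tokenizer.word_index if word in embeddings_dictionary)
print("GloVe coverage: %d / %d words" % (covered, len(word_tokenizer.word_index)))
```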
```python
X_train, X_test, y_train, y_test = train_test_split(
    padded_sentences,
    label,
    test_size=0.3
)
```
```python
def BLSTM():
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=length_long_sentence))
    # Bidirectional: process the sequence in both directions
    # length_long_sentence: number of LSTM units
    # return_sequences=True: keep the output of every time step (needed by GlobalMaxPool1D)
    model.add(Bidirectional(LSTM(length_long_sentence, return_sequences=True, recurrent_dropout=0.2)))
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model
```
```python
model = BLSTM()

# Save the weights after every epoch; do not overwrite unless val_loss improves
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

# Shrink the learning rate when val_loss stops improving:
# new_lr = lr * factor, triggered after `patience` epochs without improvement
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    verbose=1,
    patience=2,
    min_lr=0.001
)
```
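Note that `min_lr=0.001` equals RMSprop's default learning rate, so this callback cannot actually lower the rate below its starting value here; the "reducing learning rate to 0.001" messages in the log below simply clamp it at 0.001. A configuration that can take effect might look like this (illustrative values, not the settings used for the results below):

```python
# Variant whose floor is well below the starting learning rate (illustrative values)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=1e-5,
    verbose=1
)
```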

```python
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=[X_test, y_test],
    verbose=1,
    callbacks=[reduce_lr, checkpoint]
)
```
```
Epoch 1/10
167/167 [==============================] - 16s 62ms/step - loss: 0.7585 - accuracy: 0.5747 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00001: val_loss improved from inf to 0.00000, saving model to model.h5
Epoch 2/10
167/167 [==============================] - 10s 58ms/step - loss: 0.5923 - accuracy: 0.7070 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00002: val_loss did not improve from 0.00000
Epoch 3/10
167/167 [==============================] - 10s 59ms/step - loss: 0.5189 - accuracy: 0.7674 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.001.

Epoch 00003: val_loss did not improve from 0.00000
Epoch 4/10
167/167 [==============================] - 10s 59ms/step - loss: 0.4662 - accuracy: 0.8142 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00004: val_loss did not improve from 0.00000
Epoch 5/10
167/167 [==============================] - 10s 57ms/step - loss: 0.4459 - accuracy: 0.8218 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.001.

Epoch 00005: val_loss did not improve from 0.00000
Epoch 6/10
167/167 [==============================] - 10s 59ms/step - loss: 0.4165 - accuracy: 0.8322 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00006: val_loss did not improve from 0.00000
Epoch 7/10
167/167 [==============================] - 10s 57ms/step - loss: 0.3925 - accuracy: 0.8531 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.001.

Epoch 00007: val_loss did not improve from 0.00000
Epoch 8/10
167/167 [==============================] - 10s 58ms/step - loss: 0.3679 - accuracy: 0.8556 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00008: val_loss did not improve from 0.00000
Epoch 9/10
167/167 [==============================] - 10s 59ms/step - loss: 0.3484 - accuracy: 0.8652 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.001.

Epoch 00009: val_loss did not improve from 0.00000
Epoch 10/10
167/167 [==============================] - 10s 57ms/step - loss: 0.3380 - accuracy: 0.8720 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00

Epoch 00010: val_loss did not improve from 0.00000
```
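The `val_loss` and `val_accuracy` of 0.0000 in the log above show that the validation set was not actually evaluated during this run, which also means `ModelCheckpoint` kept the epoch-1 weights (this is why reloading `model.h5` further down gives worse scores than the final in-memory model). One possible cause, though not verified against this exact Keras version, is passing `validation_data` as a list; Keras documents it as a tuple, as in this sketch:

```python
# Same training call with validation_data passed as a tuple (sketch, not re-run here)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[reduce_lr, checkpoint]
)
```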
```python
plot(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])
```

*(Training curves: loss / val_loss and accuracy / val_accuracy)*

```python
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)
```

```
72/72 [==============================] - 1s 11ms/step - loss: 0.4768 - accuracy: 0.7942
Loss: 0.47684410214424133
Accuracy: 0.7942206859588623
```
```python
preds = model.predict_classes(X_test)
metrics(preds, y_test)
```

```
F1-score:  0.7232037691401648
Precision:  0.6329896907216495
Recall:  0.8434065934065934
Accuracy:  0.7942206654991243
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.77      0.84      1556
           1       0.63      0.84      0.72       728

    accuracy                           0.79      2284
   macro avg       0.77      0.81      0.78      2284
weighted avg       0.82      0.79      0.80      2284
```
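A side note on `predict_classes`: it only exists on `Sequential` models and has been deprecated (and later removed) in newer TensorFlow/Keras releases. For a single sigmoid output, the equivalent is to threshold `model.predict`:

```python
# Equivalent to predict_classes for a sigmoid output; also works on newer Keras versions
preds = (model.predict(X_test) > 0.5).astype("int32")
```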
```python
model.load_weights('model.h5')
preds = model.predict_classes(X_test)
metrics(preds, y_test)
```

```
F1-score:  0.7090754877014419
Precision:  0.8618556701030928
Recall:  0.6023054755043228
Accuracy:  0.6996497373029772
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.58      0.85      0.69       896
           1       0.86      0.60      0.71      1388

    accuracy                           0.70      2284
   macro avg       0.72      0.73      0.70      2284
weighted avg       0.75      0.70      0.70      2284
```
```python
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission.target = model.predict_classes(test_sentences)
submission.to_csv("submission.csv", index=False)
submission.target.value_counts().plot.bar();
```

```python
submission
```
|      | id    | target |
|------|-------|--------|
| 0    | 0     | 1 |
| 1    | 2     | 1 |
| 2    | 3     | 1 |
| 3    | 9     | 1 |
| 4    | 11    | 1 |
| ...  | ...   | ... |
| 3258 | 10861 | 1 |
| 3259 | 10865 | 1 |
| 3260 | 10868 | 1 |
| 3261 | 10874 | 1 |
| 3262 | 10875 | 1 |

3263 rows × 2 columns

## References