# %%
import pandas as pd
import numpy as np
import warnings
import nltk
import os
from nltk import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# %%
# Fetch the NLTK resources used below: the English stop-word list and the
# 'punkt' models that word_tokenize relies on. Both calls are no-ops when
# the packages are already up to date.
nltk.download('stopwords')
nltk.download('punkt')

# After this line `stopwords` is a plain list of English stop words (it
# replaces the corpus reader imported above, and later cells rely on that).
# The reader is reached through `nltk.corpus` instead of the bare name so the
# cell can be re-run: `stopwords.words(...)` would fail the second time
# because `stopwords` is already a list by then.
stopwords = nltk.corpus.stopwords.words('english')

# Directory holding the training transcripts (one file per transcript).
path = 'data/training/'

# os.listdir returns entries in arbitrary order and includes hidden files
# (e.g. macOS '.DS_Store'); sort and skip dot-files so that `dirs[0]` and the
# assembled training set are reproducible across machines.
dirs = sorted(name for name in os.listdir(path) if not name.startswith('.'))
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idspeakercontentlabel
01COOPERAnd welcome back.0
12COOPERThere is history all around us here at downtow...0
23COOPERIt was nearly destroyed by fire in 1966.0
34COOPERIt was lovingly restored in the 80s and has be...0
45COOPERIt is a magnificent theater.0
56COOPERWe’re here in the mid of a history-making Repu...0
67COOPERJoining us now is Ohio’s governor, John Kasich.0
78SYSTEM(APPLAUSE)0
89KASICHHow are you?0
910COOPERGood to see you.0
1011COOPERHave a seat.0
1112COOPERAll right, we got a lot of questions from the ...0
1213COOPERA couple of questions on news of the day.0
1314COOPERDonald Trump’s campaign manager, Corey Lewando...0
1415COOPERWould you fire him?0
1516KASICHWell, I haven’t seen the video, but they tell ...0
1617KASICHOf course I would.0
1718KASICHLook, when you have problems like that, you ha...0
1819KASICHNow, I’ve been of course an executive running ...0
1920KASICHAnd we see things that happen.0
\n", + "
" + ], + "text/plain": [ + " id speaker content label\n", + "0 1 COOPER And welcome back. 0\n", + "1 2 COOPER There is history all around us here at downtow... 0\n", + "2 3 COOPER It was nearly destroyed by fire in 1966. 0\n", + "3 4 COOPER It was lovingly restored in the 80s and has be... 0\n", + "4 5 COOPER It is a magnificent theater. 0\n", + "5 6 COOPER We’re here in the mid of a history-making Repu... 0\n", + "6 7 COOPER Joining us now is Ohio’s governor, John Kasich. 0\n", + "7 8 SYSTEM (APPLAUSE) 0\n", + "8 9 KASICH How are you? 0\n", + "9 10 COOPER Good to see you. 0\n", + "10 11 COOPER Have a seat. 0\n", + "11 12 COOPER All right, we got a lot of questions from the ... 0\n", + "12 13 COOPER A couple of questions on news of the day. 0\n", + "13 14 COOPER Donald Trump’s campaign manager, Corey Lewando... 0\n", + "14 15 COOPER Would you fire him? 0\n", + "15 16 KASICH Well, I haven’t seen the video, but they tell ... 0\n", + "16 17 KASICH Of course I would. 0\n", + "17 18 KASICH Look, when you have problems like that, you ha... 0\n", + "18 19 KASICH Now, I’ve been of course an executive running ... 0\n", + "19 20 KASICH And we see things that happen. 
# %%
# Preview the first training file. The files are tab-separated and have no
# header row, so the column names are supplied explicitly.
train = pd.read_csv(path + dirs[0], sep='\t', header=None,
                    names=['id', 'speaker', 'content', 'label'])
train.head(20)

# %%
# Tokens discarded after stop-word removal (and stemming, when enabled).
# NOTE(review): 'DBSCAN' can never match because the text is lowercased
# before tokenization -- it looks like a paste accident; confirm and remove.
_PUNCTUATION = frozenset([
    ',', '’', '\'', '.', 'DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&',
    '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}',
])


def _clean_text(text, stop_set, stemmer=None):
    """Return *text* lowercased, tokenized, filtered and re-joined by spaces."""
    if pd.isna(text):
        # A missing value has no .lower(); treat it as empty text instead of
        # aborting the whole run.
        return ''
    tokens = [tok for tok in word_tokenize(str(text).lower())
              if tok not in stop_set]
    if stemmer is not None:
        tokens = [stemmer.stem(tok) for tok in tokens]
    return ' '.join(tok for tok in tokens if tok not in _PUNCTUATION)


def pre_processing(data, stemming=False, stopwords=stopwords):
    """Normalize the ``content`` column of *data* in place and return *data*.

    Each entry is lowercased, tokenized with ``word_tokenize``, stripped of
    stop words, optionally stemmed with ``PorterStemmer``, stripped of the
    tokens listed in ``_PUNCTUATION`` and finally re-joined with single
    spaces. Missing entries become the empty string.

    Args:
        data: DataFrame with a ``content`` column. The column is overwritten
            in place; any index is accepted.
        stemming: Pass ``True`` to stem the tokens that survive stop-word
            removal.
        stopwords: Iterable of tokens to drop. Defaults to the module-level
            ``stopwords`` list as it was when this function was defined.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Set membership is O(1); the list would be scanned once per token.
    stop_set = set(stopwords)
    # `== True` rather than truthiness is deliberate: a stop-word list passed
    # positionally by mistake lands in `stemming` and must not switch
    # stemming on.
    stemmer = PorterStemmer() if stemming == True else None  # noqa: E712
    data['content'] = data['content'].map(
        lambda text: _clean_text(text, stop_set, stemmer)
    )
    return data


# %%
# Build the training set: every file in `dirs` is read and cleaned exactly
# once, then all frames are concatenated under a fresh 0..n-1 index.
frames = []
for filename in dirs:
    df = pd.read_csv(path + filename, sep='\t', header=None,
                     names=['id', 'speaker', 'content', 'label'])
    # Keyword argument: the second positional parameter is `stemming`.
    frames.append(pre_processing(df, stopwords=stopwords))
train = pd.concat(frames, ignore_index=True)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idspeakercontentlabel
01COOPERwelcome back0
12COOPERhistory around us downtown milwaukee riverside...0
23COOPERnearly destroyed fire 19660
34COOPERlovingly restored 80s heart city cultural life...0
45COOPERmagnificent theater0
56COOPERmid history-making republican presidential cam...0
67COOPERjoining us ohio governor john kasich0
89KASICH0
910COOPERgood see0
1011COOPERseat0
\n", + "
# %%
# Discard rows whose speaker is the literal 'SYSTEM' (in the preview these
# carry stage directions such as "(APPLAUSE)" rather than spoken text).
# The original row labels are kept, so the index has gaps afterwards.
is_system = train['speaker'] == 'SYSTEM'
train = train.loc[~is_system]
train.head(10)

# %%
# (rows, columns) of the filtered training set.
train.shape