From 93d89dd751b48b43865a021704390e9ba5a4efb3 Mon Sep 17 00:00:00 2001
From: acs19qc <61751060+acs19qc@users.noreply.github.com>
Date: Wed, 15 Apr 2020 13:59:16 +0100
Subject: [PATCH] Add files via upload
---
pre_processing.ipynb | 499 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 499 insertions(+)
create mode 100644 pre_processing.ipynb
diff --git a/pre_processing.ipynb b/pre_processing.ipynb
new file mode 100644
index 0000000..1dbbe36
--- /dev/null
+++ b/pre_processing.ipynb
@@ -0,0 +1,499 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import warnings\n",
+ "import nltk\n",
+ "import os\n",
+ "from nltk import word_tokenize\n",
+ "import re\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import PorterStemmer \n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /Users/caoqianyu/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package punkt to /Users/caoqianyu/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fetch the NLTK resources used below (no-op when already present).\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('punkt')\n",
+ "# Reach the corpus through nltk.corpus: this assignment rebinds the name\n",
+ "# `stopwords` from the imported corpus reader to a plain list, so calling\n",
+ "# `stopwords.words(...)` would fail with AttributeError on a re-run.\n",
+ "stopwords = nltk.corpus.stopwords.words('english')\n",
+ "\n",
+ "# Directory holding the training files, and the names of its entries.\n",
+ "path = 'data/training/'\n",
+ "dirs = os.listdir(path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " speaker | \n",
+ " content | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " COOPER | \n",
+ " And welcome back. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " COOPER | \n",
+ " There is history all around us here at downtow... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " COOPER | \n",
+ " It was nearly destroyed by fire in 1966. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " COOPER | \n",
+ " It was lovingly restored in the 80s and has be... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " COOPER | \n",
+ " It is a magnificent theater. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " COOPER | \n",
+ " We’re here in the mid of a history-making Repu... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 7 | \n",
+ " COOPER | \n",
+ " Joining us now is Ohio’s governor, John Kasich. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 8 | \n",
+ " SYSTEM | \n",
+ " (APPLAUSE) | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 9 | \n",
+ " KASICH | \n",
+ " How are you? | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10 | \n",
+ " COOPER | \n",
+ " Good to see you. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 11 | \n",
+ " COOPER | \n",
+ " Have a seat. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 12 | \n",
+ " COOPER | \n",
+ " All right, we got a lot of questions from the ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 13 | \n",
+ " COOPER | \n",
+ " A couple of questions on news of the day. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 14 | \n",
+ " COOPER | \n",
+ " Donald Trump’s campaign manager, Corey Lewando... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 15 | \n",
+ " COOPER | \n",
+ " Would you fire him? | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 16 | \n",
+ " KASICH | \n",
+ " Well, I haven’t seen the video, but they tell ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 17 | \n",
+ " KASICH | \n",
+ " Of course I would. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 18 | \n",
+ " KASICH | \n",
+ " Look, when you have problems like that, you ha... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 19 | \n",
+ " KASICH | \n",
+ " Now, I’ve been of course an executive running ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 20 | \n",
+ " KASICH | \n",
+ " And we see things that happen. | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id speaker content label\n",
+ "0 1 COOPER And welcome back. 0\n",
+ "1 2 COOPER There is history all around us here at downtow... 0\n",
+ "2 3 COOPER It was nearly destroyed by fire in 1966. 0\n",
+ "3 4 COOPER It was lovingly restored in the 80s and has be... 0\n",
+ "4 5 COOPER It is a magnificent theater. 0\n",
+ "5 6 COOPER We’re here in the mid of a history-making Repu... 0\n",
+ "6 7 COOPER Joining us now is Ohio’s governor, John Kasich. 0\n",
+ "7 8 SYSTEM (APPLAUSE) 0\n",
+ "8 9 KASICH How are you? 0\n",
+ "9 10 COOPER Good to see you. 0\n",
+ "10 11 COOPER Have a seat. 0\n",
+ "11 12 COOPER All right, we got a lot of questions from the ... 0\n",
+ "12 13 COOPER A couple of questions on news of the day. 0\n",
+ "13 14 COOPER Donald Trump’s campaign manager, Corey Lewando... 0\n",
+ "14 15 COOPER Would you fire him? 0\n",
+ "15 16 KASICH Well, I haven’t seen the video, but they tell ... 0\n",
+ "16 17 KASICH Of course I would. 0\n",
+ "17 18 KASICH Look, when you have problems like that, you ha... 0\n",
+ "18 19 KASICH Now, I’ve been of course an executive running ... 0\n",
+ "19 20 KASICH And we see things that happen. 0"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the first listed training file: tab-separated, no header row.\n",
+ "train = pd.read_csv(\n",
+ "    os.path.join(path, dirs[0]),\n",
+ "    sep='\\t',\n",
+ "    header=None,\n",
+ "    names=['id', 'speaker', 'content', 'label'],\n",
+ ")\n",
+ "train.head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pre_processing(data,stemming = False, stopwords = stopwords):\n",
+ "    \"\"\"Clean the 'content' column of a DataFrame in place.\n",
+ "\n",
+ "    Each entry is lower-cased, tokenized with nltk's word_tokenize,\n",
+ "    filtered against `stopwords`, optionally Porter-stemmed, and then\n",
+ "    filtered against a fixed list of punctuation tokens. The surviving\n",
+ "    tokens are written back as one string in which every token is\n",
+ "    followed by a single space.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    data : DataFrame with a 'content' column; any index is accepted.\n",
+ "    stemming : stem the remaining tokens when this equals True.\n",
+ "    stopwords : collection of lower-case words to discard.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    The same DataFrame object, with 'content' overwritten.\n",
+ "    \"\"\"\n",
+ "    # Sets give O(1) membership tests; both are built once, not per row.\n",
+ "    stop_set = set(stopwords)\n",
+ "    # NOTE(review): 'DBSCAN' can never equal a lower-cased token; it is kept\n",
+ "    # from the original list -- confirm what it was meant to be.\n",
+ "    punctuation = {',','’', '\\'','.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}'}\n",
+ "\n",
+ "    # `== True` rather than plain truthiness is deliberate: a caller that\n",
+ "    # passes the stopword list positionally into this slot must not turn\n",
+ "    # stemming on by accident.\n",
+ "    stemmer = PorterStemmer() if stemming == True else None\n",
+ "\n",
+ "    def clean(text):\n",
+ "        # Clean one cell; a missing (NaN) cell becomes the empty string.\n",
+ "        if not isinstance(text, str):\n",
+ "            text = '' if pd.isna(text) else str(text)\n",
+ "\n",
+ "        # lower case + tokenization + stopword removal\n",
+ "        tokens = [tok for tok in word_tokenize(text.lower()) if tok not in stop_set]\n",
+ "\n",
+ "        # stemming (the original referenced an undefined name here)\n",
+ "        if stemmer is not None:\n",
+ "            tokens = [stemmer.stem(tok) for tok in tokens]\n",
+ "\n",
+ "        # punctuation removal; keep the trailing space after every token\n",
+ "        return ''.join(tok + ' ' for tok in tokens if tok not in punctuation)\n",
+ "\n",
+ "    # Column-wise apply does not assume the index labels are 0..n-1.\n",
+ "    data['content'] = data['content'].apply(clean)\n",
+ "    return data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read and clean every training file exactly once, then stack them.\n",
+ "# Building from scratch makes this cell safe to re-run: it does not depend\n",
+ "# on, or re-clean, whatever `train` currently holds.\n",
+ "frames = []\n",
+ "for file_name in dirs:\n",
+ "    df = pd.read_csv(path+file_name,sep='\\t',header = None,names = ['id','speaker','content','label'])\n",
+ "    # Pass the stopword list by keyword; positionally it lands in `stemming`.\n",
+ "    df = pre_processing(df, stopwords=stopwords)\n",
+ "    frames.append(df)\n",
+ "\n",
+ "# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.\n",
+ "train = pd.concat(frames, ignore_index=True)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " speaker | \n",
+ " content | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " COOPER | \n",
+ " welcome back | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " COOPER | \n",
+ " history around us downtown milwaukee riverside... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " COOPER | \n",
+ " nearly destroyed fire 1966 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " COOPER | \n",
+ " lovingly restored 80s heart city cultural life... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " COOPER | \n",
+ " magnificent theater | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " COOPER | \n",
+ " mid history-making republican presidential cam... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 7 | \n",
+ " COOPER | \n",
+ " joining us ohio governor john kasich | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 9 | \n",
+ " KASICH | \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10 | \n",
+ " COOPER | \n",
+ " good see | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 11 | \n",
+ " COOPER | \n",
+ " seat | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id speaker content label\n",
+ "0 1 COOPER welcome back 0\n",
+ "1 2 COOPER history around us downtown milwaukee riverside... 0\n",
+ "2 3 COOPER nearly destroyed fire 1966 0\n",
+ "3 4 COOPER lovingly restored 80s heart city cultural life... 0\n",
+ "4 5 COOPER magnificent theater 0\n",
+ "5 6 COOPER mid history-making republican presidential cam... 0\n",
+ "6 7 COOPER joining us ohio governor john kasich 0\n",
+ "8 9 KASICH 0\n",
+ "9 10 COOPER good see 0\n",
+ "10 11 COOPER seat 0"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep only the rows whose speaker is not 'SYSTEM'; the index is not reset.\n",
+ "train = train[train['speaker'].ne('SYSTEM')]\n",
+ "train.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(36348, 4)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}