% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
\documentclass[
]{article}
\usepackage{amsmath,amssymb}
\usepackage{lmodern}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\hypersetup{
pdftitle={Amazon Reviews Sentiment Analysis/Text Classification Choose Your Own Project A Harvard Capstone Project},
pdfauthor={Manoj Bijoor},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
\usepackage[margin=1in]{geometry}
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\newenvironment{Shaded}{}{}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{#1}}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.49,0.56,0.16}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.53,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.73,0.13,0.13}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.74,0.48,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.73,0.40,0.53}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.10,0.09,0.49}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
% Make links footnotes instead of hotlinks:
\DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{5}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{bookmark}
\usepackage[]{hyperref}
\hypersetup{
  backref,
  pdftitle={"Amazon Review Polarity Harvard Capstone"},
  bookmarks=true,
  bookmarksnumbered=true,
  bookmarksopen=true,
  bookmarksopenlevel=3,
  pdfpagemode=FullScreen,
  pdfstartpage=1,
  hyperindex=true,
  pageanchor=true,
  colorlinks=true,
  linkcolor=blue,
  filecolor=magenta,
  urlcolor=cyan
}
\usepackage{amsmath}
\usepackage{pdflscape}
\usepackage[titles]{tocloft}
\usepackage{tocloft}
\usepackage{titlesec}
\usepackage{longtable}
\usepackage{xpatch}
\usepackage[T1]{fontenc}
\usepackage{imakeidx}
\makeindex[columns=3, title=Alphabetical Index, intoc]
\usepackage{subfig}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\title{Amazon Reviews\\
Sentiment Analysis/Text Classification\\
Choose Your Own Project\\
A Harvard Capstone Project}
\author{Manoj Bijoor}
\date{July 28, 2021}
\begin{document}
\maketitle
\bookmark[dest=TitlePage]{Title Page}
\pagenumbering{roman}
\newpage
\newpage
\begin{center}
\hypertarget{Abstract}{}
\large{Abstract}
\bookmark[dest=Abstract]{Abstract}
\end{center}
\bigskip
Deriving truth and insight from a pile of data is a powerful but
error-prone job.
This project offers an empirical exploration on the use of Neural
networks for text classification using the Amazon Reviews Polarity
dataset.
Text classification algorithms are at the heart of a variety of software
systems that process text data at scale.
One common type of text classification is sentiment analysis, whose goal
is to identify the polarity of text content: the type of opinion it
expresses. This can take the form of a binary like/dislike rating, or a
more granular set of options, such as a star rating from 1 to 5.
Examples of sentiment analysis include analyzing Twitter posts to
determine if people liked the Black Panther movie, or extrapolating the
general public's opinion of a new brand of Nike shoes from Walmart
reviews.
Algorithms such as regularized linear models, support vector machines,
and naive Bayes models are used to predict outcomes from predictors
including text data. These algorithms learn a shallow (single) mapping
from inputs to outputs. In contrast, deep learning models approach the
same tasks and have the same goals, but the algorithms involved are
different. Deep learning models are ``deep'' in the sense that they use
multiple layers to learn how to map from input features to output
outcomes.
Deep learning models can be effective for text prediction problems
because they use these multiple layers to capture complex relationships
in language.
The layers in a deep learning model are connected in a network and these
models are called Neural Networks.
Neural language models (or continuous space language models) use
continuous representations or embeddings of words to make their
predictions. These models make use of Neural networks.
Continuous space embeddings help to alleviate the curse of
dimensionality in language modeling: as language models are trained on
larger and larger texts, the number of unique words (the vocabulary)
increases. The number of possible sequences of words increases
exponentially with the size of the vocabulary, causing a data sparsity
problem because of the exponentially many sequences. Thus, statistics
are needed to properly estimate probabilities. Neural networks avoid
this problem by representing words in a distributed way, as non-linear
combinations of weights in a neural net.
Instead of using neural net language models to produce actual
probabilities, it is common to use the distributed
representation encoded in the networks' ``hidden'' layers as
representations of words; each word is then mapped onto an n-dimensional
real vector called the word embedding, where n is the size of the layer
just before the output layer. An alternate description is that a neural
net approximates the language function and models semantic relations
between words as linear combinations, capturing a form of
compositionality.
In this project we will cover four network architectures, namely DNN,
CNN, sepCNN and BERT. We will also first implement a Baseline linear
classifier model which serves the purpose of comparison with the deep
learning techniques.
For metrics we will use the default performance metrics for binary
classification, which are Accuracy, Loss, and ROC AUC (area under the
receiver operating characteristic curve).
\newpage
\clearpage
\phantomsection
\setcounter{secnumdepth}{5}
\setcounter{tocdepth}{5}
\cleardoublepage \hypertarget{toc}{}
\bookmark[dest=toc,level=chapter]{\contentsname} \tableofcontents
\clearpage
\newpage
\clearpage
\phantomsection
\hypertarget{list-of-tables}{%
\section*{List of tables}\label{list-of-tables}}
\addcontentsline{toc}{section}{List of tables}
\renewcommand{\listtablename}{}
\listoftables
\clearpage
\newpage
\clearpage
\phantomsection
\hypertarget{list-of-figures}{%
\section*{List of figures}\label{list-of-figures}}
\addcontentsline{toc}{section}{List of figures}
\renewcommand{\listfigurename}{}
\listoffigures
\clearpage
\newpage
\clearpage
\phantomsection
\newcommand{\listequationsname}{List of Equations}
\newlistof{equations}{equ}{List of Equations}
\newcommand{\equations}[1]{%
\refstepcounter{equations}
\addcontentsline{equ}{equations}{ \protect\numberline{\theequations}#1}\par}
\xpretocmd{\listofequations}{\addcontentsline{toc}{section}{List of Equations}}{}{}
\renewcommand{\listequationsname}{}
\listofequations
\clearpage
\newpage
\pagenumbering{arabic}
\newpage
\hypertarget{project-overview-amazon-reviews-polarity}{%
\section{Project Overview: Amazon Reviews
Polarity}\label{project-overview-amazon-reviews-polarity}}
\hypertarget{introduction}{%
\subsection{Introduction}\label{introduction}}
Deriving truth and insight from a pile of data is a powerful but
error-prone job.
Text classification algorithms are at the heart of a variety of software
systems that process text data at scale.
One common type of text classification is sentiment analysis, whose goal
is to identify the polarity of text content: the type of opinion it
expresses. This can take the form of a binary like/dislike rating, or a
more granular set of options, such as a star rating from 1 to 5.
Examples of sentiment analysis include analyzing Twitter posts to
determine if people liked the Black Panther movie, or extrapolating the
general public's opinion of a new brand of Nike shoes from Walmart
reviews.
Algorithms such as regularized linear models, support vector machines,
and naive Bayes models are used to predict outcomes from predictors
including text data. These algorithms learn a shallow (single) mapping
from inputs to outputs. In contrast, deep learning models approach the
same tasks and have the same goals, but the algorithms involved are
different. Deep learning models are ``deep'' in the sense that they use
multiple layers to learn how to map from input features to output
outcomes.
Deep learning models can be effective for text prediction problems
because they use these multiple layers to capture complex relationships
in language.
The layers in a deep learning model are connected in a network and these
models are called neural networks.
\hypertarget{neural-networks}{%
\subsubsection{Neural networks}\label{neural-networks}}
Neural language models (or continuous space language models) use
continuous representations or
\href{https://en.wikipedia.org/wiki/Word_embedding}{embeddings of words}
to make their predictions (see
\href{https://karpathy.github.io/2015/05/21/rnn-effectiveness/}{Karpathy,
Andrej. ``The Unreasonable Effectiveness of Recurrent Neural Networks''}).
These models make use of
\href{https://en.wikipedia.org/wiki/Artificial_neural_network}{neural
networks}.
Continuous space embeddings help to alleviate the
\href{https://en.wikipedia.org/wiki/Curse_of_dimensionality}{curse of
dimensionality} in language modeling: as language models are trained on
larger and larger texts, the number of unique words (the vocabulary)
increases (see \href{https://en.wikipedia.org/wiki/Heaps\%27_law}{Heaps'
law}). The number of possible sequences of words increases exponentially
with the size of the vocabulary, causing a data sparsity problem because
of the exponentially many sequences. Thus, statistics are needed to
properly estimate probabilities. Neural networks avoid this problem by
representing words in a distributed way, as non-linear combinations of
weights in a neural net
(\href{https://ui.adsabs.harvard.edu/abs/2008SchpJ...3.3881B/abstract}{Bengio,
Yoshua (2008). ``Neural net language models''. Scholarpedia. 3. p.~3881.
Bibcode:2008SchpJ\ldots3.3881B. doi:10.4249/scholarpedia.3881}). An
alternate description is that a neural net approximates the language
function.
Instead of using neural net language models to produce actual
probabilities, it is common to use the distributed
representation encoded in the networks' ``hidden'' layers as
representations of words;\\
A hidden layer is a synthetic layer in a neural network between the
input layer (that is, the features) and the output layer (the
prediction). Hidden layers typically contain an activation function such
as
\href{https://developers.google.com/machine-learning/glossary?utm_source=DevSite\&utm_campaign=Text-Class-Guide\&utm_medium=referral\&utm_content=glossary\&utm_term=sepCNN\#rectified-linear-unit-relu}{ReLU}
for training. A deep neural network contains more than one hidden layer.
Each word is then mapped onto an n-dimensional real vector called the
word embedding, where n is the size of the layer just before the output
layer. The representations in skip-gram models for example have the
distinct characteristic that they model semantic relations between words
as \href{https://en.wikipedia.org/wiki/Linear_combination}{linear
combinations}, capturing a form of
\href{https://en.wikipedia.org/wiki/Principle_of_compositionality}{compositionality}.
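To make the idea of linear structure in embeddings concrete, here is a
toy sketch in R. The three-dimensional vectors below are made-up
illustrative numbers, not learned embeddings; the point is only that
analogies can be probed with vector arithmetic and cosine similarity.

\begin{verbatim}
# Toy illustration only: hand-picked 3-dimensional "embeddings"
embeddings <- rbind(
  king  = c(0.80, 0.65, 0.10),
  man   = c(0.70, 0.20, 0.05),
  woman = c(0.15, 0.25, 0.80),
  queen = c(0.25, 0.70, 0.85)
)

cosine_sim <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))

# If semantic relations are (roughly) linear, king - man + woman lands near queen
target <- embeddings["king", ] - embeddings["man", ] + embeddings["woman", ]
sort(apply(embeddings, 1, cosine_sim, b = target), decreasing = TRUE)
\end{verbatim}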
In this project we will cover four network architectures, namely:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  DNN - Dense Neural Network - a bridge between the ``shallow'' learning
  approaches and the other three (CNN, sepCNN, and BERT).
\item
  CNN - Convolutional Neural Network - an advanced architecture
  appropriate for text data because it can capture specific local
  patterns.
\item
sepCNN - Depthwise Separable Convolutional Neural Network.
\item
BERT - Bidirectional Encoder Representations from Transformers.
\end{enumerate}
We will also first implement a baseline linear classifier model, which
serves as a point of comparison with the deep learning techniques we
implement later on, and also as a succinct summary of a basic
supervised machine learning analysis for text.
This linear baseline is a regularized linear model trained on the same
dataset, using tf-idf weights and 5000 tokens.
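As a rough sketch of what such a baseline can look like with tidymodels
and textrecipes (the recipe steps, penalty value, and object names here
are illustrative assumptions, not necessarily the project's final code):

\begin{verbatim}
library(tidymodels)
library(textrecipes)

# Preprocessing: tokenize, keep the 5000 most frequent tokens, weight by tf-idf
# (assumes label has already been converted to a factor for classification)
baseline_rec <- recipe(label ~ text, data = amazon_train) %>%
  step_tokenize(text) %>%
  step_tokenfilter(text, max_tokens = 5000) %>%
  step_tfidf(text)

# Regularized (lasso) logistic regression as the linear classifier
baseline_spec <- logistic_reg(penalty = 0.01, mixture = 1) %>%
  set_engine("glmnet")

baseline_wf <- workflow() %>%
  add_recipe(baseline_rec) %>%
  add_model(baseline_spec)

baseline_fit <- fit(baseline_wf, data = amazon_train)
\end{verbatim}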
For metrics we will use the default performance metrics for binary
classification, which are Accuracy, Loss, and ROC AUC (area under the
receiver operating characteristic curve).
We will also use the confusion matrix to get an overview of our model
performance, as it includes rich information.
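With yardstick (part of tidymodels) these metrics can be computed from a
tibble of predictions; the column names below (truth, .pred\_class,
.pred\_1) are illustrative assumptions about how the predictions are
stored:

\begin{verbatim}
library(yardstick)

# predictions: tibble with truth (factor), .pred_class (factor), .pred_1 (probability)
predictions %>% accuracy(truth = truth, estimate = .pred_class)
predictions %>% roc_auc(truth = truth, .pred_1)
predictions %>% conf_mat(truth = truth, estimate = .pred_class)
\end{verbatim}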
We will use tidymodels packages along with TensorFlow and keras, the R
interface to Keras (see
\href{https://CRAN.R-project.org/package=keras}{Allaire, JJ, and
François Chollet. 2021. keras: R Interface to 'Keras'}), for
preprocessing, modeling, and evaluation. Other key references are
\href{https://www.tidytextmining.com/}{Silge, Julia, and David Robinson.
2017. Text Mining with R: A Tidy Approach. 1st ed.~O'Reilly Media,
Inc.}, \href{https://smltar.com/}{Supervised Machine Learning for Text
Analysis in R, by Emil Hvitfeldt and Julia Silge},
\href{https://www.tmwr.org/}{Tidy Modeling with R, Max Kuhn and Julia
Silge, Version 0.0.1.9010, 2021-07-19}, and how can we forget
\href{https://rafalab.github.io/dsbook/}{Introduction to Data Science,
Data Analysis and Prediction Algorithms with R - Rafael A. Irizarry,
2021-07-03}.
The keras R package provides an interface for R users to Keras, a
high-level API for building neural networks.
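As a flavour of what this looks like, here is a minimal sketch of a
small dense network in the keras R interface; the layer sizes and input
dimension are placeholder assumptions, not this project's final
architecture:

\begin{verbatim}
library(keras)

model <- keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(5000)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")   # binary polarity output

model %>% compile(
  optimizer = "adam",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)
\end{verbatim}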
This project will use some key machine learning best practices for
solving text classification problems.\\
Here's what you'll learn:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
The high-level, end-to-end workflow for solving text classification
problems using machine learning
\item
How to choose the right model for your text classification problem
\item
How to implement your model of choice using TensorFlow with Keras
acting as an interface for the TensorFlow library
\end{enumerate}
I have used/mentioned several references throughout the project.
This project depends on Python and R software for TensorFlow and Keras,
which need to be installed both inside and outside of R. As each
individual's environment may be different, I cannot automate this part
in my code.
R side:\\
\url{https://cran.r-project.org/}\strut \\
\url{https://tensorflow.rstudio.com/installation/}\strut \\
\url{https://tensorflow.rstudio.com/installation/gpu/local_gpu/}
Python side:\\
\url{https://www.tensorflow.org/install}\strut \\
\url{https://www.anaconda.com/products/individual}\strut \\
\url{https://keras.io/}
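For reference, a typical R-side setup (assuming a working Python or
Anaconda installation) looks roughly like the sketch below; your
environment may need different options, so please treat this as a
starting point rather than a recipe:

\begin{verbatim}
# R packages used in this project
install.packages(c("keras", "tensorflow", "tidymodels", "textrecipes", "tidytext"))

library(keras)
# Installs the Python side (TensorFlow and Keras) into a dedicated environment;
# see the installation URLs above for GPU-specific instructions
install_keras()
\end{verbatim}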
Instead of cluttering the code with comments, I ask you to please use
these references and the RStudio help (?cmd/??cmd) if you are not very
familiar with any specific command. Most commands are pretty
self-explanatory if you are even a little familiar with R.
Here are some more references:
\hypertarget{references}{%
\subsection{References}\label{references}}
\href{https://www.tensorflow.org/}{Tensorflow} is an end-to-end open
source platform for machine learning. It has a comprehensive, flexible
ecosystem of tools, libraries and community resources that lets
researchers push the state-of-the-art in ML and developers easily build
and deploy ML powered applications.
The \href{https://tfhub.dev/}{TensorFlow Hub} lets you search and
discover hundreds of trained, ready-to-deploy machine learning models in
one place.
\href{https://tensorflow.rstudio.com/}{Tensorflow for R} provides an R
interface for Tensorflow.
\href{https://www.tmwr.org/}{Tidy Modeling with R}
\href{https://yihui.org/tinytex/}{TinyTeX}\\
I have used TinyTeX in code chunks.
\href{https://www.overleaf.com/learn/latex}{LaTeX}\\
I have used LaTeX beyond the very basics provided by the default
templates in RStudio; the details are too numerous to explain here.
Though that much is not strictly needed, I used it to learn and to
produce better PDF documents.
\href{https://bookdown.org/yihui/rmarkdown}{Rmarkdown}
\newpage
\hypertarget{text-classification-workflow}{%
\subsection{Text Classification
Workflow}\label{text-classification-workflow}}
Here's a high-level overview of the workflow used to solve machine
learning problems:
Step 1: Gather Data\\
Step 2: Explore Your Data\\
Step 2.5: Choose a Model*\\
Step 3: Prepare Your Data\\
Step 4: Build, Train, and Evaluate Your Model\\
Step 5: Tune Hyperparameters\\
Step 6: Deploy Your Model
The following sections explain each step in detail, and how to implement
them for text data.
\hypertarget{gather-data}{%
\subsubsection{Gather Data}\label{gather-data}}
Gathering data is the most important step in solving any supervised
machine learning problem. Your text classifier can only be as good as
the dataset it is built from.
Here are some important things to remember when collecting data:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  If you are using a public API, understand the limitations of the API
  before using it. For example, some APIs set a limit on the rate at
  which you can make queries.
\item
The more training examples/samples you have, the better. This will
help your model generalize better.
\item
  Make sure the number of samples for every class or topic is not overly
  imbalanced. That is, you should have a comparable number of samples in
  each class.
\item
Make sure that your samples adequately cover the space of possible
inputs, not only the common cases.
\end{enumerate}
This dataset contains Amazon reviews posted by people on the Amazon
website, and is a classic example of a sentiment analysis problem.
Amazon Review Polarity Dataset - Version 3, Updated 09/09/2015
ORIGIN
The Amazon reviews dataset consists of reviews from Amazon. The data
span a period of 18 years, including \textasciitilde35 million reviews
up to March 2013. Reviews include product and user information, ratings,
and a plaintext review. For more information, please refer to the
following paper:
\href{https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf}{J.
McAuley and J. Leskovec. Hidden factors and hidden topics: Understanding
rating dimensions with review text. In Proceedings of the 7th ACM
Conference on Recommender Systems, RecSys '13, pages 165--172, New York,
NY, USA, 2013. ACM}.
The Amazon reviews polarity dataset was constructed by Xiang Zhang
(\href{mailto:[email protected]}{\nolinkurl{[email protected]}})
from the above dataset. It is used as a text classification benchmark in
the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun.
\href{https://arxiv.org/abs/1509.01626}{Character-level Convolutional
Networks for Text Classification}. Advances in Neural Information
Processing Systems 28 (NIPS 2015).
Here is an Abstract of that paper:
This article offers an empirical exploration on the use of
character-level convolutional networks (ConvNets) for text
classification. We constructed several large-scale datasets to show that
character-level convolutional networks could achieve state-of-the-art or
competitive results. Comparisons are offered against traditional models
such as bag of words, n-grams and their TFIDF variants, and deep
learning models such as word-based ConvNets and recurrent neural
networks.
Coming back to our project: as Google has changed its API, I had to
download the dataset manually. Please select the file named
``amazon\_review\_polarity\_csv.tar.gz'' from the Google Drive location
below and download it to the project directory.
Download Location URL :
\href{https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M?resourcekey=0-TLwzfR2O-D2aPitmn5o9VQ}{Xiang
Zhang Google Drive}
DESCRIPTION
The Amazon reviews polarity dataset is constructed by taking review
scores 1 and 2 as negative, and 4 and 5 as positive. Samples with score
3 are ignored. In the dataset, class 1 is the negative class and class 2
is the positive class. Each class has 1,800,000 training samples and
200,000 testing samples.
The files train.csv and test.csv contain the training and testing
samples, respectively, as comma-separated values. There are 3 columns in
them, corresponding to label/class index (1 or 2), review title, and
review text. The review title and text are escaped using double quotes
(\texttt{"}), and any internal double quote is escaped by two double
quotes (\texttt{""}). New lines are escaped by a backslash followed by
an ``n'' character, that is, ``\textbackslash n''.
\newpage
\hypertarget{explore-your-data}{%
\subsubsection{Explore Your Data}\label{explore-your-data}}
Building and training a model is only one part of the workflow.
Understanding the characteristics of your data beforehand will enable
you to build a better model. This could simply mean obtaining a higher
accuracy. It could also mean requiring less data for training, or fewer
computational resources.
\hypertarget{load-the-dataset}{%
\paragraph{Load the Dataset}\label{load-the-dataset}}
First up, let's load the dataset into R.
In the dataset, class 1 is the negative review and class 2 is the
positive review. We will change these to 0 and 1.
columns = (0, 1, 2) \# 0 - label/class index, 1 - title/subject, 2 -
text body/review.
In this project we will NOT be using the ``title'' data. We will use
only ``label'' and ``text''. Also note that I have more comments in the
code file/s than in the pdf document.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{untar}\NormalTok{(}\StringTok{"amazon\_review\_polarity\_csv.tar.gz"}\NormalTok{, }\AttributeTok{list =} \ConstantTok{TRUE}\NormalTok{) }\DocumentationTok{\#\# check contents}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\StringTok{"amazon\_review\_polarity\_csv/"}
\NormalTok{[}\DecValTok{2}\NormalTok{] }\StringTok{"amazon\_review\_polarity\_csv/test.csv"}
\NormalTok{[}\DecValTok{3}\NormalTok{] }\StringTok{"amazon\_review\_polarity\_csv/train.csv"}
\NormalTok{[}\DecValTok{4}\NormalTok{] }\StringTok{"amazon\_review\_polarity\_csv/readme.txt"}
\FunctionTok{untar}\NormalTok{(}\StringTok{"amazon\_review\_polarity\_csv.tar.gz"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
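One way to read the extracted CSV files and recode the classes from 1/2
to 0/1 is sketched below. The column names are assigned by us because
the files have no header row; the project's actual preprocessing also
cleans the review text, so exact row counts may differ.

\begin{verbatim}
library(tidyverse)

csv_cols <- c("label", "title", "text")

amazon_train <- read_csv("amazon_review_polarity_csv/train.csv",
                         col_names = csv_cols) %>%
  mutate(label = label - 1)   # class 1 (negative) -> 0, class 2 (positive) -> 1

amazon_test <- read_csv("amazon_review_polarity_csv/test.csv",
                        col_names = csv_cols) %>%
  mutate(label = label - 1)
\end{verbatim}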
\newpage
\hypertarget{check-the-data}{%
\paragraph{Check the Data}\label{check-the-data}}
After loading the data, it's good practice to run some checks on it:
pick a few samples and manually check if they are consistent with your
expectations. For example see Table \ref{tbl:amazon_train}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{glimpse}\NormalTok{(amazon\_train)}
\NormalTok{Rows}\SpecialCharTok{:} \DecValTok{2}\NormalTok{,}\DecValTok{879}\NormalTok{,}\DecValTok{960}
\NormalTok{Columns}\SpecialCharTok{:} \DecValTok{3}
\SpecialCharTok{$}\NormalTok{ label }\SpecialCharTok{\textless{}}\NormalTok{dbl}\SpecialCharTok{\textgreater{}} \DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\SpecialCharTok{\textasciitilde{}}
\ErrorTok{$}\NormalTok{ title }\SpecialCharTok{\textless{}}\NormalTok{chr}\SpecialCharTok{\textgreater{}} \StringTok{"For Older Mac Operating Systems Only"}\NormalTok{, }\StringTok{"greener today than yest\textasciitilde{}}
\StringTok{$ text \textless{}chr\textgreater{} "}\NormalTok{Does not work on Mac OSX Per Scholastic tech support If you hav}\SpecialCharTok{\textasciitilde{}}
\end{Highlighting}
\end{Shaded}
\begin{table}[H]
\caption{\label{tab:chk_data_2}Amazon Train data\label{tbl:amazon_train}}
\centering
\fontsize{6}{8}\selectfont
\begin{tabular}[t]{rll}
\toprule
label & title & text\\
\midrule
0 & For Older Mac Operating Systems Only & Does not work on Mac OSX Per Scholastic tech support If you have a newer Mac machine with one of the later OS versions ( or later) the game will not work as it was meant to run on an older system\\
1 & greener today than yesterday & I was a skeptic however I nowhave three newspapers the bible six books and two magazines on my kindle I ve wowed my church reading group with skipping back and forth between our book we are reading and the bible I ve eliminated several paper subscriptions and actually am reading more because i can easily carry it all with me The page is easy on the eyes I really like it\\
0 & Overrated very overrated & This film is a disappointment Based on the reviews I read here at Amazon and in other media I expected an exciting and visually engrossing experience This is a nineteen nineties touchy feely melodrama and furthermore it s poorly done Can I get my money back\\
1 & well really & dmst are my favourite band and have been for a while now just mind blowing i wanted to make one correction to the previous review though the do makes are from toronto not quebec\\
1 & dynomax super turbo exhaust & it fit good took only about mins to put on little quiter than i wanted ( but for the price can t beat it ) it is getting a littler louder everyday i drive it starting to burn some of the glass packing out\\
1 & East LA Marine the Guy Gabaldon Story & This movie puts history in perspective for those who know the story of Guy Gabaldon I think that he is an unsung hero that should have been awarded for the lives that he saved on both sides I think that you will be glad that you watched this movie Not the Hollywood version but through this you get to meet the actual hero not Hollywood s version of who they thought he should be\\
0 & World of Bad Support & Before getting this game be sure to check out the Support forums WoW (World of warcraft) is suffering from things like Peoples accounts expiring with no way to get a CC or game card updated High graphics glitches make game unplayableHigh rate of computer lock ups freezing and Blue screening Blizzards support staff ask the users to update drivers There latest patch has caused a large amount of players to be unable to play at all So make sure that your computer won t have these issues Even though systems with gig of ram and the best video cards have issues maybe yours won t I recommended waiting for Blizzard to finish the stress test they call GOLD Instead get any other MMORPG none are having the issues this one has If you do buy it and can t play please note that for the last days Blizzards support line has been ringing fast busy hehe\\
0 & disapointing & Only two songs are great Desire All I want is you There are some good live performaces but Helter Skelter and All along the watch tower covers were very bad moves If you re a die hard fan buy this for the two songs I mentioned because they re classics but otherwise this is hardly essential\\
1 & SAITEK X FLIGHT CONTROL SYSTEM BLOWS YOU AWAY & When I purchased my Flight Simulator Deluxe Edition I chose to purchase these controls as well I wanted as real a feel as I could get with my gaming system Well at the time they were the best on the shelf that I could find Nothing else came close to these The first few reviewers have explained the controls already They are right on the money You will want to purchase these along with your game\\
1 & the secret of science & The best kept secret of science is how strongly it points towards a creator and dovetails with Christianity In this marvelously lucid book the eminent physical chemist Henry Schaefer unfolds the secret\\
\bottomrule
\end{tabular}
\end{table}
Labels: Negative reviews = 0, Positive reviews = 1
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{unique}\NormalTok{(amazon\_train}\SpecialCharTok{$}\NormalTok{label)}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{0} \DecValTok{1}
\end{Highlighting}
\end{Shaded}
\newpage
\hypertarget{collect-key-metrics}{%
\paragraph{Collect Key Metrics}\label{collect-key-metrics}}
Once you've verified the data, collect the following important metrics
that can help characterize your text classification problem:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Number of samples: Total number of examples you have in the data.
\item
  Number of classes: Total number of topics or categories in the data.
\item
  Number of samples per class: Number of samples per class
  (topic/category). In a balanced dataset, all classes will have a
  similar number of samples; in an imbalanced dataset, the number of
  samples in each class will vary widely.
\item
  Number of words per sample: Median number of words in one sample.
\item
  Frequency distribution of words: Distribution showing the frequency
  (number of occurrences) of each word in the dataset.
\item
  Distribution of sample length: Distribution showing the number of
  words per sample in the dataset.
\end{enumerate}
Number of samples
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(num\_samples }\OtherTok{\textless{}{-}} \FunctionTok{nrow}\NormalTok{(amazon\_train))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{2879960}
\end{Highlighting}
\end{Shaded}
Number of classes
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(num\_classes }\OtherTok{\textless{}{-}} \FunctionTok{length}\NormalTok{(}\FunctionTok{unique}\NormalTok{(amazon\_train}\SpecialCharTok{$}\NormalTok{label)))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{2}
\end{Highlighting}
\end{Shaded}
Number of samples per class
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Pretty Balanced classes}
\NormalTok{(num\_samples\_per\_class }\OtherTok{\textless{}{-}}\NormalTok{ amazon\_train }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{count}\NormalTok{(label))}
\end{Highlighting}
\end{Shaded}
\begin{tabular}{r|r}
\hline
label & n\\
\hline
0 & 1439405\\
\hline
1 & 1440555\\
\hline
\end{tabular}
Number of words per sample
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{amazon\_train\_text\_wordCount }\OtherTok{\textless{}{-}} \FunctionTok{sapply}\NormalTok{(temp, length)}
\NormalTok{(mean\_num\_words\_per\_sample }\OtherTok{\textless{}{-}} \FunctionTok{mean}\NormalTok{(amazon\_train\_text\_wordCount))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\FloatTok{75.89602}
\NormalTok{(median\_num\_words\_per\_sample }\OtherTok{\textless{}{-}} \FunctionTok{median}\NormalTok{(amazon\_train\_text\_wordCount))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{67}
\end{Highlighting}
\end{Shaded}
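In the snippet above, temp is assumed to hold the tokenized review
texts. A minimal way to construct it (an assumption on my part; the
project code may tokenize differently) is:

\begin{verbatim}
# Split each review into words on whitespace; temp is a list of character vectors
temp <- strsplit(amazon_train$text, "\\s+")
amazon_train_text_wordCount <- sapply(temp, length)
\end{verbatim}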
\newpage
\hypertarget{tokenization}{%
\paragraph{Tokenization}\label{tokenization}}
To build features for supervised machine learning from natural language,
we need some way of representing raw text as numbers so we can perform
computation on them. Typically, one of the first steps in this
transformation from natural language to features, or any kind of text
analysis, is tokenization. Knowing what tokenization and tokens are,
along with the related concept of an n-gram, is important for almost any
natural language processing task.

Tokenization in NLP/text classification is essentially splitting a
phrase, sentence, paragraph, or an entire text document into smaller
units, such as individual words or terms. Each of these smaller units is
called a token.
For the frequency distribution of words (n-grams) and for the top 25
words, see Table \ref{tbl:train_words} and Figure \ref{fig:model_1}.
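The word counts in Table \ref{tbl:train_words} can be produced with
tidytext-style tokenization; a minimal sketch (object and column names
are illustrative) is:

\begin{verbatim}
library(dplyr)
library(tidytext)

train_words <- amazon_train %>%
  unnest_tokens(word, text) %>%    # one row per word token
  count(word, sort = TRUE) %>%
  mutate(total = sum(n),
         rank = row_number(),
         `term frequency` = n / total)
\end{verbatim}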
\begin{table}
\caption{\label{tab:freq_dist_ngrams}Frequency distribution of words\label{tbl:train_words}}
\centering
\begin{tabular}[t]{lrrrr}
\toprule
word & n & total & rank & term frequency\\
\midrule
the & 11106073 & 218378699 & 1 & 0.0508569\\
i & 6544247 & 218378699 & 2 & 0.0299674\\
and & 6018678 & 218378699 & 3 & 0.0275607\\
a & 5452771 & 218378699 & 4 & 0.0249693\\
to & 5398243 & 218378699 & 5 & 0.0247196\\
it & 5028997 & 218378699 & 6 & 0.0230288\\
of & 4325341 & 218378699 & 7 & 0.0198066\\
this & 4083354 & 218378699 & 8 & 0.0186985\\
is & 3850538 & 218378699 & 9 & 0.0176324\\
in & 2594242 & 218378699 & 10 & 0.0118796\\
\bottomrule
\end{tabular}
\end{table}
\begin{figure}
\centering
\includegraphics{figures/plot_freq_dist_ngrams-1.pdf}
\caption{Frequency distribution of words (n-grams) for the top 25
words\label{fig:model_1}}
\end{figure}
\newpage
\hypertarget{stopwords}{%
\paragraph{Stopwords}\label{stopwords}}
Once we have split text into tokens, it often becomes clear that not all
words carry the same amount of information, if any information at all,
for a predictive modeling task. Common words that carry little (or
perhaps no) meaningful information are called stop words. It is common
advice and practice to remove stop words for various NLP tasks.
The concept of stop words has a long history with Hans Peter Luhn
credited with coining the term in 1960.
\href{https://doi.org/10.1002/asi.5090110403}{Luhn, H. P. 1960. ``Key
Word-in-Context Index for Technical Literature (kwic Index).'' American
Documentation 11 (4): 288--295. doi:10.1002/asi.5090110403}. Examples of
these words in English are ``a,'' ``the,'' ``of,'' and ``didn't.'' These
words are very common and typically don't add much to the meaning of a
text but instead ensure the structure of a sentence is sound.
Historically, one of the main reasons for removing stop words was to
decrease the computational time for text mining; it can be regarded as a
dimensionality reduction of text data and was commonly used in search
engines to give better results
\href{https://doi.org/10.1145/1835449.1835499}{Huston, Samuel, and W.
Bruce Croft. 2010. ``Evaluating Verbose Query Processing Techniques.''
In Proceedings of the 33rd International ACM SIGIR Conference on
Research and Development in Information Retrieval, 291--298. SIGIR '10.
New York, NY, USA: ACM. doi:10.1145/1835449.1835499}.
For the frequency distribution of words (n-grams) and for the top 25
words excluding stopwords, see Table \ref{tbl:train_words_sw} and Figure
\ref{fig:model_2}.
Using Pre-made stopwords
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{length}\NormalTok{(}\FunctionTok{stopwords}\NormalTok{(}\AttributeTok{source =} \StringTok{"smart"}\NormalTok{))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{571}
\FunctionTok{length}\NormalTok{(}\FunctionTok{stopwords}\NormalTok{(}\AttributeTok{source =} \StringTok{"snowball"}\NormalTok{))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{175}
\FunctionTok{length}\NormalTok{(}\FunctionTok{stopwords}\NormalTok{(}\AttributeTok{source =} \StringTok{"stopwords{-}iso"}\NormalTok{))}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\DecValTok{1298}
\end{Highlighting}
\end{Shaded}
Frequency distribution of words with stopwords removed:
We will use the pre-made ``stopwords-iso'' stopword list along with a
few stopwords unique to our case:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mystopwords }\OtherTok{\textless{}{-}} \FunctionTok{c}\NormalTok{(}\StringTok{"s"}\NormalTok{, }\StringTok{"t"}\NormalTok{, }\StringTok{"m"}\NormalTok{, }\StringTok{"ve"}\NormalTok{, }\StringTok{"re"}\NormalTok{, }\StringTok{"d"}\NormalTok{, }\StringTok{"ll"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
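Removing these stopwords before recounting word frequencies can be
sketched as follows, assuming a one-token-per-row tokenization as in the
previous section:

\begin{verbatim}
library(dplyr)
library(tidytext)
library(stopwords)

all_stopwords <- c(stopwords(source = "stopwords-iso"), mystopwords)

train_words_no_stop <- amazon_train %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% all_stopwords) %>%
  count(word, sort = TRUE)
\end{verbatim}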
\begin{table}
\caption{\label{tab:freq_dist_ngrams_stopwords}Frequency distribution of words excluding stopwords\label{tbl:train_words_sw}}
\centering
\begin{tabular}[t]{lrrrr}
\toprule
word & n & total & rank & term frequency\\
\midrule
book & 1441251 & 71253939 & 1 & 0.0202270\\
read & 513081 & 71253939 & 2 & 0.0072007\\
time & 506943 & 71253939 & 3 & 0.0071146\\
movie & 431317 & 71253939 & 4 & 0.0060532\\
love & 332012 & 71253939 & 5 & 0.0046596\\
product & 306813 & 71253939 & 6 & 0.0043059\\
bought & 292201 & 71253939 & 7 & 0.0041008\\
album & 265835 & 71253939 & 8 & 0.0037308\\
story & 264836 & 71253939 & 9 & 0.0037168\\
music & 235009 & 71253939 & 10 & 0.0032982\\
\bottomrule
\end{tabular}
\end{table}
\begin{figure}
\centering
\includegraphics{figures/plot_freq_dist_ngrams_stopwords-1.pdf}
\caption{Frequency distribution of words (n-grams) for the top 25 words
excluding stopwords\label{fig:model_2}}
\end{figure}
\newpage
Here are Google's recommendations after decades of research:
Algorithm for Data Preparation and Model Building
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Calculate the number of samples/number of words per sample ratio.
\item
If this ratio is less than 1500, tokenize the text as n-grams and use
a simple multi-layer perceptron (MLP) model to classify them (left
branch in the flowchart below):
\end{enumerate}
\begin{enumerate}
\def\labelenumi{\alph{enumi}.}
\tightlist
\item
Split the samples into word n-grams; convert the n-grams into vectors.
\item
Score the importance of the vectors and then select the top 20K using
the scores.
\item
Build an MLP model.
\end{enumerate}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
If the ratio is greater than 1500, tokenize the text as sequences and
use a
\href{https://developers.google.com/machine-learning/glossary?utm_source=DevSite\&utm_campaign=Text-Class-Guide\&utm_medium=referral\&utm_content=glossary\&utm_term=sepCNN\#depthwise-separable-convolutional-neural-network-sepcnn}{sepCNN}
model to classify them (right branch in the flowchart below):
\end{enumerate}
\begin{enumerate}
\def\labelenumi{\alph{enumi}.}
\tightlist
\item
Split the samples into words; select the top 20K words based on their
frequency.
\item
Convert the samples into word sequence vectors.
\item
If the original number of samples/number of words per sample ratio is
less than 15K, using a fine-tuned pre-trained embedding with the
sepCNN model will likely provide the best results.
\end{enumerate}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
Measure the model performance with different hyperparameter values to
find the best model configuration for the dataset.
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 3. If the ratio is greater than 1500, tokenize the text as sequences and use}
\CommentTok{\# a sepCNN model see above}
\NormalTok{(S\_W\_ratio }\OtherTok{\textless{}{-}}\NormalTok{ num\_samples}\SpecialCharTok{/}\NormalTok{median\_num\_words\_per\_sample)}
\NormalTok{[}\DecValTok{1}\NormalTok{] }\FloatTok{42984.48}
\end{Highlighting}
\end{Shaded}
\newpage
\hypertarget{preprocessing-for-deep-learning-continued-with-more-exploration}{%
\subsection{Preprocessing for deep learning continued with more
exploration}\label{preprocessing-for-deep-learning-continued-with-more-exploration}}
For ``Number of words per review text'' see Figure \ref{fig:model_3}
For ``Number of words per review title'' see Figure \ref{fig:model_4}
For ``Number of words per review text by label'' see Figure
\ref{fig:model_5}
For ``Number of words per review title by label'' see Figure
\ref{fig:model_6}
For ``Sample/Subset of our training dataset'' see Table
\ref{tbl:amazon_subset_train}
\begin{figure}
\centering
\includegraphics{figures/preproc_1-1.pdf}
\caption{Number of words per review text\label{fig:model_3}}
\end{figure}