#' ---
#' title: "Amazon Reviews \n Sentiment Analysis/Text Classification \n Choose Your Own Project \n A Harvard Capstone Project"
#' author: "Manoj Bijoor"
#' email: [email protected]
#' date: "`r format(Sys.time(), '%B %d, %Y')`"
#' output:
#' pdf_document:
#' latex_engine: xelatex
#' number_sections: yes
#' keep_tex: yes
#' keep_md: yes
#' df_print: kable
#' highlight: pygments
#' extra_dependencies: "subfig"
#' md_document:
#' variant: markdown_github
#' # check https://bookdown.org/yihui/rmarkdown/markdown-document.html#markdown-variants
#' github_document:
#' toc: true
#' toc_depth: 5
#' pandoc_args: --webtex
#' # pandoc_args: ['--lua-filter', 'math-github.lua']
#' html_document:
#' keep_md: true
#' code_folding: hide
#' # urlcolor: blue
#' # linkcolor: blue
#' #citecolor: blue
#' #geometry: margin=1in
#' always_allow_html: true
#' links-as-notes: true
#' header-includes:
#' \usepackage[utf8]{inputenc}
#' \usepackage[english]{babel}
#' \usepackage{bookmark}
#' \usepackage[]{hyperref}
#' \hypersetup{
#' backref,
#' pdftitle={"Amazon Review Polarity Harvard Capstone"},
#' bookmarks=true,
#' bookmarksnumbered=true,
#' bookmarksopen=true,
#' bookmarksopenlevel=3,
#' pdfpagemode=FullScreen,
#' pdfstartpage=1,
#' hyperindex=true,
#' pageanchor=true,
#' colorlinks=true,
#' linkcolor=blue,
#' filecolor=magenta,
#' urlcolor=cyan
#' }
#' \usepackage{amsmath}
#' \usepackage{pdflscape}
#' \usepackage[titles]{tocloft}
#' \usepackage{tocloft}
#' \usepackage{titlesec}
#' \usepackage{longtable}
#' \usepackage{xpatch}
#' \usepackage[T1]{fontenc}
#' \usepackage{imakeidx}
#' \makeindex[columns=3, title=Alphabetical Index, intoc]
#'
#' # \usepackage{amssymb}
#' # \usepackage{mathtools}
#' # \usepackage{unicode-math}
#' # \usepackage{fontspec}
#' # \usepackage{letltxmacro}%
#' # \usepackage{float}
#' # \usepackage{flafter}
#' # \usepackage[titles]{tocloft}
#' ---
#'
#'
## ----setup, include=FALSE----------------------------------------------
knitr::knit_hooks$set(time_it = local({
now <- NULL
function(before, options) {
if (before) {
# record the current time before each chunk
now <<- Sys.time()
} else {
# calculate the time difference after a chunk
res <- difftime(Sys.time(), now)
# return a character string to show the time
# paste("Time for this code chunk to run:", res)
paste("Time for the chunk", options$label, "to run:", res)
}
}
}))
# knit_hooks$get("inline")
# knitr::opts_chunk$set(fig.pos = "!H", out.extra = "")
knitr::opts_chunk$set(echo = TRUE,
fig.path = "figures/")
# Beware, using the "time_it" hook messes up fig.cap, \label, \ref
# knitr::opts_chunk$set(time_it = TRUE)
#knitr::opts_chunk$set(eval = FALSE)
#'
#'
## ---- include=FALSE, eval=FALSE----------------------------------------
## options(tinytex.verbose = TRUE)
##
## # set pandoc stack size
## stack_size <- getOption("pandoc.stack.size", default = "100000000")
## args <- c(c("+RTS", paste0("-K", stack_size), "-RTS"), args)
#'
#'
## ---- include=FALSE, echo=FALSE----------------------------------------
# library(dplyr)
# library(tidyr)
# library(purrr)
# library(readr)
library(tidyverse)
library(textrecipes)
library(tidymodels)
library(tidytext)
library(ngram)
library(keras)
library(stopwords)
# Used in Baseline model
library(hardhat)
# BERT setup in its own section
# library(keras)
# library(tfdatasets)
# library(reticulate)
# library(tidyverse)
# library(lubridate)
# library(tfhub)
# import("tensorflow_text")
# o_nlp <- import("official.nlp")
#
# Sys.setenv(TFHUB_CACHE_DIR="C:/Users/bijoor/.cache/tfhub_modules")
# Sys.getenv("TFHUB_CACHE_DIR")
set.seed(234)
# Start the clock!
# ptm <- proc.time()
Sys.time()
#'
#'
## ---- include=FALSE, echo=FALSE----------------------------------------
library(ggplot2)
library(kableExtra)
#'
#' <!-- ------------------------------ -->
#'
#' \bookmark[dest=TitlePage]{Title Page}
#'
#' \pagenumbering{roman} <!-- first page with Roman numbering -->
#'
#' \newpage <!-- new page -->
#'
#' <!-- ------------------------------ -->
#'
#' \newpage
#'
#' \begin{center}
#'
#' \hypertarget{Abstract}{}
#' \large{Abstract}
#' \bookmark[dest=Abstract]{Abstract}
#'
#' \end{center}
#'
#' \bigskip
#'
#' Deriving truth and insight from a pile of data is a powerful but error-prone job.
#'
#' This project offers an empirical exploration on the use of Neural networks for text classification using the Amazon Reviews Polarity dataset.
#'
#' Text classification algorithms are at the heart of a variety of software systems that process text data at scale.
#'
#' One common type of text classification is sentiment analysis, whose goal is to identify the polarity of text content: the type of opinion it expresses. This can take the form of a binary like/dislike rating, or a more granular set of options, such as a star rating from 1 to 5. Examples of sentiment analysis include analyzing Twitter posts to determine if people liked the Black Panther movie, or extrapolating the general public’s opinion of a new brand of Nike shoes from Walmart reviews.
#'
#' Algorithms such as regularized linear models, support vector machines, and naive Bayes models are used to predict outcomes from predictors including text data. These algorithms use a shallow (single) mapping. In contrast, Deep learning models approach the same tasks and have the same goals, but the algorithms involved are different. Deep learning models are "deep" in the sense that they use multiple layers to learn how to map from input features to output outcomes.
#'
#' Deep learning models can be effective for text prediction problems because they use these multiple layers to capture complex relationships in language.
#'
#' The layers in a deep learning model are connected in a network and these models are called Neural Networks.
#'
#' Neural language models (or continuous space language models) use continuous representations or embeddings of words to make their predictions. These models make use of Neural networks.
#'
#' Continuous space embeddings help to alleviate the curse of dimensionality in language modeling: as language models are trained on larger and larger texts, the number of unique words (the vocabulary) increases. The number of possible sequences of words increases exponentially with the size of the vocabulary, causing a data sparsity problem because of the exponentially many sequences. Thus, statistics are needed to properly estimate probabilities. Neural networks avoid this problem by representing words in a distributed way, as non-linear combinations of weights in a neural net.
#'
#' Instead of using neural net language models to produce actual probabilities, it is common to instead use the distributed representation encoded in the networks' "hidden" layers as representations of words; each word is then mapped onto an n-dimensional real vector called the word embedding, where n is the size of the layer just before the output layer.
#' An alternate description is that a neural net approximates the language function and models semantic relations between words as linear combinations, capturing a form of compositionality.
#'
#' In this project we will cover four network architectures, namely DNN, CNN, sepCNN and BERT. We will also first implement a Baseline linear classifier model which serves the purpose of comparison with the deep learning techniques.
#'
#' For metrics we will use the default performance parameters for binary classification which are Accuracy, Loss and ROC AUC (area under the receiver operator characteristic curve).
#'
#' <!-- ------------------------------ -->
#'
#' \newpage
#' \clearpage
#' \phantomsection
#' \setcounter{secnumdepth}{5}
#' \setcounter{tocdepth}{5}
#'
#' \cleardoublepage <!-- ensure that the hypertarget is on the same page as the TOC heading -->
#' \hypertarget{toc}{} <!-- set the hypertarget -->
#' \bookmark[dest=toc,level=chapter]{\contentsname}
#' \tableofcontents
#'
#' \clearpage
#'
#' <!-- ------------------------------ -->
#' <!-- \renewcommand{\theHsection}{\thepart.section.\thesection} -->
#'
#' \newpage
#' \clearpage
#' \phantomsection
#' # List of tables{-}
#' \renewcommand{\listtablename}{} <!-- removes default section name -->
#'
#' \listoftables
#' \clearpage
#'
#' \newpage
#' \clearpage
#' \phantomsection
#' # List of figures{-}
#' \renewcommand{\listfigurename}{}
#'
#' \listoffigures
#' \clearpage
#'
#' \newpage
#' \clearpage
#' \phantomsection
#' \newcommand{\listequationsname}{List of Equations}
#' \newlistof{equations}{equ}{\listequationsname}
#' \newcommand{\equations}[1]{%
#' \refstepcounter{equations}
#' \addcontentsline{equ}{equations}{ \protect\numberline{\theequations}#1}\par}
#' \xpretocmd{\listofequations}{\addcontentsline{toc}{section}{\listequationsname}}{}{}
#'
#' \renewcommand{\listequationsname}{}
#'
#' \listofequations
#' \clearpage
#'
#' <!-- ------------------------------ -->
#'
#' \newpage
#'
#' \pagenumbering{arabic}
#'
#' <!-- ------------------------------ -->
#'
#' \newpage
#' # Project Overview: Amazon Reviews Polarity
#'
#' ## Introduction
#'
#' Deriving truth and insight from a pile of data is a powerful but error-prone job.
#'
#' Text classification algorithms are at the heart of a variety of software systems that process text data at scale.
#'
#' One common type of text classification is sentiment analysis, whose goal is to identify the polarity of text content: the type of opinion it expresses. This can take the form of a binary like/dislike rating, or a more granular set of options, such as a star rating from 1 to 5. Examples of sentiment analysis include analyzing Twitter posts to determine if people liked the Black Panther movie, or extrapolating the general public’s opinion of a new brand of Nike shoes from Walmart reviews.
#'
#' Algorithms such as regularized linear models, support vector machines, and naive Bayes models are used to predict outcomes from predictors including text data. These algorithms use a shallow (single) mapping. In contrast, Deep learning models approach the same tasks and have the same goals, but the algorithms involved are different. Deep learning models are "deep" in the sense that they use multiple layers to learn how to map from input features to output outcomes.
#'
#' Deep learning models can be effective for text prediction problems because they use these multiple layers to capture complex relationships in language.
#'
#' The layers in a deep learning model are connected in a network and these models are called neural networks.
#'
#' ### Neural networks
#'
#' Neural language models (or continuous space language models) use continuous representations or [embeddings of words](https://en.wikipedia.org/wiki/Word_embedding) to make their predictions ([Karpathy, Andrej. "The Unreasonable Effectiveness of Recurrent Neural Networks"](https://karpathy.github.io/2015/05/21/rnn-effectiveness/)). These models make use of [neural networks](https://en.wikipedia.org/wiki/Artificial_neural_network).
#'
#' Continuous space embeddings help to alleviate the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) in language modeling: as language models are trained on larger and larger texts, the number of unique words (the vocabulary) increases ([Heaps' law](https://en.wikipedia.org/wiki/Heaps%27_law)). The number of possible sequences of words increases exponentially with the size of the vocabulary, causing a data sparsity problem because of the exponentially many sequences; statistics are therefore needed to properly estimate probabilities. Neural networks avoid this problem by representing words in a distributed way, as non-linear combinations of weights in a neural net ([Bengio, Yoshua (2008). "Neural net language models". Scholarpedia. 3. p. 3881. Bibcode:2008SchpJ...3.3881B. doi:10.4249/scholarpedia.3881](https://ui.adsabs.harvard.edu/abs/2008SchpJ...3.3881B/abstract)). An alternate description is that a neural net approximates the language function.
#'
#' Instead of using neural net language models to produce actual probabilities, it is common to instead use the distributed representation encoded in the networks' "hidden" layers as representations of words;
#' A hidden layer is a synthetic layer in a neural network between the input layer (that is, the features) and the output layer (the prediction). Hidden layers typically contain an activation function such as [ReLU](https://developers.google.com/machine-learning/glossary?utm_source=DevSite&utm_campaign=Text-Class-Guide&utm_medium=referral&utm_content=glossary&utm_term=sepCNN#rectified-linear-unit-relu) for training. A deep neural network contains more than one hidden layer. Each word is then mapped onto an n-dimensional real vector called the word embedding, where n is the size of the layer just before the output layer. The representations in skip-gram models for example have the distinct characteristic that they model semantic relations between words as [linear combinations](https://en.wikipedia.org/wiki/Linear_combination), capturing a form of [compositionality](https://en.wikipedia.org/wiki/Principle_of_compositionality).
#'
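#' As a minimal sketch of this idea in R keras (the vocabulary size and embedding dimension here are illustrative assumptions, not this project's actual settings), an embedding layer maps each integer word id to an n-dimensional real vector:
#'
## ----embedding_sketch, eval=FALSE---------------------------------------
## library(keras)
## # maps each of 20,000 integer word ids to a 16-dimensional real vector
## embedding_layer <- layer_embedding(input_dim = 20000, output_dim = 16)
#'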
#'
#' In this project we will cover four network architectures, namely:
#'
#' 1. DNN - Dense Neural Network - a bridge between the "shallow" learning approaches and the other 3 - CNN, sepCNN, BERT.
#'
#' 2. CNN - Convolutional Neural Network - advanced architecture appropriate for text data because they can capture specific local patterns.
#'
#' 3. sepCNN - Depthwise Separable Convolutional Neural Network.
#'
#' 4. BERT - Bidirectional Encoder Representations from Transformers.
#'
#' We will also first implement a Baseline linear classifier model which serves the purpose of comparison with the deep learning techniques we will implement later on, and also as a succinct summary of a basic supervised machine learning analysis for text.
#'
#' This linear baseline is a regularized linear model trained on the same data set, using tf-idf weights and 5000 tokens.
#'
#' For metrics we will use the default performance parameters for binary classification which are Accuracy, Loss and ROC AUC (area under the receiver operator characteristic curve).
#'
#' We will also use the confusion matrix to get an overview of our model performance, as it includes rich information.
#'
#' We will use tidymodels packages along with keras, the R interface to Keras/TensorFlow (see [Allaire, JJ, and François Chollet. 2021. keras: R Interface to 'Keras'](https://CRAN.R-project.org/package=keras)), for preprocessing, modeling, and evaluation. Other key references are [Silge, Julia, and David Robinson. 2017. Text Mining with R: A Tidy Approach. 1st ed. O'Reilly Media, Inc.](https://www.tidytextmining.com/), [Supervised Machine Learning for Text Analysis in R, by Emil Hvitfeldt and Julia Silge](https://smltar.com/), [Tidy Modeling with R, Max Kuhn and Julia Silge, Version 0.0.1.9010, 2021-07-19](https://www.tmwr.org/), and, how can we forget, [Introduction to Data Science, Data Analysis and Prediction Algorithms with R - Rafael A. Irizarry, 2021-07-03](https://rafalab.github.io/dsbook/).
#'
#' The keras R package provides an interface for R users to Keras, a high-level API for building neural networks.
#'
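#' As an illustrative sketch only (not one of the models built later in this project), a small binary text classifier in keras reads as a pipeline of layers:
#'
## ----keras_api_sketch, eval=FALSE---------------------------------------
## library(keras)
## # embedding, pooling, and a sigmoid output for a 0/1 label
## model <- keras_model_sequential() %>%
##   layer_embedding(input_dim = 20000, output_dim = 16) %>%
##   layer_global_average_pooling_1d() %>%
##   layer_dense(units = 1, activation = "sigmoid")
## model %>% compile(
##   optimizer = "adam",
##   loss = "binary_crossentropy",
##   metrics = "accuracy"
## )
#'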
#' This project will use some key machine learning best practices for solving text classification problems.
#' Here’s what you’ll learn:
#'
#' 1. The high-level, end-to-end workflow for solving text classification problems using machine learning
#' 2. How to choose the right model for your text classification problem
#' 3. How to implement your model of choice using TensorFlow with Keras acting as an interface for the TensorFlow library
#'
#' I have used/mentioned several references throughout the project.
#'
#' This project depends on Python and R software for TensorFlow and Keras, which needs to be installed both inside and outside of R. As each individual's environment may be different, I cannot automate this part in my code.
#'
#' R side:
#' https://cran.r-project.org/
#' https://tensorflow.rstudio.com/installation/
#' https://tensorflow.rstudio.com/installation/gpu/local_gpu/
#'
#' Python side:
#' https://www.tensorflow.org/install
#' https://www.anaconda.com/products/individual
#' https://keras.io/
#'
#' Instead of cluttering the code with comments, I ask you to please use these references and the RStudio help (?cmd / ??cmd) if you are not very familiar with any specific command. Most commands are fairly self-explanatory if you are even a little familiar with R.
#'
#' Here are some more references:
#'
#' ## References
#'
#' [Tensorflow](https://www.tensorflow.org/) is an end-to-end open source platform for machine learning. It has a comprehensive, flexible ecosystem of tools, libraries and community resources that lets researchers push the state-of-the-art in ML and developers easily build and deploy ML powered applications.
#'
#' The [TensorFlow Hub](https://tfhub.dev/) lets you search and discover hundreds of trained, ready-to-deploy machine learning models in one place.
#'
#' [Tensorflow for R](https://tensorflow.rstudio.com/) provides an R interface for Tensorflow.
#'
#' [Tidy Modeling with R](https://www.tmwr.org/)
#'
#' [Tinytex](https://yihui.org/tinytex/)
#' I have used tinytex in code chunks.
#'
#' [Latex](https://www.overleaf.com/learn/latex)
#' I have used LaTeX beyond the very basics provided by the default templates in RStudio; the features are too numerous to explain here. Though that much is not strictly needed, I used it to learn and to produce a better PDF document.
#'
#' [Rmarkdown](https://bookdown.org/yihui/rmarkdown)
#'
#'
#' \newpage
#' ## Text Classification Workflow
#'
#' Here’s a high-level overview of the workflow used to solve machine learning problems:
#'
#' * Step 1: Gather Data
#' * Step 2: Explore Your Data
#' * Step 2.5: Choose a Model
#' * Step 3: Prepare Your Data
#' * Step 4: Build, Train, and Evaluate Your Model
#' * Step 5: Tune Hyperparameters
#' * Step 6: Deploy Your Model
#'
#' The following sections explain each step in detail, and how to implement them for text data.
#'
#' ### Gather Data
#' Gathering data is the most important step in solving any supervised machine learning problem. Your text classifier can only be as good as the dataset it is built from.
#'
#' Here are some important things to remember when collecting data:
#'
#' 1. If you are using a public API, understand the limitations of the API before using them. For example, some APIs set a limit on the rate at which you can make queries.
#'
#' 2. The more training examples/samples you have, the better. This will help your model generalize better.
#'
#' 3. Make sure the number of samples for every class or topic is not overly imbalanced. That is, you should have a comparable number of samples in each class.
#'
#' 4. Make sure that your samples adequately cover the space of possible inputs, not only the common cases.
#'
#' This dataset contains Amazon reviews posted by people on the Amazon website, and is a classic example of a sentiment analysis problem.
#'
#' Amazon Review Polarity Dataset - Version 3, Updated 09/09/2015
#'
#'
#' ORIGIN
#'
#' The Amazon reviews dataset consists of reviews from Amazon. The data span a period of 18 years, including ~35 million reviews up to March 2013. Reviews include product and user information, ratings, and a plaintext review. For more information, please refer to the following paper: [J. McAuley and J. Leskovec. Hidden factors and hidden topics: Understanding rating dimensions with review text. In Proceedings of the 7th ACM Conference on Recommender Systems, RecSys '13, pages 165–172, New York, NY, USA, 2013. ACM](https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf).
#'
#' The Amazon reviews polarity dataset was constructed by Xiang Zhang ([email protected]) from the above dataset. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626). Advances in Neural Information Processing Systems 28 (NIPS 2015).
#'
#' Here is an Abstract of that paper:
#'
#' This article offers an empirical exploration on the use of character-level convolutional networks (ConvNets) for text classification. We constructed several large-scale datasets to show that character-level convolutional networks could achieve state-of-the-art or competitive results. Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF variants, and deep learning models such as word-based ConvNets and recurrent neural networks.
#'
#' Coming back to our project: as Google has changed its API, I had to download the dataset manually from the URL below.
#'
#' Please select the file named "amazon_review_polarity_csv.tar.gz" and download it to the project directory.
#'
#' Download Location URL : [Xiang Zhang Google Drive](https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M?resourcekey=0-TLwzfR2O-D2aPitmn5o9VQ)
#'
#'
#' DESCRIPTION
#'
#' The Amazon reviews polarity dataset is constructed by taking review scores 1 and 2 as negative, and 4 and 5 as positive. Samples with score 3 are ignored. In the dataset, class 1 is the negative and class 2 is the positive. Each class has 1,800,000 training samples and 200,000 testing samples.
#'
#' The files train.csv and test.csv contain all the training samples as comma-separated values. There are 3 columns in them, corresponding to label/class index (1 or 2), review title and review text. The review title and text are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed by an "n" character, that is "\textbackslash n".
#'
#'
#' \newpage
#' ### Explore Your Data
#' Building and training a model is only one part of the workflow. Understanding the characteristics of your data beforehand will enable you to build a better model. This could simply mean obtaining a higher accuracy. It could also mean requiring less data for training, or fewer computational resources.
#'
#' #### Load the Dataset
#' First up, let’s load the dataset into R.
#'
#' In the dataset, class 1 is the negative and class 2 is the positive review. We will change these to 0 and 1.
#'
#' columns = (0, 1, 2) \# 0 - label/class index, 1 - title/subject, 2 - text body/review.
#'
#' In this project we will NOT be using the "title" data. We will use only "label" and "text".
#' Also note that I have more comments in the code file/s than in the pdf document.
#'
## ----untar_dataset, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE----
untar("amazon_review_polarity_csv.tar.gz",list=TRUE) ## check contents
untar("amazon_review_polarity_csv.tar.gz")
#'
#'
#'
## ----load_csv, include=FALSE, echo=FALSE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE----
train_file_path <- file.path("amazon_review_polarity_csv/train.csv")
test_file_path <- file.path("amazon_review_polarity_csv/test.csv")
# read data, ensure "utf-8" encoding, add column names, exclude rows with missing values(NA)
amazon_orig_train <- readr::read_csv(
train_file_path,
# skip = 0,
col_names = c("label", "title", "text"),
locale = locale(encoding = "UTF-8")) %>% na.omit()
# change labels from (1,2) to (0,1) - easier for binary classification
amazon_orig_train$label[amazon_orig_train$label==1] <- 0
amazon_orig_train$label[amazon_orig_train$label==2] <- 1
# removed numbers as they were too many and did not contribute any info
# amazon_orig_train$text <- str_replace_all(amazon_orig_train$text,"[^([[:alnum:]_])]"," ") %>% trimws() %>% str_squish()
#
# amazon_orig_train$title <- str_replace_all(amazon_orig_train$title,"[^([[:alnum:]_])]"," ") %>% trimws() %>% str_squish()
# replace any character that is not a letter or underscore with a space,
# remove leading/trailing whitespace (trimws),
# and collapse repeated internal whitespace (str_squish)
amazon_orig_train$text <- str_replace_all(amazon_orig_train$text, "[^[:alpha:]_]", " ") %>% trimws() %>% str_squish()
amazon_orig_train$title <- str_replace_all(amazon_orig_train$title, "[^[:alpha:]_]", " ") %>% trimws() %>% str_squish()
# create a validation set for training purposes
ids_train <- sample.int(nrow(amazon_orig_train), size = 0.8*nrow(amazon_orig_train))
amazon_train <- amazon_orig_train[ids_train,]
amazon_val <- amazon_orig_train[-ids_train,]
head(amazon_train)
# save cleaned up data for later use
write_csv(amazon_train,"amazon_review_polarity_csv/amazon_train.csv", col_names = TRUE)
write_csv(amazon_val,"amazon_review_polarity_csv/amazon_val.csv", col_names = TRUE)
# -----------------------------------------------
# read data, ensure "utf-8" encoding, add column names, exclude rows with missing values(NA)
amazon_orig_test <- readr::read_csv(
test_file_path,
# skip = 0,
col_names = c("label", "title", "text"),
locale = locale(encoding = "UTF-8")) %>% na.omit()
# change labels from (1,2) to (0,1) - easier for binary classification
amazon_orig_test$label[amazon_orig_test$label==1] <- 0
amazon_orig_test$label[amazon_orig_test$label==2] <- 1
# replace any character that is not a letter or underscore with a space,
# remove leading/trailing whitespace (trimws),
# and collapse repeated internal whitespace (str_squish)
amazon_orig_test$text <- str_replace_all(amazon_orig_test$text, "[^[:alpha:]_]", " ") %>% trimws() %>% str_squish()
amazon_orig_test$title <- str_replace_all(amazon_orig_test$title, "[^[:alpha:]_]", " ") %>% trimws() %>% str_squish()
# amazon_orig_test$text <- str_replace_all(amazon_orig_test$text,"[^([[:alnum:]_])]"," ") %>% trimws() %>% str_squish()
#
# amazon_orig_test$title <- str_replace_all(amazon_orig_test$title,"[^([[:alnum:]_])]"," ") %>% trimws() %>% str_squish()
head(amazon_orig_test)
# save cleaned up data for later use
write_csv(amazon_orig_test,"amazon_review_polarity_csv/amazon_test.csv", col_names = TRUE)
rm(amazon_orig_train, amazon_orig_test)
rm(ids_train, test_file_path, train_file_path)
# free unused R memory
gc()
#'
#'
## ----temp, echo=FALSE, eval=TRUE, message=FALSE, include=FALSE---------
#### To be deleted later
amazon_train <- readr::read_csv("amazon_review_polarity_csv/amazon_train.csv")
#'
#' \newpage
#' #### Check the Data
#' After loading the data, it's good practice to run some checks on it: pick a few samples and manually check if they are consistent with your expectations. For example, see Table \ref{tbl:amazon_train}.
#'
## ----chk_data_1, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE----
glimpse(amazon_train)
#'
## ----chk_data_2, echo=FALSE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, tidy.opts=list(blank = FALSE, width.cutoff = 90)----
# head(amazon_train)
kable(amazon_train[1:10,], "latex", escape=FALSE, booktabs=TRUE, linesep="", caption="Amazon Train data\\label{tbl:amazon_train}") %>%
kable_styling(latex_options=c("HOLD_position"), font_size=6)
# kable_styling(full_width = F)
#'
#' Labels : Negative reviews = 0, Positive reviews = 1
## ----chk_data_3, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE----
unique(amazon_train$label)
#'
#' \newpage
#' #### Collect Key Metrics
#'
#' Once you've verified the data, collect the following
#' important metrics that can help characterize your text classification
#' problem:
#'
#' 1. Number of samples: Total number of examples you have in the data.
#'
#' 2. Number of classes: Total number of topics or categories in the data.
#'
#' 3. Number of samples per class: Number of samples per class (topic/category). In a balanced dataset, all classes will have a similar number of samples; in an imbalanced dataset, the number of samples in each class will vary widely.
#'
#' 4. Number of words per sample: Median number of words in one sample.
#'
#' 5. Frequency distribution of words: Distribution showing the frequency (number of occurrences) of each word in the dataset.
#'
#' 6. Distribution of sample length: Distribution showing the number of words per sample in the dataset.
#'
#'
#' Number of samples
## ----num_samples, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, message=FALSE----
(num_samples <- nrow(amazon_train))
#'
#'
#' Number of classes
## ----num_classes, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, message=FALSE----
(num_classes <- length(unique(amazon_train$label)))
#'
#'
#' Number of samples per class
## ----balanced_classes, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, message=FALSE----
# Pretty Balanced classes
(num_samples_per_class <- amazon_train %>% count(label))
#'
#'
#' Number of words per sample
## ----mean_median_num_words_per_sample, echo=6:14, include=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, message=FALSE----
# break up the strings in each row by " "
temp <- strsplit(amazon_train$text, split=" ")
# sapply(temp[c(1:3)], length)
# count the number of words as the length of the vectors
amazon_train_text_wordCount <- sapply(temp, length)
(mean_num_words_per_sample <- mean(amazon_train_text_wordCount))
(median_num_words_per_sample <- median(amazon_train_text_wordCount))
#'
#'
#' \newpage
#' #### Tokenization
#'
#' To build features for supervised machine learning from natural language, we need some way of representing raw text as numbers so we can perform computation on them. Typically, one of the first steps in this transformation from natural language to features, or any kind of text analysis, is tokenization. Knowing what tokenization and tokens are, along with the related concept of an n-gram, is important for almost any natural language processing task.
#'
#' Tokenization in NLP/text classification is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms. Each of these smaller units is called a token.
#'
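#' As a minimal sketch of tokenization with tidytext (a toy sentence, purely illustrative):
#'
## ----tok_sketch, eval=FALSE---------------------------------------------
## library(tidytext)
## library(tibble)
## tibble(text = "This product exceeded all my expectations") %>%
##   unnest_tokens(word, text)
## # returns one lower-cased token per row: "this", "product", "exceeded", ...
#'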
#' For the frequency distribution of words (n-grams) and for the Top 25 words, see Table \ref{tbl:train_words} and Figure \ref{fig:model_1}
#'
## ----freq_dist_ngrams, echo=FALSE, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE, tidy.opts=list(blank = FALSE, width.cutoff = 90)----
# Frequency distribution of words(ngrams)
train_words <- amazon_train %>% unnest_tokens(word, text) %>% count(word,sort = TRUE)
total_words <- train_words %>%
summarize(total = sum(n))
# Zipf's law states that the frequency at which a word appears is inversely proportional to its rank.
train_words <- train_words %>%
mutate(total_words) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
# head(train_words)
kable(train_words[1:10,], "latex", escape=FALSE, booktabs=TRUE, linesep="", caption="Frequency distribution of words\\label{tbl:train_words}") #%>%
# kable_styling(latex_options=c("HOLD_position"), font_size=6)
#'
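#' Since the chunk above computes each word's rank and term frequency, Zipf's law can be checked visually: on log-log axes the relationship should be roughly a straight line (a sketch, assuming train_words from the chunk above):
#'
## ----zipf_sketch, eval=FALSE--------------------------------------------
## train_words %>%
##   ggplot(aes(rank, `term frequency`)) +
##   geom_line() +
##   scale_x_log10() +
##   scale_y_log10()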
#'
## ----plot_freq_dist_ngrams, echo=FALSE, eval=TRUE, comment="", message=FALSE, fig.pos="h!", fig.cap="Frequency distribution of words (n-grams) for Top 25 words\\label{fig:model_1}"----
train_words %>%
  top_n(25, n) %>%
  ggplot(aes(reorder(word, n), n)) +
  # binwidth is not a geom_col() parameter, so it was dropped
  geom_col(alpha = 0.8) +
  coord_flip() +
  labs(y="n - Frequency distribution of words (n-grams)",
       x="Top 25 words")
#'
#'
#' \newpage
#' #### Stopwords
#'
#' Once we have split text into tokens, it often becomes clear that not all words carry the same amount of information, if any information at all, for a predictive modeling task. Common words that carry little (or perhaps no) meaningful information are called stop words. It is common advice and practice to remove stop words for various NLP tasks.
#'
#' The concept of stop words has a long history with Hans Peter Luhn credited with coining the term in 1960. [Luhn, H. P. 1960. “Key Word-in-Context Index for Technical Literature (kwic Index).” American Documentation 11 (4): 288–295. doi:10.1002/asi.5090110403](https://doi.org/10.1002/asi.5090110403). Examples of these words in English are “a,” “the,” “of,” and “didn’t.” These words are very common and typically don’t add much to the meaning of a text but instead ensure the structure of a sentence is sound.
#'
#' Historically, one of the main reasons for removing stop words was to decrease the computational time for text mining; it can be regarded as a dimensionality reduction of text data and was commonly used in search engines to give better results [Huston, Samuel, and W. Bruce Croft. 2010. “Evaluating Verbose Query Processing Techniques.” In Proceedings of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval, 291–298. SIGIR ’10. New York, NY, USA: ACM. doi:10.1145/1835449.1835499](https://doi.org/10.1145/1835449.1835499).
#'
#' For the frequency distribution of words (n-grams) and for the Top 25 words excluding stopwords, see Table \ref{tbl:train_words_sw} and Figure \ref{fig:model_2}
#'
#'
#' Using Pre-made stopwords
## ----stopwords_choices, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, message=FALSE----
length(stopwords(source = "smart"))
length(stopwords(source = "snowball"))
length(stopwords(source = "stopwords-iso"))
#'
#'
#' Frequency distribution of words with stopwords removed
#'
#' We will use the pre-made "stopwords-iso" lexicon, along with a few stopwords unique to our case
## ----freq_dist_ngrams_stopwords, echo=1:1, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE, tidy.opts=list(blank = FALSE, width.cutoff = 90)----
mystopwords <- c("s", "t", "m", "ve", "re", "d", "ll")
# Frequency distribution of words(ngrams)
train_words_sw <- amazon_train %>% unnest_tokens(word, text) %>%
anti_join(get_stopwords(source = "stopwords-iso"))%>%
filter(!(word %in% mystopwords)) %>%
count(word,sort = TRUE)
total_words_sw <- train_words_sw %>%
summarize(total = sum(n))
# Zipf's law states that the frequency at which a word appears is inversely proportional to its rank.
train_words_sw <- train_words_sw %>%
mutate(total_words_sw) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
# head(train_words_sw)
kable(train_words_sw[1:10,], "latex", escape=FALSE, booktabs=TRUE, linesep="", caption="Frequency distribution of words excluding stopwords\\label{tbl:train_words_sw}") #%>%
# kable_styling(latex_options=c("HOLD_position"), font_size=6)
#'
#'
#'
## ----plot_freq_dist_ngrams_stopwords, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Frequency distribution of words (n-grams) for Top 25 words excluding stopwords\\label{fig:model_2}"----
train_words_sw %>%
  top_n(25, n) %>%
  ggplot(aes(reorder(word, n), n)) +
  # binwidth is not a geom_col() parameter, so it was dropped
  geom_col(alpha = 0.8) +
  coord_flip() +
  labs(y="n - Frequency distribution of words (n-grams) excluding stopwords",
       x="Top 25 words")
#'
#'
#' \newpage
#' Here are Google's recommendations after decades of research:
#'
#' Algorithm for Data Preparation and Model Building
#'
#' 1. Calculate the number of samples/number of words per sample ratio.
#' 2. If this ratio is less than 1500, tokenize the text as n-grams and use a simple multi-layer perceptron (MLP) model to classify them:
#'     a. Split the samples into word n-grams; convert the n-grams into vectors.
#'     b. Score the importance of the vectors and then select the top 20K using the scores.
#'     c. Build an MLP model.
#' 3. If the ratio is greater than 1500, tokenize the text as sequences and use a [sepCNN](https://developers.google.com/machine-learning/glossary?utm_source=DevSite&utm_campaign=Text-Class-Guide&utm_medium=referral&utm_content=glossary&utm_term=sepCNN#depthwise-separable-convolutional-neural-network-sepcnn) model to classify them:
#'     a. Split the samples into words; select the top 20K words based on their frequency.
#'     b. Convert the samples into word sequence vectors.
#'     c. If the original number of samples/number of words per sample ratio is less than 15K, using a fine-tuned pre-trained embedding with the sepCNN model will likely provide the best results.
#' 4. Measure the model performance with different hyperparameter values to find the best model configuration for the dataset. A sketch of the branch this heuristic implies follows the ratio computation below.
#'
## ----S_W_ratio, include=TRUE, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, warning=FALSE, message=FALSE----
# 3. If the ratio is greater than 1500, tokenize the text as
# sequences and use a sepCNN model
# see above
(S_W_ratio <- num_samples / median_num_words_per_sample)
#'
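#'
#' A minimal sketch of the branch the heuristic above implies, using the ratio just computed (the strings are placeholders for the two modeling paths, not real function calls):
#'
## ----heuristic_branch_sketch, eval=FALSE--------------------------------
## if (S_W_ratio < 1500) {
##   "tokenize as n-grams and fit an MLP"
## } else {
##   "tokenize as sequences and fit a sepCNN"
## }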
#'
#' \newpage
#' ## Preprocessing for deep learning, continued with more exploration
#'
#' For "Number of words per review text" see Figure \ref{fig:model_3}
#'
#' For "Number of words per review title" see Figure \ref{fig:model_4}
#'
#' For "Number of words per review text by label" see Figure \ref{fig:model_5}
#'
#' For "Number of words per review title by label" see Figure \ref{fig:model_6}
#'
#' For "Sample/Subset of our training dataset" see Table \ref{tbl:amazon_subset_train}
#'
#'
## ----preproc_1, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Number of words per review text\\label{fig:model_3}"----
amazon_train %>%
mutate(n_words = tokenizers::count_words(text)) %>%
ggplot(aes(n_words)) +
geom_bar() +
labs(x = "Number of words per review text",
y = "Number of review texts")
#'
## ----preproc_2, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Number of words per review title\\label{fig:model_4}"----
amazon_train %>%
mutate(n_words = tokenizers::count_words(title)) %>%
ggplot(aes(n_words)) +
geom_bar() +
labs(x = "Number of words per review title",
y = "Number of review titles")
#'
## ----preproc_3, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Number of words per review text by label\\label{fig:model_5}"----
amazon_train %>%
group_by(label) %>%
mutate(n_words = tokenizers::count_words(text)) %>%
ggplot(aes(n_words)) +
# ggplot(aes(nchar(text))) +
geom_histogram(binwidth = 1, alpha = 0.8) +
facet_wrap(~ label, nrow = 1) +
labs(x = "Number of words per review text by label",
y = "Number of reviews")
#'
## ----preproc_4, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Number of words per review title by label\\label{fig:model_6}"----
amazon_train %>%
group_by(label) %>%
mutate(n_words = tokenizers::count_words(title)) %>%
ggplot(aes(n_words)) +
# ggplot(aes(nchar(title))) +
geom_histogram(binwidth = 1, alpha = 0.8) +
facet_wrap(~ label, nrow = 1) +
labs(x = "Number of words per review title by label",
y = "Number of reviews")
#'
#'
#' Let's trim down our training dataset due to computing resource limitations.
## ----subset_train, echo=1:3, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE, tidy.opts=list(blank = FALSE, width.cutoff = 90)----
amazon_subset_train <- amazon_train %>% select(-title) %>%
mutate(n_words = tokenizers::count_words(text)) %>%
filter((n_words < 35) & (n_words > 5)) %>% select(-n_words)
dim(amazon_subset_train)
# head(amazon_subset_train)
kable(amazon_subset_train[1:10,], "latex", escape=FALSE, booktabs=TRUE, linesep="", caption="Sample/Subset of our training dataset\\label{tbl:amazon_subset_train}") #%>%
# kable_styling(latex_options=c("HOLD_position"), font_size=6)
#'
#'
#'
#' \newpage
#' # Model Baseline linear classifier
#'
#' This model serves the purpose of comparison with the deep learning techniques we will implement later on, and also as a succinct summary of a basic supervised machine learning analysis for text.
#'
#' This linear baseline is a regularized linear model trained on the same data set, using tf-idf weights and 5000 tokens.
#'
#' ## Modify label column to factor
#'
## ----label_to_factor, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
# Free computer resources
rm(amazon_train, amazon_val, amazon_train_text_wordCount,num_samples_per_class, temp, total_words, train_words)
rm(mean_num_words_per_sample, median_num_words_per_sample, num_classes, num_samples, S_W_ratio)
gc()
# save(amazon_subset_train)
write_csv(amazon_subset_train,"amazon_review_polarity_csv/amazon_subset_train.csv", col_names = TRUE)
amazon_train <- amazon_subset_train
amazon_train <- amazon_train %>%
mutate(label = as.factor(label))
# amazon_val <- amazon_train %>%
# mutate(label = as.factor(label))
#'
#'
#' ## Split into test/train and create resampling folds
#'
## ----create_folds, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", highlight=TRUE, background='#F7F7F7', tidy=TRUE, tidy.opts=list(blank = FALSE, width.cutoff = 60)----
set.seed(1234)
amazon_split <- amazon_train %>% initial_split()
amazon_train <- training(amazon_split)
amazon_test <- testing(amazon_split)
set.seed(123)
amazon_folds <- vfold_cv(amazon_train)
# amazon_folds
#'
#'
#' ## Recipe for data preprocessing
#'
#' "step_tfidf" creates a specification of a recipe step that will convert a tokenlist into multiple variables containing the [term frequency-inverse document frequency](https://www.tidytextmining.com/tfidf.html) of tokens.(check it out in the console by typing ?textrecipes::step_tfidf)
#'
## ----rec_blm, echo=TRUE, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
# library(textrecipes)
amazon_rec <- recipe(label ~ text, data = amazon_train) %>%
step_tokenize(text) %>%
step_tokenfilter(text, max_tokens = 5e3) %>%
step_tfidf(text)
amazon_rec
#'
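#' To inspect what this recipe produces, it can be prepped and baked on a small slice of the data (a sketch; prepping on the full training set is memory-hungry, and n = 100 is an arbitrary choice):
#'
## ----rec_preview_sketch, eval=FALSE-------------------------------------
## amazon_rec %>%
##   prep(training = dplyr::slice_head(amazon_train, n = 100)) %>%
##   bake(new_data = NULL)
## # yields the label column plus one tfidf_text_* column per retained token
#'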
#' ## Lasso regularized classification model and tuning
#'
#' Linear models are not considered cutting edge in NLP research, but are a workhorse in real-world practice. Here we will use a lasso regularized model [Tibshirani, Robert. 1996. "Regression Shrinkage and Selection via the Lasso." Journal of the Royal Statistical Society. Series B (Methodological) 58 (1). Royal Statistical Society, Wiley: 267–288.](http://www.jstor.org/stable/2346178).
#'
#' Let's create a specification of a lasso regularized model.
#'
#' "penalty" is a model hyperparameter and we cannot learn its best value during model training, but we can estimate the best value by training many models on resampled data sets and exploring how well all these models perform. Let’s build a new model specification for model tuning.
#'
## ----lasso_spec, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lasso_spec
#'
#' ## A model workflow
#'
#' We need a few more components before we can tune our workflow. Let's use
#' a sparse data encoding.
#'
#' We can change how our text data is represented to take advantage of its sparsity, especially for models like lasso regularized models. The regularized regression model specified above uses set_engine("glmnet"); this computational engine can be more efficient when text data is transformed to a sparse matrix, rather than a dense data frame or tibble representation.
#'
#' To keep our text data sparse throughout modeling and use the sparse capabilities of set_engine("glmnet"), we need to explicitly set a non-default preprocessing blueprint, using the package hardhat [Vaughan, Davis, and Max Kuhn. 2020. hardhat: Construct Modeling Packages.](https://CRAN.R-project.org/package=hardhat).
#'
## ----rec_blueprint, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
library(hardhat)
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
#'
#' Let's create a grid of possible regularization penalties to try, using a convenience function for penalty() called grid_regular() from the dials package.
#'
## ----penalty_grid, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
lambda_grid <- grid_regular(penalty(range = c(-5, 0)), levels = 20)
lambda_grid
#'
#' Now these can be combined in a tuneable workflow()
#'
## ----amazon_wf_1, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
amazon_wf <- workflow() %>%
add_recipe(amazon_rec, blueprint = sparse_bp) %>%
add_model(lasso_spec)
amazon_wf
#'
#'
#' \newpage
#' ## Tune the workflow
#'
#' Let’s use tune_grid() to fit a model at each of the values for the regularization penalty in our regular grid and every resample in amazon_folds.
#'
## ----tune_grid_1, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
set.seed(2020)
lasso_rs <- tune_grid(
amazon_wf,
amazon_folds,
grid = lambda_grid,
control = control_resamples(save_pred = TRUE)
)
# lasso_rs
#'
#'
#' We now have a set of metrics for each value of the regularization penalty.
#'
#' We can extract the relevant information using collect_metrics() and collect_predictions().
#'
#' See Table \ref{tbl:lasso_metrics} for Lasso Metrics
#'
## ----lasso_metrics, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
m_lm <- collect_metrics(lasso_rs)
kable(m_lm, format = "simple", caption="Lasso Metrics\\label{tbl:lasso_metrics}")
#'
#'
#' What are the best models?
#'
#' See Table \ref{tbl:best_lasso_roc} for Best Lasso ROC.
#'
## ----best_lasso_roc, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
m_blr <- show_best(lasso_rs, "roc_auc")
kable(m_blr, format = "simple", caption="Best Lasso ROC\\label{tbl:best_lasso_roc}")
#'
#'
#' See Table \ref{tbl:best_lasso_acc} for Best Lasso Accuracy.
#'
## ----best_lasso_acc, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
m_bla <- show_best(lasso_rs, "accuracy")
kable(m_bla, format = "simple", caption="Best Lasso Accuracy\\label{tbl:best_lasso_acc}")
#'
#'
#' Let's visualize these metrics, accuracy and ROC AUC, in Figure \ref{fig:model_7} to see what the best model is.
#'
## ----plot_lasso, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Lasso model performance across regularization penalties\\label{fig:model_7}"----
autoplot(lasso_rs) +
labs(
title = "Lasso model performance across regularization penalties",
subtitle = "Performance metrics can be used to identify the best penalty"
)
#'
#'
#' See Table \ref{tbl:lasso_predictions} for Lasso Predictions
#'
## ----lasso_predictions, eval=TRUE, collapse = TRUE, comment="", warning=FALSE, message=FALSE, highlight=TRUE, background='#F7F7F7', tidy=TRUE----
m_lp <- collect_predictions(lasso_rs)
kable(head(m_lp), format = "simple", caption="Lasso Predictions\\label{tbl:lasso_predictions}")
#'
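#' As mentioned in the overview, a confusion matrix gives a rich summary of model performance. It can be computed from these resampled predictions (a sketch, assuming m_lp from the chunk above; yardstick's conf_mat() is loaded with tidymodels):
#'
## ----conf_mat_sketch, eval=FALSE----------------------------------------
## best_penalty <- select_best(lasso_rs, "roc_auc")$penalty
## m_lp %>%
##   filter(penalty == best_penalty) %>%
##   conf_mat(truth = label, estimate = .pred_class)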
#'
#' Figure \ref{fig:model_8} shows the ROC curve, a visualization of how well a classification model can distinguish between classes.
#'
## ----m_lp_roc_0, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Lasso model ROC Label 0\\label{fig:model_8}"----
m_lp %>%
# mutate(.pred_class=as.numeric(levels(.pred_class)[.pred_class])) %>%
group_by(id) %>%
roc_curve(truth = label, .pred_0) %>%
autoplot() +
labs(
color = NULL,
title = "ROC curve for Lasso model Label 0",
subtitle = "Each resample fold is shown in a different color"
)
#'
#' Figure \ref{fig:model_9} shows the ROC curve, a visualization of how well a classification model can distinguish between classes.
#'
## ----m_lp_roc_1, echo=FALSE, eval=TRUE, comment="", warning=FALSE, message=FALSE, fig.pos="h!", fig.cap="Lasso model ROC Label 1\\label{fig:model_9}"----
m_lp %>%
group_by(id) %>%
roc_curve(truth = label, .pred_1) %>%
autoplot() +
labs(
color = NULL,
title = "ROC curve for Lasso model Label 1",
subtitle = "Each resample fold is shown in a different color"
)
#'
## ----blm_best, echo=FALSE----------------------------------------------
# Best ROC_AUC
blm_best_roc <- max(m_blr$mean)
# Best Accuracy