From 3c9216d34da32520c03ceec7d3ccc17ecad4e53b Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 13:38:36 +0530 Subject: [PATCH 01/29] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 912bd1e..6c11b1b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ Depends: R (>= 3.4.0) Suggests: testthat Imports: data.table, ggplot2, graphics, grid, lexicon (>= 1.2.1), methods, stats, stringi, syuzhet, textclean (>= 0.6.1), textshape (>= 1.3.0), - utils + utils, DescTools License: MIT + file LICENSE LazyData: TRUE Roxygen: list(wrap = FALSE) From 62ad324fd08ac929e114e2d65617415d64ab6213 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 13:41:17 +0530 Subject: [PATCH 02/29] Added comma removal before valence shifters --- R/sentiment.R | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index cebd7d4..cb7477d 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -88,7 +88,8 @@ #' useful in literary works, where like is often used in non-verb form, than #' product comments. Use of this parameter will add compute time, this must be #' weighed against the need for accuracy and the likeliness that more accurate -#' results will come from setting this argument to \code{TRUE}. +#' results will come from setting this argument to \code{TRUE}. + removing commas before +#' valence shifters. #' @param missing_value A value to replace \code{NA}/\code{NaN} with. Use #' \code{NULL} to retain missing values. #' @param \ldots Ignored. @@ -316,6 +317,31 @@ sentiment <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_ri } +run_preprocess <- function(sentence) { +sentence <- as.character(sentence) +u <- unlist(strsplit(sentence,split = ' ')) +w <- u[u %like any% c(",%","%,")] +index <- which(u %like any% c(",%","%,")) +if(length(index)==0) +{ + t1 <- u[which(sapply(strsplit(u,","),length)>1)] + if(length(t1)==0) { + u <- paste(u,collapse = ' ') + return(u) + } + t_final <- gsub(',',' ',t1) + index_2 <- which(sapply(strsplit(u,","),length)>1) + u[index_2] <- t_final + u <- paste(u,collapse = ' ') + return(u) + } + v <- gsub(',',' ',w) +v <- gsub(' ','',v) +suppressWarnings(u[index[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]] <- v[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]) +u <- paste(u,collapse = ' ') +return(u) + +} #' @export @@ -333,6 +359,8 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h if(any(valence_shifters_dt[[1]] %in% polarity_dt[[1]])) { stop('`polarity_dt` & `valence_shifters_dt` not mutually exclusive') } + + if(neutral.nonverb.like) text.var <- run_preprocess(text.var) ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt` ## that have spaces From 1cbabe18d1a56ee10c3b1bc7ca7e498dd1225965 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 13:51:42 +0530 Subject: [PATCH 03/29] Update sentiment.R --- R/sentiment.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index cb7477d..e0cb38a 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -88,8 +88,7 @@ #' useful in literary works, where like is often used in non-verb form, than #' product comments. 
Use of this parameter will add compute time, this must be #' weighed against the need for accuracy and the likeliness that more accurate -#' results will come from setting this argument to \code{TRUE}. + removing commas before -#' valence shifters. +#' results will come from setting this argument to \code{TRUE}. + removing commas before valence shifters. #' @param missing_value A value to replace \code{NA}/\code{NaN} with. Use #' \code{NULL} to retain missing values. #' @param \ldots Ignored. From 54a51cbf2032dbb06eb7b34a4b718479d7fa409b Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 13:56:02 +0530 Subject: [PATCH 04/29] Update sentiment.Rd --- man/sentiment.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/sentiment.Rd b/man/sentiment.Rd index f2dec93..54e23f4 100644 --- a/man/sentiment.Rd +++ b/man/sentiment.Rd @@ -101,7 +101,7 @@ of where the text data comes from. For example, it is likely to be more useful in literary works, where like is often used in non-verb form, than product comments. Use of this parameter will add compute time, this must be weighed against the need for accuracy and the likeliness that more accurate -results will come from setting this argument to \code{TRUE}.} +results will come from setting this argument to \code{TRUE}. Currently also removing commas before valence shifters.} \item{missing_value}{A value to replace \code{NA}/\code{NaN} with. Use \code{NULL} to retain missing values.} From 331c57782d5202b7ec1999c7788670854994b4de Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 20:32:27 +0530 Subject: [PATCH 05/29] Update sentiment.R --- R/sentiment.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index e0cb38a..f4ab20d 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -358,8 +358,6 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h if(any(valence_shifters_dt[[1]] %in% polarity_dt[[1]])) { stop('`polarity_dt` & `valence_shifters_dt` not mutually exclusive') } - - if(neutral.nonverb.like) text.var <- run_preprocess(text.var) ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt` ## that have spaces @@ -371,6 +369,9 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) + + if(neutral.nonverb.like) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) + # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # # if (length(buts) > 0){ From d84c5df1e056f23755e72ce2ff245e305de8b488 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 5 Dec 2018 23:24:25 +0530 Subject: [PATCH 06/29] Update sentiment.R --- R/sentiment.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index f4ab20d..4983a9e 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -334,9 +334,13 @@ if(length(index)==0) u <- paste(u,collapse = ' ') return(u) } - v <- gsub(',',' ',w) -v <- gsub(' ','',v) + +v <- gsub(',','',w) + +suppressWarnings(if(v %in% hash_valence_shifters$x) u[index[which(v %in% hash_valence_shifters$x)]] <- v[which(v %in% hash_valence_shifters$x)]) + suppressWarnings(u[index[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]] <- v[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]) + u <- paste(u,collapse = ' ') return(u) From cb49912d2e69fcbb6d80c0982ce2779425c92183 Mon 
Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 12:56:01 +0530 Subject: [PATCH 07/29] Update NAMESPACE --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 02b2e36..8cc9d08 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -105,3 +105,4 @@ importFrom(textclean,replace_grade) importFrom(textclean,replace_internet_slang) importFrom(textclean,replace_rating) importFrom(textclean,replace_word_elongation) +importFrom(DescTools,"%like any%") From 95f540c203552e5c76b4da8bea50b58be65766bc Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 15:06:28 +0530 Subject: [PATCH 08/29] Update sentiment.R --- R/sentiment.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index 4983a9e..356eb55 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -11,6 +11,7 @@ #' a raw character vector though \code{get_sentences} is preferred as it avoids #' the repeated cost of doing sentence boundary disambiguation every time #' \code{sentiment} is run. +#' @param comma #' @param polarity_dt A \pkg{data.table} of positive/negative words and #' weights with x and y as column names. The \pkg{lexicon} package has several #' dictionaries that can be used, including: @@ -307,7 +308,7 @@ #' sentiment(Tweet, polarity_dt = combined_emoji) #' #' } -sentiment <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment <- function(text.var, comma = TRUE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -349,7 +350,7 @@ return(u) #' @export #' @method sentiment get_sentences_character -sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.get_sentences_character <- function(text.var, comma = FALSE ,polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -358,7 +359,7 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h cluster_tag <- w_neg <- neg <- A <- a <- D <- d <- wc <- id <- T_sum <- N <- . 
<- b <- before <- NULL - ## check to ake sure valence_shifters_dt polarity_dt are mutually exclusive + ## check to make sure valence_shifters_dt polarity_dt are mutually exclusive if(any(valence_shifters_dt[[1]] %in% polarity_dt[[1]])) { stop('`polarity_dt` & `valence_shifters_dt` not mutually exclusive') } @@ -374,7 +375,7 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) - if(neutral.nonverb.like) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) + if(comma) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # @@ -544,7 +545,7 @@ like_preverbs_regex <- paste0('\\b(', paste(like_preverbs, collapse = '|'), ')(\ #' @export #' @method sentiment character -sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.character <- function(text.var, comma = FALSE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -563,7 +564,7 @@ sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_ #' @export #' @method sentiment get_sentences_data_frame -sentiment.get_sentences_data_frame <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.get_sentences_data_frame <- function(text.var, comma = FALSE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ From 8a2f7c9d16f29bf33b6b3af2a2b85a2934954280 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 15:32:53 +0530 Subject: [PATCH 09/29] Update sentiment_by.R --- R/sentiment_by.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/sentiment_by.R b/R/sentiment_by.R index 37c165c..2759883 100644 --- a/R/sentiment_by.R +++ b/R/sentiment_by.R @@ -8,6 +8,7 @@ #' #' @param text.var The text variable. Also takes a \code{sentimentr} or #' \code{sentiment_by} object. +#' @param comma #' @param by The grouping variable(s). Default \code{NULL} uses the original #' row/element indices; if you used a column of 12 rows for \code{text.var} #' these 12 rows will be used as the grouping variable. Also takes a single From 53065fef0987423b803462d65532d611dae7b01a Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 15:48:07 +0530 Subject: [PATCH 10/29] Update sentiment_by.R --- R/sentiment_by.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/sentiment_by.R b/R/sentiment_by.R index 2759883..37c165c 100644 --- a/R/sentiment_by.R +++ b/R/sentiment_by.R @@ -8,7 +8,6 @@ #' #' @param text.var The text variable. Also takes a \code{sentimentr} or #' \code{sentiment_by} object. -#' @param comma #' @param by The grouping variable(s). Default \code{NULL} uses the original #' row/element indices; if you used a column of 12 rows for \code{text.var} #' these 12 rows will be used as the grouping variable. 
Also takes a single From 36fc4c66811e9c7026c5b2a281acf5733ada3103 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 15:48:29 +0530 Subject: [PATCH 11/29] Update sentiment.R --- R/sentiment.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index 356eb55..a28802d 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -11,7 +11,6 @@ #' a raw character vector though \code{get_sentences} is preferred as it avoids #' the repeated cost of doing sentence boundary disambiguation every time #' \code{sentiment} is run. -#' @param comma #' @param polarity_dt A \pkg{data.table} of positive/negative words and #' weights with x and y as column names. The \pkg{lexicon} package has several #' dictionaries that can be used, including: From 3a807acee4c0b50f601f099165bd82f15586eec1 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 15:56:03 +0530 Subject: [PATCH 12/29] Update sentiment.R --- R/sentiment.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index a28802d..0a52837 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -307,7 +307,7 @@ #' sentiment(Tweet, polarity_dt = combined_emoji) #' #' } -sentiment <- function(text.var, comma = TRUE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -349,7 +349,7 @@ return(u) #' @export #' @method sentiment get_sentences_character -sentiment.get_sentences_character <- function(text.var, comma = FALSE ,polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -544,7 +544,7 @@ like_preverbs_regex <- paste0('\\b(', paste(like_preverbs, collapse = '|'), ')(\ #' @export #' @method sentiment character -sentiment.character <- function(text.var, comma = FALSE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ @@ -563,7 +563,7 @@ sentiment.character <- function(text.var, comma = FALSE, polarity_dt = lexicon:: #' @export #' @method sentiment get_sentences_data_frame -sentiment.get_sentences_data_frame <- function(text.var, comma = FALSE, polarity_dt = lexicon::hash_sentiment_jockers_rinker, +sentiment.get_sentences_data_frame <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ From a7e75ba00f4f9466c0ef6aa45293d8d46c659aa8 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 16:34:31 +0530 Subject: [PATCH 13/29] Update sentiment.R --- R/sentiment.R | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index 0a52837..e53f5f7 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -374,7 +374,7 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) - if(comma) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) + if(neutral.nonverb.like) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # From a192b245cee26934fb7dcffb30ed6e0a2053c29e Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 16:39:15 +0530 Subject: [PATCH 14/29] Update NAMESPACE --- NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 8cc9d08..b0c9386 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -106,3 +106,5 @@ importFrom(textclean,replace_internet_slang) importFrom(textclean,replace_rating) importFrom(textclean,replace_word_elongation) importFrom(DescTools,"%like any%") +importFrom(lexicon,hash_valence_shifters) + From d56f5c86a2078b65496142608ce8e379c3ed783c Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 16:40:41 +0530 Subject: [PATCH 15/29] Update NAMESPACE --- NAMESPACE | 1 - 1 file changed, 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index b0c9386..47454fe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -106,5 +106,4 @@ importFrom(textclean,replace_internet_slang) importFrom(textclean,replace_rating) importFrom(textclean,replace_word_elongation) importFrom(DescTools,"%like any%") -importFrom(lexicon,hash_valence_shifters) From 613689282a27502731015d80540d179ba3e2d60a Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 16:41:40 +0530 Subject: [PATCH 16/29] Update sentiment.R --- R/sentiment.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index e53f5f7..4d34058 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -337,9 +337,9 @@ if(length(index)==0) v <- gsub(',','',w) -suppressWarnings(if(v %in% hash_valence_shifters$x) u[index[which(v %in% hash_valence_shifters$x)]] <- v[which(v %in% hash_valence_shifters$x)]) +suppressWarnings(if(v %in% lexicon::hash_valence_shifters$x) u[index[which(v %in% lexicon::hash_valence_shifters$x)]] <- v[which(v %in% lexicon::hash_valence_shifters$x)]) -suppressWarnings(u[index[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]] <- v[(u[index+1] %in% hash_valence_shifters$x) & !(u[index] %in% hash_valence_shifters$x)]) +suppressWarnings(u[index[(u[index+1] %in% lexicon::hash_valence_shifters$x) & !(u[index] %in% lexicon::hash_valence_shifters$x)]] <- v[(u[index+1] %in% lexicon::hash_valence_shifters$x) & !(u[index] %in% lexicon::hash_valence_shifters$x)]) u <- paste(u,collapse = ' ') return(u) From 261cf0dd9d50f62992a7b6e538c1c8249bad9b02 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Tue, 11 Dec 2018 16:58:07 +0530 Subject: [PATCH 17/29] Update sentiment.R Changed variable names and added comments --- R/sentiment.R | 57 +++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index 4d34058..f82f1ed 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -317,36 +317,39 @@ sentiment <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_ri } run_preprocess <- function(sentence) { -sentence <- as.character(sentence) -u <- 
unlist(strsplit(sentence,split = ' ')) -w <- u[u %like any% c(",%","%,")] -index <- which(u %like any% c(",%","%,")) -if(length(index)==0) -{ - t1 <- u[which(sapply(strsplit(u,","),length)>1)] - if(length(t1)==0) { - u <- paste(u,collapse = ' ') - return(u) + sentence <- as.character(sentence) # Parsing to character + split_spaces <- unlist(strsplit(sentence,split = ' ')) # Splitting the sentence on spaces to get all words in the sentence. + comma_words <- split_spaces[split_spaces %like any% c(",%","%,")] # Store all words that either end with a comma or begin with a comma. + index <- which(split_spaces %like any% c(",%","%,")) # Find indices of all those words from above. + # If there is no word with commas, there is an edge case that the comma could be between the two words without spaces. Checking for that in this condition. + if(length(index)==0) + { + t1 <- split_spaces[which(sapply(strsplit(split_spaces,","),length)>1)] + # If splitting by commas breaks any one word into two, the condition was true and we replace that comma with a space. If not, we return the sentence as is. + if(length(t1)==0) { + split_spaces <- paste(split_spaces,collapse = ' ') + return(split_spaces) } - t_final <- gsub(',',' ',t1) - index_2 <- which(sapply(strsplit(u,","),length)>1) - u[index_2] <- t_final - u <- paste(u,collapse = ' ') - return(u) + t_final <- gsub(',',' ',t1) #Replace comma between two words by space. + index_2 <- which(sapply(strsplit(split_spaces,","),length)>1) + split_spaces[index_2] <- t_final + split_spaces <- paste(split_spaces,collapse = ' ') + return(split_spaces) } - -v <- gsub(',','',w) - -suppressWarnings(if(v %in% lexicon::hash_valence_shifters$x) u[index[which(v %in% lexicon::hash_valence_shifters$x)]] <- v[which(v %in% lexicon::hash_valence_shifters$x)]) - -suppressWarnings(u[index[(u[index+1] %in% lexicon::hash_valence_shifters$x) & !(u[index] %in% lexicon::hash_valence_shifters$x)]] <- v[(u[index+1] %in% lexicon::hash_valence_shifters$x) & !(u[index] %in% lexicon::hash_valence_shifters$x)]) - -u <- paste(u,collapse = ' ') -return(u) - + + replaced_words <- gsub(',','',comma_words) + + # If the word is a valence shifter, put it back into the original sentence. + suppressWarnings(if(replaced_words %in% hash_valence_shifters$x) split_spaces[index[which(replaced_words %in% hash_valence_shifters$x)]] <- replaced_words[which(replaced_words %in% hash_valence_shifters$x)]) + + # If the word after is a valence shifter, replace into original sentence and return. 
+ suppressWarnings(split_spaces[index[(split_spaces[index+1] %in% hash_valence_shifters$x) & !(split_spaces[index] %in% hash_valence_shifters$x)]] <- replaced_words[(split_spaces[index+1] %in% hash_valence_shifters$x) & !(split_spaces[index] %in% hash_valence_shifters$x)]) + + split_spaces <- paste(split_spaces,collapse = ' ') + return(split_spaces) + } - - + #' @export #' @method sentiment get_sentences_character sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, From 1623621297e41d81927826d188b686d695ff25c9 Mon Sep 17 00:00:00 2001 From: aainasingh <45811218+aainasingh@users.noreply.github.com> Date: Wed, 12 Dec 2018 12:10:10 +0530 Subject: [PATCH 18/29] Update sentiment.R --- R/sentiment.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index f82f1ed..1c59670 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -88,9 +88,10 @@ #' useful in literary works, where like is often used in non-verb form, than #' product comments. Use of this parameter will add compute time, this must be #' weighed against the need for accuracy and the likeliness that more accurate -#' results will come from setting this argument to \code{TRUE}. + removing commas before valence shifters. +#' results will come from setting this argument to \code{TRUE}. #' @param missing_value A value to replace \code{NA}/\code{NaN} with. Use #' \code{NULL} to retain missing values. +#' @param comma_handler logical. If \code{TRUE}, removes commas before valence shifters. #' @param \ldots Ignored. #' @return Returns a \pkg{data.table} of: #' \itemize{ @@ -310,7 +311,7 @@ sentiment <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, - adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ + adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, comma_handler = FALSE, ...){ UseMethod('sentiment') @@ -355,7 +356,7 @@ run_preprocess <- function(sentence) { sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, - adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ + adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, comma_handler = FALSE, ...){ sentences <- id2 <- pol_loc <- comma_loc <- P <- non_pol <- lens <- cluster_tag <- w_neg <- neg <- A <- a <- D <- d <- wc <- id <- @@ -550,7 +551,7 @@ like_preverbs_regex <- paste0('\\b(', paste(like_preverbs, collapse = '|'), ')(\ sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, - adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ + adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, comma_handler = FALSE, ...){ split_warn(text.var, 'sentiment', ...) 
@@ -560,7 +561,7 @@ sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_ amplifier.weight = amplifier.weight, n.before = n.before, n.after = n.after, question.weight = question.weight, adversative.weight = adversative.weight, missing_value = missing_value, - neutral.nonverb.like = neutral.nonverb.like, c(';', ':', ','), ...) + neutral.nonverb.like = neutral.nonverb.like, c(';', ':', ','), comma_handler = comma_handler, ...) } @@ -569,7 +570,7 @@ sentiment.character <- function(text.var, polarity_dt = lexicon::hash_sentiment_ sentiment.get_sentences_data_frame <- function(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = .8, n.before = 5, n.after = 2, question.weight = 1, - adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, ...){ + adversative.weight = .25, neutral.nonverb.like = FALSE, missing_value = 0, comma_handler = FALSE, ...){ x <- make_class(text.var[[attributes(text.var)[['text.var']]]], "get_sentences", "get_sentences_character") From 0873e8f38375cf44216e0b24d7e2786834e0b567 Mon Sep 17 00:00:00 2001 From: aainasingh <45811218+aainasingh@users.noreply.github.com> Date: Wed, 12 Dec 2018 12:11:20 +0530 Subject: [PATCH 19/29] Update sentiment.Rd --- man/sentiment.Rd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/sentiment.Rd b/man/sentiment.Rd index 54e23f4..394d865 100644 --- a/man/sentiment.Rd +++ b/man/sentiment.Rd @@ -8,7 +8,7 @@ sentiment(text.var, polarity_dt = lexicon::hash_sentiment_jockers_rinker, valence_shifters_dt = lexicon::hash_valence_shifters, hyphen = "", amplifier.weight = 0.8, n.before = 5, n.after = 2, question.weight = 1, adversative.weight = 0.25, - neutral.nonverb.like = FALSE, missing_value = 0, ...) + neutral.nonverb.like = FALSE, missing_value = 0, comma_handler = FALSE, ...) } \arguments{ \item{text.var}{The text variable. Can be a \code{get_sentences} object or @@ -106,6 +106,9 @@ results will come from setting this argument to \code{TRUE}. Currently also remo \item{missing_value}{A value to replace \code{NA}/\code{NaN} with. 
Use \code{NULL} to retain missing values.} +\item{comma_handler}{If \code{TRUE}, removes commas before valence shifters.} + + \item{\ldots}{Ignored.} } \value{ From 6541279fc933dffafae3797e4430acf894eee90f Mon Sep 17 00:00:00 2001 From: aainasingh <45811218+aainasingh@users.noreply.github.com> Date: Wed, 12 Dec 2018 12:15:46 +0530 Subject: [PATCH 20/29] Update sentiment.R --- R/sentiment.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index 1c59670..e5d038c 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -378,8 +378,7 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) - if(neutral.nonverb.like) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) - +print(comma_handler) # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # # if (length(buts) > 0){ From 8c30b40349baa057b7c98acbff8bd3869659b8c9 Mon Sep 17 00:00:00 2001 From: aainasingh <45811218+aainasingh@users.noreply.github.com> Date: Wed, 12 Dec 2018 12:50:41 +0530 Subject: [PATCH 21/29] Update sentiment.R --- R/sentiment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index e5d038c..3e1b868 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -378,7 +378,7 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) -print(comma_handler) + if(comma_handler) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # # if (length(buts) > 0){ From 5995a98894d9e0b1f3b49fea54783111c36253a8 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 12 Dec 2018 15:11:35 +0530 Subject: [PATCH 22/29] Update sentiment.Rd --- man/sentiment.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/sentiment.Rd b/man/sentiment.Rd index 394d865..791ec18 100644 --- a/man/sentiment.Rd +++ b/man/sentiment.Rd @@ -101,7 +101,7 @@ of where the text data comes from. For example, it is likely to be more useful in literary works, where like is often used in non-verb form, than product comments. Use of this parameter will add compute time, this must be weighed against the need for accuracy and the likeliness that more accurate -results will come from setting this argument to \code{TRUE}. Currently also removing commas before valence shifters.} +results will come from setting this argument to \code{TRUE}.} \item{missing_value}{A value to replace \code{NA}/\code{NaN} with. Use \code{NULL} to retain missing values.} From ed83ba35b7c309273e58117dc0db081b024c6927 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Wed, 12 Dec 2018 15:24:10 +0530 Subject: [PATCH 23/29] Global - valence_shifters_dt --- R/sentiment.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index 3e1b868..b1dd9ed 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -341,10 +341,10 @@ run_preprocess <- function(sentence) { replaced_words <- gsub(',','',comma_words) # If the word is a valence shifter, put it back into the original sentence. 
- suppressWarnings(if(replaced_words %in% hash_valence_shifters$x) split_spaces[index[which(replaced_words %in% hash_valence_shifters$x)]] <- replaced_words[which(replaced_words %in% hash_valence_shifters$x)]) + suppressWarnings(if(replaced_words %in% lexicon::hash_valence_shifters$x) split_spaces[index[which(replaced_words %in% lexicon::hash_valence_shifters$x)]] <- replaced_words[which(replaced_words %in% lexicon::hash_valence_shifters$x)]) # If the word after is a valence shifter, replace into original sentence and return. - suppressWarnings(split_spaces[index[(split_spaces[index+1] %in% hash_valence_shifters$x) & !(split_spaces[index] %in% hash_valence_shifters$x)]] <- replaced_words[(split_spaces[index+1] %in% hash_valence_shifters$x) & !(split_spaces[index] %in% hash_valence_shifters$x)]) + suppressWarnings(split_spaces[index[(split_spaces[index+1] %in% lexicon::hash_valence_shifters$x) & !(split_spaces[index] %in% lexicon::hash_valence_shifters$x)]] <- replaced_words[(split_spaces[index+1] %in% lexicon::hash_valence_shifters$x) & !(split_spaces[index] %in% lexicon::hash_valence_shifters$x)]) split_spaces <- paste(split_spaces,collapse = ' ') return(split_spaces) From 38cc674f4bc917d94570e8b0dcff103487ab02b1 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Sat, 12 Jan 2019 22:16:58 +0530 Subject: [PATCH 24/29] Updated Conditions to Preprocess Added domain specific [dialog oriented domains] conditions to skip preprocessing of commas and use original algorithm. --- R/sentiment.R | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index b1dd9ed..5b82143 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -350,6 +350,35 @@ run_preprocess <- function(sentence) { return(split_spaces) } +# Check whether sentence is question tag. +is_question_tag <- function(text){ + splitted <- str_split((stringi::stri_extract_first(text, regex="[A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* ?\\?\\s*")),' ') + unlisted <- lapply(splitted, function (x) gsub("[',?]",'',x)) + log <- lapply(unlisted,function (x) any(x %in% lexicon::hash_valence_shifters$x[lexicon::hash_valence_shifters$y==1])) + return(unlist(log)) +} +# Check whether negators are used to emphasise "lack of something" rather than negating. 
+is_negator_adv_condition <- function(cond_2) { + cond_2 <- tolower(cond_2) + unlisted <- unlist(stringr::str_split(cond_2,pattern=' ')) + index_t <- which(unlisted %like any% c("%,","%;")) + unlisted <- gsub("[?;.!,]",'',unlisted) + negators <- unlist(lexicon::hash_valence_shifters[lexicon::hash_valence_shifters$y==1,1]) + adv_conj <- unlist(lexicon::hash_valence_shifters[lexicon::hash_valence_shifters$y==4,1]) + if(is.null(index) || length(index) == 0 || is.na(index)) return(F) + for(index in index_t){ + before_l <- index-2 + if(before_l < 0) before_l <- 0 + before <- unlisted[before_l:index] + after_l <- index+1 + after_r <- index+2 + after <- unlisted[after_l:after_r] + after <- after[which(!is.na(after))] + if(any(before %in% negators) && (any(after %in% negators) || any(after %in% adv_conj))) return(T) + } + return(F) +} + #' @export #' @method sentiment get_sentences_character @@ -378,7 +407,10 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h # break rows into count words sent_dat <- make_sentence_df2(sents) - if(comma_handler) sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) + if(comma_handler) { + indices_to_skip <- c(which(unlist(lapply(sent_dat$sentences,is_negator_adv_condition))),which(unlist(lapply(sent_dat$sentences,is_question_tag)))) + sent_dat$sentences[-(indices_to_skip)] <- unlist(lapply(sent_dat$sentences[-(indices_to_skip)], run_preprocess)) + } # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # # if (length(buts) > 0){ From 29acb768ff580b662b32d257398f61bc881962b7 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Sat, 12 Jan 2019 22:45:15 +0530 Subject: [PATCH 25/29] Referenced globally - stringr::str_split --- R/sentiment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sentiment.R b/R/sentiment.R index 5b82143..8a63d1d 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -352,7 +352,7 @@ run_preprocess <- function(sentence) { } # Check whether sentence is question tag. 
is_question_tag <- function(text){ - splitted <- str_split((stringi::stri_extract_first(text, regex="[A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* ?\\?\\s*")),' ') + splitted <- stringr::str_split((stringi::stri_extract_first(text, regex="[A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* [A-Za-z'a-zA-Z,]* ?\\?\\s*")),' ') unlisted <- lapply(splitted, function (x) gsub("[',?]",'',x)) log <- lapply(unlisted,function (x) any(x %in% lexicon::hash_valence_shifters$x[lexicon::hash_valence_shifters$y==1])) return(unlist(log)) From c632a475540af0fbf6bfb4c36bf7aec6a26c2641 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Sat, 12 Jan 2019 22:50:42 +0530 Subject: [PATCH 26/29] Update NAMESPACE --- NAMESPACE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 47454fe..9fd7a15 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -106,4 +106,4 @@ importFrom(textclean,replace_internet_slang) importFrom(textclean,replace_rating) importFrom(textclean,replace_word_elongation) importFrom(DescTools,"%like any%") - +importFrom(stringr,str_split) From 70347da4395da3fb2d98b45949b2a3c0caf13857 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Sat, 12 Jan 2019 23:38:35 +0530 Subject: [PATCH 27/29] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6c11b1b..78f153d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: Calculate text polarity sentiment at the sentence level and Depends: R (>= 3.4.0) Suggests: testthat Imports: data.table, ggplot2, graphics, grid, lexicon (>= 1.2.1), methods, - stats, stringi, syuzhet, textclean (>= 0.6.1), textshape (>= 1.3.0), + stats, stringi, stringr, syuzhet, textclean (>= 0.6.1), textshape (>= 1.3.0), utils, DescTools License: MIT + file LICENSE LazyData: TRUE From 27201480c79f710c7d05de673e0963d90f718a7c Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Sun, 13 Jan 2019 00:11:52 +0530 Subject: [PATCH 28/29] Updating variable names for readability --- R/sentiment.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index 8a63d1d..a474ef0 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -358,9 +358,9 @@ is_question_tag <- function(text){ return(unlist(log)) } # Check whether negators are used to emphasise "lack of something" rather than negating. 
-is_negator_adv_condition <- function(cond_2) { - cond_2 <- tolower(cond_2) - unlisted <- unlist(stringr::str_split(cond_2,pattern=' ')) +is_negator_adv_condition <- function(text) { + text <- tolower(text) + unlisted <- unlist(stringr::str_split(text,pattern=' ')) index_t <- which(unlisted %like any% c("%,","%;")) unlisted <- gsub("[?;.!,]",'',unlisted) negators <- unlist(lexicon::hash_valence_shifters[lexicon::hash_valence_shifters$y==1,1]) From e50b0d19c30bb8d0c6b0ecf5b224574088400c02 Mon Sep 17 00:00:00 2001 From: Shantanu Kumar Date: Mon, 14 Jan 2019 00:05:45 +0530 Subject: [PATCH 29/29] Updated condition for length=0 --- R/sentiment.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/R/sentiment.R b/R/sentiment.R index a474ef0..f209acb 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -408,9 +408,14 @@ sentiment.get_sentences_character <- function(text.var, polarity_dt = lexicon::h sent_dat <- make_sentence_df2(sents) if(comma_handler) { - indices_to_skip <- c(which(unlist(lapply(sent_dat$sentences,is_negator_adv_condition))),which(unlist(lapply(sent_dat$sentences,is_question_tag)))) - sent_dat$sentences[-(indices_to_skip)] <- unlist(lapply(sent_dat$sentences[-(indices_to_skip)], run_preprocess)) - } + + indices_to_skip <- c(which(unlist(lapply(sent_dat$sentences,is_negator_adv_condition))),which(unlist(lapply(sent_dat$sentences,is_question_tag)))) + if(length(indices_to_skip)!=0) { + sent_dat$sentences[-(indices_to_skip)] <- unlist(lapply(sent_dat$sentences[-(indices_to_skip)], run_preprocess)) + } + else sent_dat$sentences <- unlist(lapply(sent_dat$sentences, run_preprocess)) + + } # buts <- valence_shifters_dt[valence_shifters_dt[[2]] == 4,][['x']] # # if (length(buts) > 0){
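## A minimal sketch of the `%like any%` lookup that run_preprocess() relies on
## (DescTools is added to Imports/NAMESPACE in this series only for this
## operator).  Assumes DescTools' SQL-style `%` wildcards; the tokens below are
## illustrative, not taken from the package tests.
library(DescTools)

tokens <- c("good,", "but", ",slow")
tokens %like any% c(",%", "%,")          # token starts or ends with a comma?
#> [1]  TRUE FALSE  TRUE
which(tokens %like any% c(",%", "%,"))   # the positions run_preprocess() keeps in `index`
#> [1] 1 3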
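## A standalone sketch of the comma pre-processing these patches introduce:
## drop a trailing comma when the next word is a valence shifter, so the
## shifter stays in the same comma-delimited cluster as the polarized word.
## This is a simplified re-creation for illustration only -- the fork's
## run_preprocess() also handles leading commas and commas glued between two
## words -- and it assumes only base R plus the lexicon package.
strip_comma_before_shifter <- function(sentence) {
    shifters <- lexicon::hash_valence_shifters$x
    tokens   <- strsplit(sentence, " ", fixed = TRUE)[[1]]
    clean    <- tolower(gsub(",", "", tokens))        # tokens with commas stripped
    nxt      <- c(clean[-1], NA_character_)           # word following each token
    # Remove the comma only when the following word is a valence shifter and
    # the comma-bearing word is not itself a shifter.
    drop <- grepl(",$", tokens) & !is.na(nxt) & nxt %in% shifters & !(clean %in% shifters)
    tokens[drop] <- sub(",$", "", tokens[drop])
    paste(tokens, collapse = " ")
}

strip_comma_before_shifter("The food was good, but the service was slow.")
#> [1] "The food was good but the service was slow."

## With this patch series applied, the same behaviour is switched on through the
## new `comma_handler` argument (fork-only; not part of CRAN sentimentr):
# library(sentimentr)
# sentiment(get_sentences("The food was good, but the service was slow."),
#           comma_handler = TRUE)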
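## Patch 24 skips the comma pre-processing for question tags ("..., was it not?")
## and for "lack of ..." constructions, where the comma and negator should be
## left alone.  A simplified sketch of the question-tag guard, assuming only
## base R plus the lexicon package; the fork's is_question_tag() instead runs a
## stringi/stringr regex over the last three words before the "?".
is_tag_question <- function(sentence) {
    negators <- lexicon::hash_valence_shifters$x[lexicon::hash_valence_shifters$y == 1]
    # grab the clause that ends in a terminal question mark, if any
    tail_txt <- regmatches(sentence, regexpr("[^.!?]*\\?\\s*$", sentence))
    if (length(tail_txt) == 0) return(FALSE)
    words <- tolower(gsub("[',?]", "", strsplit(tail_txt, " ", fixed = TRUE)[[1]]))
    any(utils::tail(words, 3) %in% negators)   # negator among the last three words?
}

is_tag_question("It was fine, was it not?")             # TRUE  -> keep the comma
#> [1] TRUE
is_tag_question("The food was good, but a bit slow.")   # FALSE -> comma may be stripped
#> [1] FALSE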