Comments (5)
I attempted via regex extraction...slow. Use tokens approach instead like the first part of the sentiment
function.
Slow version:
#' Extract Sentiment Words
#'
#' Extract the sentiment words from a text.
#'
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param split logical. If \code{TRUE} sentences will be split apart,
#' resulting in more rows than the original data set.
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and
#' negative terms.
#' @export
#' @examples
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table, split = TRUE, ...){
ord <- c("element_id", "sentence_id", "positive_terms", "negative_terms", "text")
if (isTRUE(split)){
sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
dat <- make_sentence_df2(sents)[, 'wc' := NULL][, 'sentence_id' := 1:.N, by = 'id'][]
data.table::setcolorder(dat, c('id','sentence_id', 'sentences'))
data.table::setnames(dat, c('id', "sentences"), c('element_id', "text"))
vars <- c('element_id', 'sentence_id', 'text', 'polarity')
} else {
dat <- data.table::data.table(text=text.var)[, 'element_id' := 1:.N][]
data.table::setcolorder(dat, c('element_id', 'text'))
ord <- ord[-2]
vars <- c('element_id', 'text', 'polarity')
}
pos <- polarity_dt[y > 0]
neg <- polarity_dt[y < 0]
pos_regex <- bor(pos[, 'x', with = FALSE][[1]])
neg_regex <- bor(neg[, 'x', with = FALSE][[1]])
out <- dat[,
'positive_terms' := exlow(text, pos_regex)][,
'negative_terms' := exlow(text, neg_regex)][
]
data.table::setcolorder(out, ord)
cnts <- out[, c('positive_terms', 'negative_terms'), with=FALSE]
posw <- na.omit(unlist(cnts[[1]]))
negw <- na.omit(unlist(cnts[[2]]))
class(out) <- c('extract_sentiment_terms', class(out))
# attributes(out)[['counts']] <- rbind(
# data.table::data.table(word = posw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'positive'][],
# data.table::data.table(word = negw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'negative'][]
# )
attributes(out)[['words']] <- na.omit(melt(out, measure.vars = c('positive_terms', 'negative_terms'),
variable.name = "polarity", value.name = "word")[,
list(word = unlist(word)), by = vars])
attributes(out)[['counts']] <- dcast.data.table(attributes(out)[['words']][, 'word', with=FALSE], word ~ ., length, value.var = 'word')[order(., decreasing = TRUE),]
setnames(attributes(out)[['counts']], ".", "n")
out
}
exlow <- function(x, y) stringi::stri_extract_all_regex(tolower(x), y)
bor <- function(x) paste0("(?i)\\b(", paste(x, collapse = "|"), ")\\b")
set.seed(10)
x <- sample(cannon_reviews[[3]], 597)
sentiment(x)
extract_sentiment_terms(x)
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words
get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2
from sentimentr.
Second round faster. Needs some summary and plotting techniques:
valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words
#' Extract Sentiment Words
#'
#' Extract the sentiment words from a text.
#'
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with. Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'. Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and
#' negative terms.
#' @export
#' @examples
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table,
hyphen = "", ...){
sentences <- sentence_id <- P <- words <- N <- . <- NULL
## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
## that have spaces
posneg <- polarity_dt[[1]]
words <- posneg
space_words <- words[grep("\\s", words)]
# break rows into sentences, count words
# space fill (~~), break into words
sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
sent_dat <- make_sentence_df2(sents)
sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]
# make sentence id for each row id
sent_dat[, sentence_id:=seq_len(.N), by='id']
## Make the data frame long by stretching out words in sentences
word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]
## 1. add polarity word potential locations (seq along) and the
## value for polarized word
## 2. add comma locations
word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
word_dat[, P := ifelse(is.na(P), 0, P)][]
out <- word_dat[!words %in% c(',', ''), ]
data.table::setnames(out, 'id', 'element_id')
class(out) <- unique(c("sentiment", class(out)))
attributes(out)[["counts"]] <- out[, list(n = .N), by = c("P", "words")][order(-P, -n), ]
attributes(out)[["words"]] <- data.table::dcast(out, element_id + sentence_id ~ P,
list, value.var = 'words')[, sentence := unlist(sents)][]
out
}
p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)
extract_sentiment_terms(x)$words
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words
from sentimentr.
valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words
#' Extract Sentiment Words
#'
#' Extract the sentiment words from a text.
#'
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with. Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'. Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and
#' negative terms.
#' @export
#' @examples
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table,
hyphen = "", ...){
sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL
## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
## that have spaces
posneg <- polarity_dt[[1]]
words <- posneg
space_words <- words[grep("\\s", words)]
# break rows into sentences, count words
# space fill (~~), break into words
sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
sent_dat <- make_sentence_df2(sents)
sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]
# make sentence id for each row id
sent_dat[, sentence_id:=seq_len(.N), by='id']
## Make the data frame long by stretching out words in sentences
word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]
## 1. add polarity word potential locations (seq along) and the
## value for polarized word
## 2. add comma locations
word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
word_dat[, P := ifelse(is.na(P), 0, P)][]
out <- word_dat[!words %in% c(',', ''), ]
data.table::setnames(out, c('id', 'P'), c('element_id', 'polarity'))
class(out) <- unique(c("extract_sentiment_terms", class(out)))
attributes(out)[["counts"]] <- out[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
attributes(out)[["words"]] <- data.table::dcast(
out[, list(senti = ifelse(polarity == 0, 'neutral',
ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')],
element_id + sentence_id ~ senti,
list, value.var = 'words'
)[, sentence := unlist(sents)][]
data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))
out
}
p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)
extract_sentiment_terms(x)
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words
plot.extract_sentiment_terms <- function(x, ...){
pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity',
y = 'id', size='n', color = 'polarity')) +
ggplot2::geom_text() +
ggplot2::scale_color_gradient2()
}
plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red',
neutral = 'grey70', min.freq = 1, random.order = FALSE, include.neutral = TRUE, ...){
pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral',
ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral,
ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
if (!include.neutral) pdat <- pdat[senti != 'neutral', ]
wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq,
colors = pdat[['color']], ordered.colors = TRUE, random.color = FALSE,
random.order = random.order, ...
)
}
plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = FALSE)
from sentimentr.
valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words
get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2
#plot requires adding wordcloud to dependencies. Not sure if I'm willing to got this far.
## helper to add to utils
rm_class <- function (x, remove, ...) {
class(x) <- class(x)[!class(x) %in% remove]
x
}
#' Extract Sentiment Words
#'
#' Extract the sentiment words from a text.
#'
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with. Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'. Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and
#' negative terms.
#' @export
#' @importFrom data.table .N :=
#' @examples
#' p_load(sentimentr, data.table)
#' set.seed(10)
#' x <- sample(cannon_reviews[[3]], 3000, T)
#' sentiment(x)
#'
#' pol_words <- extract_sentiment_terms(x)
#' pol_words
#' pol_words$sentence
#' pol_words$neutral
#' data.table::as.data.table(pol_words)
#'
#'
#' attributes(extract_sentiment_terms(x))$counts
#' attributes(extract_sentiment_terms(x))$elements
#'
#' plot(extract_sentiment_terms(x))
#' plot(extract_sentiment_terms(x), include.neutral = TRUE)
#' plot(extract_sentiment_terms(x), separate = TRUE)
#' plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE)
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table,
hyphen = "", ...){
sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL
## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
## that have spaces
posneg <- polarity_dt[[1]]
words <- posneg
space_words <- words[grep("\\s", words)]
# break rows into sentences, count words
# space fill (~~), break into words
sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
sent_dat <- make_sentence_df2(sents)
sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]
# make sentence id for each row id
sent_dat[, sentence_id:=seq_len(.N), by='id']
## Make the data frame long by stretching out words in sentences
word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]
## add polarity word potential locations (seq along) and the
## value for polarized word
word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
word_dat[, P := ifelse(is.na(P), 0, P)][]
out_prime <- word_dat[!words %in% c(',', ''), ]
data.table::setnames(out_prime, c('id', 'P'), c('element_id', 'polarity'))
out <- data.table::dcast(
out_prime[, list(senti = ifelse(polarity == 0, 'neutral',
ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')],
element_id + sentence_id ~ senti,
list, value.var = 'words'
)[, sentence := unlist(sents)][]
class(out) <- unique(c("extract_sentiment_terms", class(out)))
attributes(out)[["counts"]] <- out_prime[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))
attributes(out)[["elements"]] <- out_prime
out
}
#' Prints an extract_sentiment_terms Object
#'
#' Prints an extract_sentiment_terms object
#'
#' @param x
#' @param \ldots ignored
#' @method print extract_sentiment_terms
#' @export
print.extract_sentiment_terms <- function(x, ...){
print(rm_class(x, 'extract_sentiment_terms')[, !c('neutral', "sentence"), with=FALSE] )
}
#' Plots an extract_sentiment_terms Object
#'
#' Plots an extract_sentiment_terms object
#'
#' @param x An extract_sentiment_terms object.
#' @param negative The color to use for negative terms.
#' @param positive The color to use for poisitive terms.
#' @param neutral The color to use for neutral terms.
#' @param min.freq Words with frequency below min.freq will not be plotted.
#' @param include.neutral Should neutral terms be plotted?
#' @param separate logical. If \code{TRUE} positive, negative, and neutral each get a wordcloud.
#' @param \ldots Other arguments passed to \code{\link[wordcloud]{wordcloud}}.
#' @method plot plot.extract_sentiment_terms
#' @export
plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red',
neutral = 'grey70', min.freq = 1, include.neutral = FALSE, separate = FALSE, ...){
pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral',
ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral,
ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
if (!include.neutral) pdat <- pdat[senti != 'neutral', ]
if (isTRUE(separate)){
par(mfrow = c(2, 2))
pdat2 <- split(pdat, pdat[['senti']])
for(i in seq_along(pdat2)){
wordcloud::wordcloud(pdat2[[i]][['words']], pdat2[[i]][['n']], min.freq = min.freq,
colors = pdat2[[i]][['color']], ordered.colors = TRUE, random.color = FALSE,
random.order = FALSE, ...
)
}
} else {
wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq,
colors = pdat[['color']], ordered.colors = TRUE, random.color = FALSE,
random.order = FALSE, ...
)
}
}
p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)
pol_words <- extract_sentiment_terms(x)
pol_words
pol_words$sentence
pol_words$neutral
data.table::as.data.table(pol_words)
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$elements
plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = TRUE)
plot(extract_sentiment_terms(x), separate = TRUE)
plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE)
## Not used
## plot.extract_sentiment_terms <- function(x, ...){
## pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
##
## ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity',
## y = 'id', size='n', color = 'polarity')) +
## ggplot2::geom_text() +
## ggplot2::scale_color_gradient2()
##
## }
from sentimentr.
Added everything except plotting
from sentimentr.
Related Issues (20)
- Sentiment Score Seems Incorrect HOT 1
- Some words in Hu&Liu dict have off polarity values HOT 1
- Any hint why the two emoji approaches are different and in what circumstance which one is better?! HOT 1
- Amplifiers
- polarity_dt requires words with spaces to work HOT 1
- Sentimentr split words when there is an accented character HOT 5
- Totally different results in Qdap polarity and sentiment r HOT 4
- Changing polarity of words in the dictionary HOT 2
- Question about 'highlight' HOT 1
- how to calculate sum of sentiment? HOT 1
- Question about how the dictionary was built HOT 1
- .mygsub is slow HOT 8
- Transparency for lexicon::hash_sentiment_jockers_rinker HOT 1
- What is the theoretical range for the "sentiment" variable? HOT 1
- I wonder why this sentence is recognized as positive HOT 2
- Polarity categorization? HOT 1
- relabel "black" and "white" as neutral sentiments (black is currently negative; white is labeled positive)
- Overall positive and negative scores given a piece of text
- Positive and Negative Sentiment based on German Language text HOT 1
- how to get pos, neg, neu other than ave_sentiment ?
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from sentimentr.