A function that takes a dictionary and text and extracts the positive, negative terms

<div class="snippet-clipboard-content notranslate position-relative overflow-auto" data-snippet-clip

add extract_sentiment_terms about sentimentr HOT 5 CLOSED

trinker commented on August 18, 2024

add extract_sentiment_terms

from sentimentr.

Comments (5)

trinker commented on August 18, 2024

I attempted via regex extraction...slow. Use tokens approach instead like the first part of the sentiment function.

Slow version:

#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param split logical.  If \code{TRUE} sentences will be split apart, 
#' resulting in more rows than the original data set.
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table, split = TRUE, ...){

    ord <- c("element_id", "sentence_id", "positive_terms", "negative_terms", "text")

    if (isTRUE(split)){

        sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
        dat <- make_sentence_df2(sents)[, 'wc' := NULL][, 'sentence_id' := 1:.N, by = 'id'][]
        data.table::setcolorder(dat, c('id','sentence_id', 'sentences'))
        data.table::setnames(dat, c('id', "sentences"), c('element_id', "text"))
        vars <- c('element_id', 'sentence_id', 'text', 'polarity')

    } else {

        dat <- data.table::data.table(text=text.var)[, 'element_id' := 1:.N][]
        data.table::setcolorder(dat, c('element_id', 'text'))
        ord <- ord[-2]
        vars <- c('element_id', 'text', 'polarity')
    }


    pos <- polarity_dt[y > 0]    
    neg <- polarity_dt[y < 0]

    pos_regex <- bor(pos[, 'x', with = FALSE][[1]])
    neg_regex <- bor(neg[, 'x', with = FALSE][[1]])

    out <- dat[, 
        'positive_terms' := exlow(text, pos_regex)][, 
        'negative_terms' := exlow(text, neg_regex)][
    ]


    data.table::setcolorder(out, ord)


    cnts <- out[, c('positive_terms', 'negative_terms'), with=FALSE]
    posw <- na.omit(unlist(cnts[[1]]))
    negw <- na.omit(unlist(cnts[[2]]))

    class(out) <- c('extract_sentiment_terms', class(out))  

#    attributes(out)[['counts']] <- rbind(
#        data.table::data.table(word = posw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'positive'][],
#        data.table::data.table(word = negw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'negative'][]
#    )

    attributes(out)[['words']] <- na.omit(melt(out, measure.vars = c('positive_terms', 'negative_terms'),
        variable.name = "polarity", value.name = "word")[, 
        list(word = unlist(word)), by = vars])


    attributes(out)[['counts']] <- dcast.data.table(attributes(out)[['words']][, 'word', with=FALSE], word ~ ., length, value.var = 'word')[order(., decreasing = TRUE),]
    setnames(attributes(out)[['counts']], ".", "n")

    out
}

exlow <- function(x, y) stringi::stri_extract_all_regex(tolower(x), y)
bor <- function(x) paste0("(?i)\\b(", paste(x, collapse = "|"), ")\\b")


set.seed(10)
x <- sample(cannon_reviews[[3]], 597)
sentiment(x)

extract_sentiment_terms(x)
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words




get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2

from sentimentr.

trinker commented on August 18, 2024

Second round faster. Needs some summary and plotting techniques:

valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words



#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## 1. add polarity word potential locations (seq along) and the
    ##    value for polarized word
    ## 2. add comma locations

    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out, 'id', 'element_id')

    class(out) <- unique(c("sentiment", class(out)))
    attributes(out)[["counts"]] <- out[, list(n = .N), by = c("P", "words")][order(-P, -n), ]
    attributes(out)[["words"]] <- data.table::dcast(out, element_id + sentence_id ~ P, 
        list, value.var = 'words')[, sentence := unlist(sents)][]

    out
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

extract_sentiment_terms(x)$words

attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words

from sentimentr.

trinker commented on August 18, 2024

valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words



#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## 1. add polarity word potential locations (seq along) and the
    ##    value for polarized word
    ## 2. add comma locations

    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out, c('id', 'P'), c('element_id', 'polarity'))

    class(out) <- unique(c("extract_sentiment_terms", class(out)))
    attributes(out)[["counts"]] <- out[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
    attributes(out)[["words"]] <- data.table::dcast(
        out[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')], 
        element_id + sentence_id ~ senti, 
        list, value.var = 'words'
    )[, sentence := unlist(sents)][]

    data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))

    out
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

extract_sentiment_terms(x)

attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words




plot.extract_sentiment_terms <- function(x, ...){
    pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
        
    ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity', 
            y = 'id', size='n', color = 'polarity')) +
        ggplot2::geom_text() +
        ggplot2::scale_color_gradient2()

}

plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red', 
    neutral = 'grey70', min.freq = 1, random.order = FALSE, include.neutral = TRUE, ...){

    pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral, 
            ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
    
    if (!include.neutral) pdat <- pdat[senti != 'neutral', ]


    wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq, 
        colors = pdat[['color']], ordered.colors =  TRUE, random.color = FALSE,
        random.order = random.order, ...
    )

}


plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = FALSE)

from sentimentr.

trinker commented on August 18, 2024

valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words
get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2

#plot requires adding wordcloud to dependencies.  Not sure if I'm willing to got this far.


## helper to add to utils
rm_class <- function (x, remove, ...) {
    class(x) <- class(x)[!class(x) %in% remove]
    x
}

#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @importFrom data.table .N :=
#' @examples
#' p_load(sentimentr, data.table)
#' set.seed(10)
#' x <- sample(cannon_reviews[[3]], 3000, T)
#' sentiment(x)
#' 
#' pol_words <- extract_sentiment_terms(x)
#' pol_words
#' pol_words$sentence
#' pol_words$neutral
#' data.table::as.data.table(pol_words)
#' 
#' 
#' attributes(extract_sentiment_terms(x))$counts
#' attributes(extract_sentiment_terms(x))$elements
#' 
#' plot(extract_sentiment_terms(x))
#' plot(extract_sentiment_terms(x), include.neutral = TRUE)
#' plot(extract_sentiment_terms(x), separate = TRUE)
#' plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE) 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## add polarity word potential locations (seq along) and the
    ## value for polarized word
    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out_prime <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out_prime, c('id', 'P'), c('element_id', 'polarity'))


    out <- data.table::dcast(
        out_prime[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')], 
        element_id + sentence_id ~ senti, 
        list, value.var = 'words'
    )[, sentence := unlist(sents)][]
    class(out) <- unique(c("extract_sentiment_terms", class(out)))

    attributes(out)[["counts"]] <- out_prime[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
    data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))

    attributes(out)[["elements"]] <- out_prime

    out
}


#' Prints an extract_sentiment_terms Object
#' 
#' Prints an extract_sentiment_terms object
#' 
#' @param x
#' @param \ldots ignored
#' @method print extract_sentiment_terms
#' @export 
print.extract_sentiment_terms <- function(x, ...){

    print(rm_class(x, 'extract_sentiment_terms')[, !c('neutral', "sentence"), with=FALSE]  )
    
}

#' Plots an extract_sentiment_terms Object
#' 
#' Plots an extract_sentiment_terms object
#' 
#' @param x An extract_sentiment_terms object.
#' @param negative The color to use for negative terms.
#' @param positive The color to use for poisitive terms.
#' @param neutral The color to use for neutral terms.
#' @param min.freq Words with frequency below min.freq will not be plotted.
#' @param include.neutral Should neutral terms be plotted?
#' @param separate logical.  If \code{TRUE} positive, negative, and neutral each get a wordcloud.
#' @param \ldots Other arguments passed to \code{\link[wordcloud]{wordcloud}}.
#' @method plot plot.extract_sentiment_terms
#' @export 
plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red', 
    neutral = 'grey70', min.freq = 1, include.neutral = FALSE, separate = FALSE, ...){

    pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral, 
            ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
    
    if (!include.neutral) pdat <- pdat[senti != 'neutral', ]

    if (isTRUE(separate)){

        par(mfrow = c(2, 2))
        pdat2 <- split(pdat, pdat[['senti']])

        for(i in seq_along(pdat2)){
            wordcloud::wordcloud(pdat2[[i]][['words']], pdat2[[i]][['n']], min.freq = min.freq, 
                colors = pdat2[[i]][['color']], ordered.colors =  TRUE, random.color = FALSE,
                random.order = FALSE, ...
            ) 
        }
    } else {
        wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq, 
            colors = pdat[['color']], ordered.colors =  TRUE, random.color = FALSE,
            random.order = FALSE, ...
        )
    }
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

pol_words <- extract_sentiment_terms(x)
pol_words
pol_words$sentence
pol_words$neutral
data.table::as.data.table(pol_words)


attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$elements

plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = TRUE)
plot(extract_sentiment_terms(x), separate = TRUE)
plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE)


## Not used
## plot.extract_sentiment_terms <- function(x, ...){
##     pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
##         
##     ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity', 
##             y = 'id', size='n', color = 'polarity')) +
##         ggplot2::geom_text() +
##         ggplot2::scale_color_gradient2()
##
## }

from sentimentr.

trinker commented on August 18, 2024

Added everything except plotting

from sentimentr.

add extract_sentiment_terms about sentimentr HOT 5 CLOSED

Comments (5)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent