Giter Club home page Giter Club logo

Comments (5)

trinker avatar trinker commented on August 18, 2024

I attempted via regex extraction...slow. Use tokens approach instead like the first part of the sentiment function.

Slow version:

#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param split logical.  If \code{TRUE} sentences will be split apart, 
#' resulting in more rows than the original data set.
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms <- function(text.var, polarity_dt = sentimentr::polarity_table, split = TRUE, ...){

    ord <- c("element_id", "sentence_id", "positive_terms", "negative_terms", "text")

    if (isTRUE(split)){

        sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
        dat <- make_sentence_df2(sents)[, 'wc' := NULL][, 'sentence_id' := 1:.N, by = 'id'][]
        data.table::setcolorder(dat, c('id','sentence_id', 'sentences'))
        data.table::setnames(dat, c('id', "sentences"), c('element_id', "text"))
        vars <- c('element_id', 'sentence_id', 'text', 'polarity')

    } else {

        dat <- data.table::data.table(text=text.var)[, 'element_id' := 1:.N][]
        data.table::setcolorder(dat, c('element_id', 'text'))
        ord <- ord[-2]
        vars <- c('element_id', 'text', 'polarity')
    }


    pos <- polarity_dt[y > 0]    
    neg <- polarity_dt[y < 0]

    pos_regex <- bor(pos[, 'x', with = FALSE][[1]])
    neg_regex <- bor(neg[, 'x', with = FALSE][[1]])

    out <- dat[, 
        'positive_terms' := exlow(text, pos_regex)][, 
        'negative_terms' := exlow(text, neg_regex)][
    ]


    data.table::setcolorder(out, ord)


    cnts <- out[, c('positive_terms', 'negative_terms'), with=FALSE]
    posw <- na.omit(unlist(cnts[[1]]))
    negw <- na.omit(unlist(cnts[[2]]))

    class(out) <- c('extract_sentiment_terms', class(out))  

#    attributes(out)[['counts']] <- rbind(
#        data.table::data.table(word = posw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'positive'][],
#        data.table::data.table(word = negw)[, list('n' = length(word)), by = 'word'][, 'polarity' := 'negative'][]
#    )

    attributes(out)[['words']] <- na.omit(melt(out, measure.vars = c('positive_terms', 'negative_terms'),
        variable.name = "polarity", value.name = "word")[, 
        list(word = unlist(word)), by = vars])


    attributes(out)[['counts']] <- dcast.data.table(attributes(out)[['words']][, 'word', with=FALSE], word ~ ., length, value.var = 'word')[order(., decreasing = TRUE),]
    setnames(attributes(out)[['counts']], ".", "n")

    out
}

exlow <- function(x, y) stringi::stri_extract_all_regex(tolower(x), y)
bor <- function(x) paste0("(?i)\\b(", paste(x, collapse = "|"), ")\\b")


set.seed(10)
x <- sample(cannon_reviews[[3]], 597)
sentiment(x)

extract_sentiment_terms(x)
attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words




get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2

from sentimentr.

trinker avatar trinker commented on August 18, 2024

Second round faster. Needs some summary and plotting techniques:

valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words



#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## 1. add polarity word potential locations (seq along) and the
    ##    value for polarized word
    ## 2. add comma locations

    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out, 'id', 'element_id')

    class(out) <- unique(c("sentiment", class(out)))
    attributes(out)[["counts"]] <- out[, list(n = .N), by = c("P", "words")][order(-P, -n), ]
    attributes(out)[["words"]] <- data.table::dcast(out, element_id + sentence_id ~ P, 
        list, value.var = 'words')[, sentence := unlist(sents)][]

    out
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

extract_sentiment_terms(x)$words

attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words

from sentimentr.

trinker avatar trinker commented on August 18, 2024
valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words



#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @examples 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## 1. add polarity word potential locations (seq along) and the
    ##    value for polarized word
    ## 2. add comma locations

    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out, c('id', 'P'), c('element_id', 'polarity'))

    class(out) <- unique(c("extract_sentiment_terms", class(out)))
    attributes(out)[["counts"]] <- out[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
    attributes(out)[["words"]] <- data.table::dcast(
        out[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')], 
        element_id + sentence_id ~ senti, 
        list, value.var = 'words'
    )[, sentence := unlist(sents)][]

    data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))

    out
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

extract_sentiment_terms(x)

attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$words




plot.extract_sentiment_terms <- function(x, ...){
    pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
        
    ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity', 
            y = 'id', size='n', color = 'polarity')) +
        ggplot2::geom_text() +
        ggplot2::scale_color_gradient2()

}

plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red', 
    neutral = 'grey70', min.freq = 1, random.order = FALSE, include.neutral = TRUE, ...){

    pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral, 
            ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
    
    if (!include.neutral) pdat <- pdat[senti != 'neutral', ]


    wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq, 
        colors = pdat[['color']], ordered.colors =  TRUE, random.color = FALSE,
        random.order = random.order, ...
    )

}


plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = FALSE)

from sentimentr.

trinker avatar trinker commented on August 18, 2024
valence_shifters_dt <- sentimentr:::valence_shifters_dt
make_words <- sentimentr:::make_words
get_sents <- sentimentr:::get_sents
make_sentence_df2 <- sentimentr:::make_sentence_df2

#plot requires adding wordcloud to dependencies.  Not sure if I'm willing to got this far.


## helper to add to utils
rm_class <- function (x, remove, ...) {
    class(x) <- class(x)[!class(x) %in% remove]
    x
}

#' Extract Sentiment Words
#' 
#' Extract the sentiment words from a text.
#' 
#' @param text.var The text variable.
#' @param polarity_dt A \pkg{data.table} of positive/negative words and
#' weights with x and y as column names.
#' @param hyphen The character string to replace hyphens with.  Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'.  Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param \ldots ignored.
#' @return Returns a \pkg{data.table} with columns of positive and 
#' negative terms.
#' @export
#' @importFrom data.table .N :=
#' @examples
#' p_load(sentimentr, data.table)
#' set.seed(10)
#' x <- sample(cannon_reviews[[3]], 3000, T)
#' sentiment(x)
#' 
#' pol_words <- extract_sentiment_terms(x)
#' pol_words
#' pol_words$sentence
#' pol_words$neutral
#' data.table::as.data.table(pol_words)
#' 
#' 
#' attributes(extract_sentiment_terms(x))$counts
#' attributes(extract_sentiment_terms(x))$elements
#' 
#' plot(extract_sentiment_terms(x))
#' plot(extract_sentiment_terms(x), include.neutral = TRUE)
#' plot(extract_sentiment_terms(x), separate = TRUE)
#' plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE) 
extract_sentiment_terms  <- function(text.var, polarity_dt = sentimentr::polarity_table,
    hyphen = "", ...){

    sentences <- sentence_id <- P <- polarity <- n <- words <- N <- . <- NULL

    ## Add "~~" holder for any words `polarity_frame` & `valence_shifters_dt`
    ## that have spaces
    posneg <- polarity_dt[[1]]
    words <- posneg    
    space_words <-  words[grep("\\s", words)]

    # break rows into sentences, count words
    # space fill (~~), break into words
    sents <- get_sents(gsub("(\\s*)([;:,]+)", " \\2", text.var))
    sent_dat <- make_sentence_df2(sents)
    sent_dat[, 'words' := list(make_words(space_fill(sentences, space_words), hyphen = hyphen))]

    # make sentence id for each row id
    sent_dat[, sentence_id:=seq_len(.N), by='id']

    ## Make the data frame long by stretching out words in sentences
    word_dat <- sent_dat[, .(words = unlist(words)), by = c('id', 'sentence_id')]

    ## add polarity word potential locations (seq along) and the
    ## value for polarized word
    word_dat[, "P"] <- polarity_dt[word_dat[["words"]]][[2]]
    word_dat[, P := ifelse(is.na(P), 0, P)][]
    out_prime <- word_dat[!words %in% c(',', ''), ]

    data.table::setnames(out_prime, c('id', 'P'), c('element_id', 'polarity'))


    out <- data.table::dcast(
        out_prime[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive'))), by = c('element_id', 'sentence_id', 'words')], 
        element_id + sentence_id ~ senti, 
        list, value.var = 'words'
    )[, sentence := unlist(sents)][]
    class(out) <- unique(c("extract_sentiment_terms", class(out)))

    attributes(out)[["counts"]] <- out_prime[, list(n = .N), by = c("polarity", "words")][order(-polarity, -n), ]
    data.table::setcolorder(attributes(out)[["counts"]], c('words', 'polarity', 'n'))

    attributes(out)[["elements"]] <- out_prime

    out
}


#' Prints an extract_sentiment_terms Object
#' 
#' Prints an extract_sentiment_terms object
#' 
#' @param x
#' @param \ldots ignored
#' @method print extract_sentiment_terms
#' @export 
print.extract_sentiment_terms <- function(x, ...){

    print(rm_class(x, 'extract_sentiment_terms')[, !c('neutral', "sentence"), with=FALSE]  )
    
}

#' Plots an extract_sentiment_terms Object
#' 
#' Plots an extract_sentiment_terms object
#' 
#' @param x An extract_sentiment_terms object.
#' @param negative The color to use for negative terms.
#' @param positive The color to use for poisitive terms.
#' @param neutral The color to use for neutral terms.
#' @param min.freq Words with frequency below min.freq will not be plotted.
#' @param include.neutral Should neutral terms be plotted?
#' @param separate logical.  If \code{TRUE} positive, negative, and neutral each get a wordcloud.
#' @param \ldots Other arguments passed to \code{\link[wordcloud]{wordcloud}}.
#' @method plot plot.extract_sentiment_terms
#' @export 
plot.extract_sentiment_terms <- function(x, negative = 'blue', positive = 'red', 
    neutral = 'grey70', min.freq = 1, include.neutral = FALSE, separate = FALSE, ...){

    pdat <- attributes(extract_sentiment_terms(x))$counts[, list(senti = ifelse(polarity == 0, 'neutral', 
            ifelse(polarity < 0, 'negative', 'positive')), color = ifelse(polarity == 0, neutral, 
            ifelse(polarity < 0, negative, positive))), by = c('words', 'n')]
    
    if (!include.neutral) pdat <- pdat[senti != 'neutral', ]

    if (isTRUE(separate)){

        par(mfrow = c(2, 2))
        pdat2 <- split(pdat, pdat[['senti']])

        for(i in seq_along(pdat2)){
            wordcloud::wordcloud(pdat2[[i]][['words']], pdat2[[i]][['n']], min.freq = min.freq, 
                colors = pdat2[[i]][['color']], ordered.colors =  TRUE, random.color = FALSE,
                random.order = FALSE, ...
            ) 
        }
    } else {
        wordcloud::wordcloud(pdat[['words']], pdat[['n']], min.freq = min.freq, 
            colors = pdat[['color']], ordered.colors =  TRUE, random.color = FALSE,
            random.order = FALSE, ...
        )
    }
}



p_load(sentimentr, data.table)
set.seed(10)
x <- sample(cannon_reviews[[3]], 3000, T)
sentiment(x)

pol_words <- extract_sentiment_terms(x)
pol_words
pol_words$sentence
pol_words$neutral
data.table::as.data.table(pol_words)


attributes(extract_sentiment_terms(x))$counts
attributes(extract_sentiment_terms(x))$elements

plot(extract_sentiment_terms(x))
plot(extract_sentiment_terms(x), include.neutral = TRUE)
plot(extract_sentiment_terms(x), separate = TRUE)
plot(extract_sentiment_terms(x), include.neutral = TRUE, separate = TRUE)


## Not used
## plot.extract_sentiment_terms <- function(x, ...){
##     pdat <- attributes(extract_sentiment_terms(x))$counts[polarity != 0,][, id := 1:.N, by = 'polarity'][]
##         
##     ggplot2::ggplot(pdat, ggplot2::aes_string(label = 'words', x = 'polarity', 
##             y = 'id', size='n', color = 'polarity')) +
##         ggplot2::geom_text() +
##         ggplot2::scale_color_gradient2()
##
## }

from sentimentr.

trinker avatar trinker commented on August 18, 2024

Added everything except plotting

from sentimentr.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.