
Comments (3)

mk2510 commented on June 7, 2024

Hi Jonny, we will take care of this issue now. 😃


jbesomi commented on June 7, 2024

Useful code:


import warnings

import pandas as pd
import pandas.api.types as pd_types
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


"""
Helper
"""


def _check_is_valid_representation(s: pd.Series) -> bool:
    """
    Check that the given Pandas Series is a Representation Pandas Series.
    """

    if not isinstance(s.index, pd.MultiIndex):
        raise ValueError(
            "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appear to have a MultiIndex."
        )

    if s.index.nlevels != 2:
        raise ValueError(
            f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represents the document and the second one the words/tokens. The given Pandas Series has {s.index.nlevels} levels instead of 2."
        )
    return True
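
For reference, a minimal Series that passes this check, with the same MultiIndex shape that term_frequency and tfidf below return (a quick illustration, not part of the module):

idx = pd.MultiIndex.from_tuples(
    [(0, "GROOT"), (0, "I"), (1, "Flame")], names=("document", "word")
)
s_repr = pd.Series([1, 1, 1], index=idx)
assert _check_is_valid_representation(s_repr)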


"""
Vectorization
"""


def term_frequency(
    s: pd.Series, max_features=300, min_df=1, max_df=1.0
) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series using term frequency.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or float, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold. This argument effectively permits removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], it represents a proportion of documents.


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s)
    document  word 
    0         GROOT    1
              I        1
              am       1
    1         Flame    1
              on       1
    dtype: Sparse[int64, 0]
    """

    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    # TODO. Can be rewritten without sklearn.

    tf = CountVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    tf_vectors_csr = tf.fit_transform(s)

    tf_vectors_coo = coo_matrix(tf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tf_vectors_coo)

    features_names = tf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn >= 1.0

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)
    s_out.name = "term_frequency"

    return s_out
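
On the TODO above ("can be rewritten without sklearn"): a minimal sketch of the counting step in pure pandas, assuming the input Series is already tokenized. The helper name is hypothetical and the max_features / min_df / max_df pruning is omitted:

def _term_frequency_pure_pandas(s: pd.Series) -> pd.Series:
    # explode() yields one row per (document index, token); grouping by the
    # original index plus the token values counts each (document, word) pair.
    exploded = s.explode()
    counts = exploded.groupby([exploded.index, exploded.values]).size()
    counts = counts.rename_axis(["document", "word"])
    counts = counts.astype(pd.SparseDtype("int", 0))
    counts.name = "term_frequency"
    return counts

# e.g. _term_frequency_pure_pandas(pd.Series([["I", "am", "GROOT"], ["Flame", "on"]]))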


def tfidf(s: pd.Series, max_features=300, min_df=1, max_df=1.0) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series with TF-IDF. 
    
    Parameters
    ----------
    s : Pandas Series (tokenized)
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or float, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold. This argument effectively permits removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], it represents a proportion of documents.

    Returns
    -------
    Sparse Pandas Series
        Return a MultiIndex Sparse Series where the first level is the document, the second level is the word name and the values are the tf-idf coefficients.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s)
    document  word 
    0         GROOT    0.577350
              I        0.577350
              am       0.577350
    1         Flame    0.707107
              on       0.707107
    dtype: Sparse[float64, nan]

    """

    # TODO. In docstring show the formula to compute TF-IDF

    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    features_names = tfidf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn >= 1.0

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)
    s_out.name = "tfidf"

    return s_out
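
On the TODO above (show the formula in the docstring): with scikit-learn's defaults (smooth_idf=True, norm="l2"), the coefficients in the example follow from

    \mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t),
    \qquad
    \mathrm{idf}(t) = \ln\left(\frac{1 + n}{1 + \mathrm{df}(t)}\right) + 1

where n is the number of documents and \mathrm{df}(t) is the number of documents containing term t; each document vector is then divided by its L2 norm. Hence the three words of "I am GROOT" each get 1/\sqrt{3} \approx 0.577350 and the two words of "Flame on" each get 1/\sqrt{2} \approx 0.707107.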


"""
Dimensionality reduction
"""


def truncated_svd(s, n_components=2, random_state=42):
    """
    Perform truncated SVD on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    Truncated SVD receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are the representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.
    random_state : int, default is 42.
        Used during randomized SVD. Pass an int for reproducible results across multiple function calls.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.truncated_svd(s) # doctest: +ELLIPSIS
    0    [0.0, 0.99...]
    1    [1.00..., 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    s = s.astype("Sparse")
    s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_out = pd.Series(
        svd.fit_transform(s_csr_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


def pca(s, n_components=2):
    """
    Perform principal component analysis on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    PCA receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are their representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s) # returns a Sparse Pandas Series by default
    >>> hero.pca(s)
    0    [-0.7071067811865474, 5.5511151231257815e-17]
    1     [0.7071067811865476, 5.5511151231257815e-17]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    pca = PCA(n_components=n_components)

    if pd_types.is_sparse(s):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            warnings.warn(
                "✋ Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary. Principal Component Analysis centers the data, which requires expanding the input sparse matrix into a dense one. This operation might take long. Consider using `truncated_svd` instead, as it deals with sparse matrices efficiently."
            )
    else:
        # Treat it as a sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_dense_matrix = s_csr_matrix.todense()

    s_out = pd.Series(
        pca.fit_transform(s_dense_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


def nmf(s, n_components=2, random_state=42):
    """
    Perform non-negative matrix factorization (NMF) on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    NMF receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are the representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.
    random_state : int, default is 42.
        Used for the NMF initialization. Pass an int for reproducible results across multiple function calls.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.nmf(s) # doctest: +ELLIPSIS
    0                   [0.0, 1.0]
    1    [0.9999999999999998, 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    nmf = NMF(n_components=n_components, random_state=random_state)

    if pd_types.is_sparse(s):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            warnings.warn(
                "✋ Be careful. You are trying to compute NMF from a Sparse Pandas Series with a very large vocabulary. This operation might take long. Consider using `truncated_svd` instead, as it deals with sparse matrices more efficiently."
            )
    else:
        # Treat it as a sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_out = pd.Series(
        nmf.fit_transform(s_csr_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


jbesomi commented on June 7, 2024

Useful code for testing the functions:
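
The class below relies on texthero's test scaffolding (PandasTestCase and the representation import). A minimal stand-in sketch, assuming pandas' testing helpers for Series comparison, so the snippet runs on its own:

import unittest

import pandas as pd

from texthero import representation


class PandasTestCase(unittest.TestCase):
    # Hypothetical stand-in for texthero's test base class: route
    # assertEqual through pandas so Series can be compared directly.
    def assertEqual(self, first, second, msg=None):
        if isinstance(first, pd.Series) and isinstance(second, pd.Series):
            pd.testing.assert_series_equal(first, second)
        else:
            super().assertEqual(first, second, msg=msg)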

class TestRepresentation(PandasTestCase):
    """
    Term Frequency.
    """

    def test_term_frequency_single_document(self):
        s = pd.Series([list("abbccc")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (0, "c")], names=("document", "word")
        )

        s_true = pd.Series([1, 2, 3], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_multiple_documents(self):

        s = pd.Series([["doc_one"], ["doc_two"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "doc_one"), (1, "doc_two")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_not_lowercase(self):

        s = pd.Series([["A"], ["a"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (1, "a")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_punctuation_are_kept(self):

        s = pd.Series([["number", "one", "!"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "!"), (0, "number"), (0, "one")], names=("document", "word")
        )

        s_true = pd.Series([1, 1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.term_frequency(s)

    """
    TF-IDF
    """

    def test_tfidf_simple(self):
        s = pd.Series([["a"]])

        idx = pd.MultiIndex.from_tuples([(0, "a")], names=("document", "word"))
        s_true = pd.Series([1.0], index=idx).astype("Sparse")
        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_not_lowercase(self):
        tfidf_single_smooth = 0.7071067811865475  # TODO

        s = pd.Series([list("Aa")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (0, "a")], names=("document", "word")
        )

        s_true = pd.Series(
            [tfidf_single_smooth, tfidf_single_smooth], index=idx
        ).astype("Sparse")

        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_different_index(self):
        # compute s_true
        idx = pd.MultiIndex.from_tuples(
            [(10, "a"), (11, "b")], names=("document", "word")
        )
        s_true = pd.Series([1.0, 1.0], index=idx).astype("Sparse")

        s = pd.Series([["a"], ["b"]], index=[10, 11])
        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.tfidf(s)

    """
    PCA
    """

    def test_pca_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.pca(s)

        from sklearn.decomposition import PCA

        pca = PCA(n_components=2)
        s_true = pca.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    # TODO check raise warning

    """
    NMF
    """

    def test_nmf_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.nmf(s, random_state=1)

        from sklearn.decomposition import NMF

        nmf = NMF(n_components=2, random_state=1)
        s_true = nmf.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    """
    TruncatedSVD
    """

    def test_truncated_svd_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.truncated_svd(s, random_state=1)

        from sklearn.decomposition import TruncatedSVD

        svd = TruncatedSVD(n_components=2, random_state=1)
        s_true = svd.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)
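
With the stand-in scaffolding above, the suite runs under the standard unittest runner:

if __name__ == "__main__":
    unittest.main()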

