
Comments (3)

mk2510 commented on June 7, 2024

Hi Jonny, we will take care of this issue now. 😃


jbesomi commented on June 7, 2024

Useful code:


import warnings

import pandas as pd
import pandas.api.types as pd_types
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


"""
Helper
"""


def _check_is_valid_representation(s: pd.Series) -> bool:
    """
    Check that the given Pandas Series is a Representation Pandas Series.
    """

    if not isinstance(s.index, pd.MultiIndex):
        raise ValueError(
            "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appear to have a MultiIndex."
        )

    if s.index.nlevels != 2:
        raise ValueError(
            f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represents the document and the second one the words/tokens. The given Pandas Series has {s.index.nlevels} levels instead of 2."
        )
    return True
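
For reference, a minimal Series that passes this check, with the same MultiIndex shape that term_frequency and tfidf below return (a quick illustration, not part of the module):

idx = pd.MultiIndex.from_tuples(
    [(0, "GROOT"), (0, "I"), (1, "Flame")], names=("document", "word")
)
s_repr = pd.Series([1, 1, 1], index=idx)
assert _check_is_valid_representation(s_repr)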


"""
Vectorization
"""


def term_frequency(
    s: pd.Series, max_features=300, min_df=1, max_df=1.0
) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series using term frequency.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or float, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold. This argument effectively permits removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], it represents a proportion of documents.


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s)
    document  word 
    0         GROOT    1
              I        1
              am       1
    1         Flame    1
              on       1
    dtype: Sparse[int64, 0]
    """

    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    # TODO. Can be rewritten without sklearn.

    tf = CountVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    tf_vectors_csr = tf.fit_transform(s)

    tf_vectors_coo = coo_matrix(tf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tf_vectors_coo)

    features_names = tf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn >= 1.0

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)
    s_out.name = "term_frequency"

    return s_out
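
On the TODO above ("can be rewritten without sklearn"): a minimal sketch of the counting step in pure pandas, assuming the input Series is already tokenized. The helper name is hypothetical and the max_features / min_df / max_df pruning is omitted:

def _term_frequency_pure_pandas(s: pd.Series) -> pd.Series:
    # explode() yields one row per (document index, token); grouping by the
    # original index plus the token values counts each (document, word) pair.
    exploded = s.explode()
    counts = exploded.groupby([exploded.index, exploded.values]).size()
    counts = counts.rename_axis(["document", "word"])
    counts = counts.astype(pd.SparseDtype("int", 0))
    counts.name = "term_frequency"
    return counts

# e.g. _term_frequency_pure_pandas(pd.Series([["I", "am", "GROOT"], ["Flame", "on"]]))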


def tfidf(s: pd.Series, max_features=300, min_df=1, max_df=1.0) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series with TF-IDF. 
    
    Parameters
    ----------
    s : Pandas Series (tokenized)
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or float, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold. This argument effectively permits removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], it represents a proportion of documents.

    Returns
    -------
    Sparse Pandas Series
        Return a MultiIndex Sparse Series where the first level is the document, the second level is the word name and the values are the tf-idf coefficients.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s)
    document  word 
    0         GROOT    0.577350
              I        0.577350
              am       0.577350
    1         Flame    0.707107
              on       0.707107
    dtype: Sparse[float64, nan]

    """

    # TODO. In docstring show the formula to compute TF-IDF

    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    features_names = tfidf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn >= 1.0

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)
    s_out.name = "tfidf"

    return s_out
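
On the TODO above (show the formula in the docstring): with scikit-learn's defaults (smooth_idf=True, norm="l2"), the coefficients in the example follow from

    \mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t),
    \qquad
    \mathrm{idf}(t) = \ln\left(\frac{1 + n}{1 + \mathrm{df}(t)}\right) + 1

where n is the number of documents and \mathrm{df}(t) is the number of documents containing term t; each document vector is then divided by its L2 norm. Hence the three words of "I am GROOT" each get 1/\sqrt{3} \approx 0.577350 and the two words of "Flame on" each get 1/\sqrt{2} \approx 0.707107.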


"""
Dimensionality reduction
"""


def truncated_svd(s, n_components=2, random_state=42):
    """
    Perform truncated SVD on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    Truncated SVD receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are the representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.
    random_state : int, default is 42.
        Used during randomized SVD. Pass an int for reproducible results across multiple function calls.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.truncated_svd(s) # doctest: +ELLIPSIS
    0    [0.0, 0.99...]
    1    [1.00..., 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    s = s.astype("Sparse")
    s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_out = pd.Series(
        svd.fit_transform(s_csr_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


def pca(s, n_components=2):
    """
    Perform principal component analysis on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    PCA receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are their representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s) # returns a Sparse Pandas Series by default
    >>> hero.pca(s)
    0    [-0.7071067811865474, 5.5511151231257815e-17]
    1     [0.7071067811865476, 5.5511151231257815e-17]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    pca = PCA(n_components=n_components)

    if pd_types.is_sparse(s):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            warnings.warn(
                "✋ Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary. Principal Component Analysis centers the data, which requires expanding the input sparse matrix into a dense one. This operation might take long. Consider using `truncated_svd` instead, as it deals with sparse matrices efficiently."
            )
    else:
        # Treat it as a sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_dense_matrix = s_csr_matrix.todense()

    s_out = pd.Series(
        pca.fit_transform(s_dense_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


def nmf(s, n_components=2, random_state=42):
    """
    Perform non-negative matrix factorization (NMF) on the given Representation Pandas Series.

    This function is typically used to reduce the number of dimensions of a Representation Pandas Series, generally obtained by calling a hero representation function such as :meth:`tfidf`, :meth:`term_frequency` or :meth:`doc2vec`.

    NMF receives as first argument a MultiIndex Pandas Series where the first level is the document, the second level is the word and the values are the representation.

    Parameters
    ----------
    s : Pandas Series
    n_components : int, default is 2.
        Number of components to keep. If n_components is not set or None, all components are kept.
    random_state : int, default is 42.
        Used for the NMF initialization. Pass an int for reproducible results across multiple function calls.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.nmf(s) # doctest: +ELLIPSIS
    0                   [0.0, 1.0]
    1    [0.9999999999999998, 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
 
    """

    _check_is_valid_representation(s)

    nmf = NMF(n_components=n_components, random_state=random_state)

    if pd_types.is_sparse(s):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            warnings.warn(
                "✋ Be careful. You are trying to compute NMF from a Sparse Pandas Series with a very large vocabulary. This operation might take long. Consider using `truncated_svd` instead, as it deals with sparse matrices more efficiently."
            )
    else:
        # Treat it as a sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_out = pd.Series(
        nmf.fit_transform(s_csr_matrix).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)
    return s_out


jbesomi commented on June 7, 2024

Useful code for testing the functions:
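
The class below relies on texthero's test scaffolding (PandasTestCase and the representation import). A minimal stand-in sketch, assuming pandas' testing helpers for Series comparison, so the snippet runs on its own:

import unittest

import pandas as pd

from texthero import representation


class PandasTestCase(unittest.TestCase):
    # Hypothetical stand-in for texthero's test base class: route
    # assertEqual through pandas so Series can be compared directly.
    def assertEqual(self, first, second, msg=None):
        if isinstance(first, pd.Series) and isinstance(second, pd.Series):
            pd.testing.assert_series_equal(first, second)
        else:
            super().assertEqual(first, second, msg=msg)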

class TestRepresentation(PandasTestCase):
    """
    Term Frequency.
    """

    def test_term_frequency_single_document(self):
        s = pd.Series([list("abbccc")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (0, "c")], names=("document", "word")
        )

        s_true = pd.Series([1, 2, 3], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_multiple_documents(self):

        s = pd.Series([["doc_one"], ["doc_two"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "doc_one"), (1, "doc_two")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_not_lowercase(self):

        s = pd.Series([["A"], ["a"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (1, "a")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_punctuation_are_kept(self):

        s = pd.Series([["number", "one", "!"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "!"), (0, "number"), (0, "one")], names=("document", "word")
        )

        s_true = pd.Series([1, 1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.term_frequency(s)

    """
    TF-IDF
    """

    def test_tfidf_simple(self):
        s = pd.Series([["a"]])

        idx = pd.MultiIndex.from_tuples([(0, "a")], names=("document", "word"))
        s_true = pd.Series([1.0], index=idx).astype("Sparse")
        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_not_lowercase(self):
        tfidf_single_smooth = 0.7071067811865475  # TODO

        s = pd.Series([list("Aa")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (0, "a")], names=("document", "word")
        )

        s_true = pd.Series(
            [tfidf_single_smooth, tfidf_single_smooth], index=idx
        ).astype("Sparse")

        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_different_index(self):
        # compute s_true
        idx = pd.MultiIndex.from_tuples(
            [(10, "a"), (11, "b")], names=("document", "word")
        )
        s_true = pd.Series([1.0, 1.0], index=idx).astype("Sparse")

        s = pd.Series([["a"], ["b"]], index=[10, 11])
        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.tfidf(s)

    """
    PCA
    """

    def test_pca_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.pca(s)

        from sklearn.decomposition import PCA

        pca = PCA(n_components=2)
        s_true = pca.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    # TODO check raise warning

    """
    NMF
    """

    def test_nmf_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.nmf(s, random_state=1)

        from sklearn.decomposition import NMF

        nmf = NMF(n_components=2, random_state=1)
        s_true = nmf.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    """
    TruncatedSVD
    """

    def test_truncated_svd_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.truncated_svd(s, random_state=1)

        from sklearn.decomposition import TruncatedSVD

        svd = TruncatedSVD(n_components=2, random_state=1)
        s_true = svd.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)
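
With the stand-in scaffolding above, the suite runs under the standard unittest runner:

if __name__ == "__main__":
    unittest.main()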

