Comments (3)
Hi Jonny, we will take care of this issue now. 😃
from texthero.
Useful code:
"""
Helper
"""
def _check_is_valid_representation(s) -> bool:
    """
    Validate that the given Pandas Series is a Representation Pandas Series.

    The Series must be indexed by a ``pd.MultiIndex`` with exactly two
    levels (per the error messages below: the document first, the
    word/token second).

    Returns
    -------
    bool
        Always ``True`` when the Series passes both checks.

    Raises
    ------
    ValueError
        If the index is not a MultiIndex, or if it does not have exactly
        two levels.
    """
    index = s.index

    # Guard 1: the index type itself.
    if not isinstance(index, pd.MultiIndex):
        raise ValueError(
            "The input Pandas Series should be a Representation Pandas Series"
            " and should have a MultiIndex. The given Pandas Series does not"
            " appears to have MultiIndex"
        )

    # Guard 2: the index depth.
    n_levels = index.nlevels
    if n_levels != 2:
        raise ValueError(
            "The input Pandas Series should be a Representation Pandas Series"
            " and should have a MultiIndex, where the first level represent"
            " the document and the second one the words/token. The given"
            f" Pandas Series has {n_levels} number of levels instead of 2."
        )

    return True
"""
Vectorization
"""
def term_frequency(
    s: pd.Series, max_features=300, min_df=1, max_df=1.0
) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series using term frequency.

    Parameters
    ----------
    s : Pandas Series
        Tokenized Series: every cell is expected to be a list of tokens.
        Only the first cell is checked.
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or double, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document
        frequency strictly higher than the given threshold. This argument
        basically permits to remove corpus-specific stop words. When the
        argument is a float [0.0, 1.0], the parameter represents a
        proportion of documents.

    Returns
    -------
    Sparse Pandas Series
        A MultiIndex Sparse Series where the first level ("document") is
        the index label of the input row, the second level ("word") is the
        term, and the values are the term counts. The input Series is left
        untouched.

    Raises
    ------
    ValueError
        If the first cell of ``s`` is not a list (Series not tokenized).

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s)
    document  word
    0         GROOT    1
              I        1
              am       1
    1         Flame    1
              on       1
    dtype: Sparse[int64, 0]
    """
    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    # TODO. Can be rewritten without sklearn.
    # The identity tokenizer/preprocessor make sklearn consume the token
    # lists as-is (no lowercasing, no re-tokenization).
    tf = CountVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,  # was silently ignored before
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )

    tf_vectors_csr = tf.fit_transform(s)
    tf_vectors_coo = coo_matrix(tf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tf_vectors_coo)

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement when the installed version provides it.
    if hasattr(tf, "get_feature_names_out"):
        features_names = tf.get_feature_names_out()
    else:
        features_names = tf.get_feature_names()

    # Map (row position, column position) to (document label, word name).
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))
    s_out.rename_axis(["document", "word"], inplace=True)

    return s_out
def tfidf(s: pd.Series, max_features=300, min_df=1, max_df=1.0) -> pd.Series.sparse:
    """
    Represent a tokenized Pandas Series with TF-IDF.

    Parameters
    ----------
    s : Pandas Series (tokenized)
        Every cell is expected to be a list of tokens. Only the first cell
        is checked.
    max_features : int, optional, default to 300
        Maximum number of features to keep.
    min_df : int, optional, default to 1.
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    max_df : int or double, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document
        frequency strictly higher than the given threshold. This argument
        basically permits to remove corpus-specific stop words. When the
        argument is a float [0.0, 1.0], the parameter represents a
        proportion of documents.

    Returns
    -------
    Sparse Pandas Series
        Return a MultiIndex Sparse Series where the first level is the
        document, the second level is the word name and the values are the
        tf-idf coefficients. The input Series is left untouched.

    Raises
    ------
    ValueError
        If the first cell of ``s`` is not a list (Series not tokenized).

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s)
    document  word
    0         GROOT    0.577350
              I        0.577350
              am       0.577350
    1         Flame    0.707107
              on       0.707107
    dtype: Sparse[float64, nan]
    """
    # TODO. In docstring show the formula to compute TF-IDF

    if not isinstance(s.iloc[0], list):
        raise ValueError(
            "🤔 It seems like the given Pandas Series is not tokenized. Have you tried passing the Series through `hero.tokenize(s)`?"
        )

    # Named `vectorizer` (not `tfidf`) so the local does not shadow this
    # function. The identity tokenizer/preprocessor make sklearn consume
    # the token lists as-is.
    vectorizer = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )

    tfidf_vectors_csr = vectorizer.fit_transform(s)
    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement when the installed version provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        features_names = vectorizer.get_feature_names_out()
    else:
        features_names = vectorizer.get_feature_names()

    # Map (row position, column position) to (document label, word name).
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))
    s_out.rename_axis(["document", "word"], inplace=True)

    return s_out
"""
Dimensionality reduction
"""
def truncated_svd(s, n_components=2, random_state=42):
    """
    Reduce a Represented Pandas Series with truncated SVD.

    The input is a MultiIndex Pandas Series (first level: documents,
    second level: words, values: the representation), typically produced
    by a hero representation function such as :meth:`tfidf` or
    :meth:`term_frequency`. The Series is converted to a sparse
    document x word matrix and fed to ``TruncatedSVD``.

    Parameters
    ----------
    s : Pandas Series
        Representation Series with a two-level MultiIndex.
    n_components : int, default is 2.
        Number of components to keep; forwarded to ``TruncatedSVD``.
    random_state : int, default to 42.
        Forwarded to ``TruncatedSVD``. Pass an int for reproducible
        results across multiple function calls.

    Returns
    -------
    Pandas Series
        One row per document (unique values of the first index level);
        each value is the list of reduced coordinates for that document.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.truncated_svd(s) # doctest: +ELLIPSIS
    0    [0.0, 0.99...]
    1    [1.00..., 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
    """
    _check_is_valid_representation(s)

    reducer = TruncatedSVD(n_components=n_components, random_state=random_state)

    # Go through a sparse Series so the matrix is built without densifying.
    sparse_s = s.astype("Sparse")
    doc_word_matrix = csr_matrix(sparse_s.sparse.to_coo()[0])

    reduced_rows = reducer.fit_transform(doc_word_matrix).tolist()
    documents = sparse_s.index.unique(level=0)

    # Drop the "document" axis name inherited from the MultiIndex level.
    return pd.Series(reduced_rows, index=documents,).rename_axis(None)
def pca(s, n_components=2):
    """
    Perform principal component analysis on the given Represented Pandas Series.

    This function is typically used to reduce the number of dimension of a
    Represented Pandas Series, generally obtained by calling an hero
    representation function such as :meth:`tfidf` or :meth:`term_frequency`.

    PCA receive as first argument a MultiIndex Pandas Series where the
    first level are the documents, the second level are the words and the
    values are their representation.

    Parameters
    ----------
    s : Pandas Series
        Representation Series with a two-level MultiIndex.
    n_components : int, default is 2.
        Number of components to keep; forwarded to ``PCA``.

    Returns
    -------
    Pandas Series
        One row per document (unique values of the first index level);
        each value is the list of principal-component coordinates.

    Warns
    -----
    UserWarning
        When ``s`` is sparse and has more than 1000 distinct words, since
        the matrix is expanded to dense before fitting.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s) # return a Sparse Pandas Series by default
    >>> hero.pca(s)
    0    [-0.7071067811865474, 5.5511151231257815e-17]
    1     [0.7071067811865476, 5.5511151231257815e-17]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
    """
    _check_is_valid_representation(s)

    pca_model = PCA(n_components=n_components)

    # `isinstance(dtype, pd.SparseDtype)` is the documented replacement for
    # the deprecated `pandas.api.types.is_sparse`.
    if isinstance(s.dtype, pd.SparseDtype):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            warnings.warn(
                "✋ Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary. Principal Component Analysis normalize the data and this act requires to expand the input Sparse Matrix. This operation might take long. Consider using `truncated_svd` instead as it can deals with Sparse Matrix efficiently."
            )
    else:
        # Treat it as a Sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    # `toarray()` yields a plain ndarray; `todense()` would yield an
    # `np.matrix`, which recent scikit-learn versions refuse as input.
    s_dense_matrix = s_csr_matrix.toarray()

    s_out = pd.Series(
        pca_model.fit_transform(s_dense_matrix).tolist(),
        index=s.index.unique(level=0),
    )
    # Drop the "document" axis name inherited from the MultiIndex level.
    s_out = s_out.rename_axis(None)

    return s_out
def nmf(s, n_components=2, random_state=42):
    """
    Perform non-negative matrix factorization on the given Represented Pandas Series.

    This function is typically used to reduce the number of dimension of a
    Represented Pandas Series, generally obtained by calling an hero
    representation function such as :meth:`tfidf` or :meth:`term_frequency`.

    NMF receive as first argument a MultiIndex Pandas Series where the
    first level are the documents, the second level are the words and the
    values are the representation.

    Parameters
    ----------
    s : Pandas Series
        Representation Series with a two-level MultiIndex.
    n_components : int, default is 2.
        Number of components to keep; forwarded to ``NMF``.
    random_state : int, default is 42.
        Forwarded to ``NMF``. Pass an int for reproducible results across
        multiple function calls.

    Returns
    -------
    Pandas Series
        One row per document (unique values of the first index level);
        each value is the list of NMF coordinates for that document.

    Warns
    -----
    UserWarning
        When ``s`` is sparse and has more than 1000 distinct words.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["I am GROOT", "Flame on"])
    >>> s = hero.tokenize(s)
    >>> s = hero.tfidf(s)
    >>> hero.nmf(s) # doctest: +ELLIPSIS
    0                   [0.0, 1.0]
    1    [0.9999999999999998, 0.0]
    dtype: object

    See also
    --------
    :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
    """
    _check_is_valid_representation(s)

    nmf_model = NMF(n_components=n_components, random_state=random_state)

    # `isinstance(dtype, pd.SparseDtype)` is the documented replacement for
    # the deprecated `pandas.api.types.is_sparse`.
    if isinstance(s.dtype, pd.SparseDtype):
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])
        if s_csr_matrix.shape[1] > 1000:
            # The matrix is NOT densified here (it is fitted as CSR), so the
            # message no longer repeats PCA's claim about expanding it.
            warnings.warn(
                "✋ Be careful. You are trying to compute NMF from a Sparse Pandas Series with a very large vocabulary. This operation might take long. Consider using `truncated_svd` instead as it can deals with Sparse Matrix efficiently."
            )
    else:
        # Treat it as a Sparse matrix anyway for efficiency.
        s = s.astype("Sparse")
        s_csr_matrix = csr_matrix(s.sparse.to_coo()[0])

    s_out = pd.Series(
        nmf_model.fit_transform(s_csr_matrix).tolist(),
        index=s.index.unique(level=0),
    )
    # Drop the "document" axis name inherited from the MultiIndex level.
    s_out = s_out.rename_axis(None)

    return s_out
from texthero.
Useful code for testing the functions:
class TestRepresentation(PandasTestCase):
    """
    Tests for the `representation` module: term frequency, TF-IDF, PCA,
    NMF and TruncatedSVD.
    """

    # ------------------------------------------------------------------
    # Term Frequency
    # ------------------------------------------------------------------

    def test_term_frequency_single_document(self):
        s = pd.Series([list("abbccc")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (0, "c")], names=("document", "word")
        )

        s_true = pd.Series([1, 2, 3], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_multiple_documents(self):
        s = pd.Series([["doc_one"], ["doc_two"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "doc_one"), (1, "doc_two")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_not_lowercase(self):
        s = pd.Series([["A"], ["a"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (1, "a")], names=("document", "word")
        )

        s_true = pd.Series([1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_punctuation_are_kept(self):
        s = pd.Series([["number", "one", "!"]])

        idx = pd.MultiIndex.from_tuples(
            [(0, "!"), (0, "number"), (0, "one")], names=("document", "word")
        )

        s_true = pd.Series([1, 1, 1], index=idx, dtype="int").astype(
            pd.SparseDtype("int", 0)
        )
        self.assertEqual(representation.term_frequency(s), s_true)

    def test_term_frequency_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.term_frequency(s)

    # ------------------------------------------------------------------
    # TF-IDF
    # ------------------------------------------------------------------

    def test_tfidf_simple(self):
        s = pd.Series([["a"]])

        idx = pd.MultiIndex.from_tuples([(0, "a")], names=("document", "word"))
        s_true = pd.Series([1.0], index=idx).astype("Sparse")
        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_not_lowercase(self):
        tfidf_single_smooth = 0.7071067811865475  # TODO

        s = pd.Series([list("Aa")])

        idx = pd.MultiIndex.from_tuples(
            [(0, "A"), (0, "a")], names=("document", "word")
        )

        s_true = pd.Series(
            [tfidf_single_smooth, tfidf_single_smooth], index=idx
        ).astype("Sparse")

        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_single_different_index(self):
        # compute s_true
        idx = pd.MultiIndex.from_tuples(
            [(10, "a"), (11, "b")], names=("document", "word")
        )
        s_true = pd.Series([1.0, 1.0], index=idx).astype("Sparse")

        s = pd.Series([["a"], ["b"]], index=[10, 11])

        self.assertEqual(representation.tfidf(s), s_true)

    def test_idf_raise_when_not_tokenized(self):
        s = pd.Series("not tokenized")
        with self.assertRaisesRegex(ValueError, r"tokenized"):
            representation.tfidf(s)

    # ------------------------------------------------------------------
    # PCA
    # ------------------------------------------------------------------

    def test_pca_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.pca(s)

        from sklearn.decomposition import PCA

        pca = PCA(n_components=2)
        s_true = pca.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    # TODO check raise warning

    # ------------------------------------------------------------------
    # NMF
    # ------------------------------------------------------------------

    def test_nmf_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.nmf(s, random_state=1)

        from sklearn.decomposition import NMF

        nmf = NMF(n_components=2, random_state=1)
        s_true = nmf.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)

    # ------------------------------------------------------------------
    # TruncatedSVD
    # ------------------------------------------------------------------

    # Previously also named `test_nmf_tf_simple`, which overrode the NMF
    # test above so that it never ran.
    def test_truncated_svd_tf_simple(self):
        idx = pd.MultiIndex.from_tuples(
            [(0, "a"), (1, "b"), (2, "c")], names=("document", "word")
        )
        s = pd.Series([1, 1, 1], index=idx)
        s = representation.truncated_svd(s, random_state=1)

        from sklearn.decomposition import TruncatedSVD

        svd = TruncatedSVD(n_components=2, random_state=1)
        s_true = svd.fit_transform([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        s_true = pd.Series(s_true.tolist())

        self.assertEqual(s, s_true)
from texthero.
Related Issues (20)
- Module has no name errors HOT 3
- Discussion - stopwords HOT 4
- How to retrieve TDIDF feature names HOT 1
- Question regarding application of this tool to other language documents HOT 1
- spaCy 3 support HOT 1
- here.scatterplot not working
- Can't get texthero to work with current versions of spacy and gensim HOT 1
- kmeans error: __init__() got an unexpected keyword argument 'precompute_distances' HOT 3
- [QUESTION] What is the normalization method used in `top_words()`?
- How can I remove all punctuation except for "@" HOT 2
- `remove_punctuation()` is not removing "\" HOT 2
- Deprecated arguments on kmeans function call HOT 5
- Import error HOT 2
- Import Error (YAML Loader)
- installation error: Could not build wheels for spacy, which is required to install pyproject.toml-based projects HOT 2
- Is there any function to find how the weights are calculated for each word to represent a sentence?
- TextHero Documentation link is producing a 404
- is this package being maintained? HOT 1
- Visualization of PCA on embedding space with multi-labels HOT 1
- ModuleNotFoundError: No module named 'gensim.sklearn_api' on import HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization, using data art.
-
Game
Something interesting about games, to make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from texthero.