Skip to content

Commit

Permalink
Improve tfidf (#97)
Browse files Browse the repository at this point in the history
* Improve TFIDF. Closes #76

Docstring now includes formula/explanation.

Normalization disabled.

Representation Series is already being handled (although output
is still like before).

Function representation_series_to_flat_series added.

Co-authored-by: Maximilian Krahn <[email protected]>

* Improve TFIDF. Closes #76

Docstring now includes formula/explanation.

Normalization disabled (the option "normalization=None" was "hidden" in the sklearn code, so that turned out to be an easy fix).

Representation Series is already being handled (although output is still like before, using representation_series_to_flat_series).

Function representation_series_to_flat_series added.

Unit tests are changed accordingly, also one with the explicit calculation using the formula.

Co-authored-by: Maximilian Krahn <[email protected]>

* Implement suggested changes to tfidf.

max_features fixed

lowercase=False removed

docstring improved

tests for different arguments added

* Incorporate remote changes.

Co-authored-by: Maximilian Krahn <[email protected]>
  • Loading branch information
henrifroese and mk2510 authored Jul 17, 2020
1 parent a93cc06 commit 1d4d5a0
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 37 deletions.
58 changes: 50 additions & 8 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from texthero import representation
from texthero import preprocessing

Expand All @@ -7,6 +8,7 @@
import doctest
import unittest
import string
import math
import warnings

"""
Expand Down Expand Up @@ -63,15 +65,37 @@ def test_term_frequency_not_tokenized_yet(self):
TF-IDF
"""

def test_idf_single_document(self):
s = pd.Series("a")
def test_tfidf_formula(self):
s = pd.Series(["Hi Bye", "Test Bye Bye"])
s = preprocessing.tokenize(s)
s_true = pd.Series([[1]])
s_true = pd.Series(
[
[
1.0 * (math.log(3 / 3) + 1),
1.0 * (math.log(3 / 2) + 1),
0.0 * (math.log(3 / 2) + 1),
],
[
2.0 * (math.log(3 / 3) + 1),
0.0 * (math.log(3 / 2) + 1),
1.0 * (math.log(3 / 2) + 1),
],
]
)
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_tfidf_single_document(self):
s = pd.Series("a", index=["yo"])
s = preprocessing.tokenize(s)
s_true = pd.Series([[1]], index=["yo"])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_idf_not_tokenized_yet(self):
def test_tfidf_not_tokenized_yet(self):
s = pd.Series("a")
s_true = pd.Series([[1]])
s_true.rename_axis("document", inplace=True)

with warnings.catch_warnings(): # avoid print warning
warnings.simplefilter("ignore")
Expand All @@ -80,10 +104,28 @@ def test_idf_not_tokenized_yet(self):
with self.assertWarns(DeprecationWarning): # check raise warning
representation.tfidf(s)

def test_idf_single_not_lowercase(self):
tfidf_single_smooth = 0.7071067811865475 # TODO

def test_tfidf_single_not_lowercase(self):
s = pd.Series("ONE one")
s = preprocessing.tokenize(s)
s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
s_true = pd.Series([[1.0, 1.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s), s_true)

def test_tfidf_max_features(self):
s = pd.Series("one one two")
s = preprocessing.tokenize(s)
s_true = pd.Series([[2.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, max_features=1), s_true)

def test_tfidf_min_df(self):
s = pd.Series([["one"], ["one", "two"]])
s_true = pd.Series([[1.0], [1.0]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, min_df=2), s_true)

def test_tfidf_max_df(self):
s = pd.Series([["one"], ["one", "two"]])
s_true = pd.Series([[0.0], [1.4054651081081644]])
s_true.rename_axis("document", inplace=True)
self.assertEqual(representation.tfidf(s, max_df=1), s_true)
161 changes: 132 additions & 29 deletions texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
"""

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

from typing import Optional
from typing import Optional, Union, Any

from texthero import preprocessing

Expand All @@ -19,6 +21,72 @@

# from texthero import pandas_ as pd_

"""
Helper
"""


def representation_series_to_flat_series(
    s: Union[pd.Series, pd.Series.sparse],
    index: pd.Index = None,
    fill_missing_with: Any = np.nan,
) -> pd.Series:
    """
    Flatten a multiindexed Representation Series into one list per document.

    The input Series is expected to carry a two-level MultiIndex: the first
    level identifies the document, the second level identifies a single
    feature of that document (e.g. the tfidf score of one word). The result
    has exactly one cell per document, holding a list with one entry per
    feature found anywhere in the input.

    Parameters
    ----------
    s : Sparse Pandas Series or Pandas Series
        The multiindexed Series to flatten.

    index : Pandas Index, optional, default to None
        Index the flattened Series should have. Documents listed here
        that have no entry in ``s`` get a row consisting only of
        ``fill_missing_with``.

    fill_missing_with : Any, default to np.nan
        Value used for (document, feature) pairs that are absent from ``s``.
        Values that are present in ``s`` are never replaced, even when they
        are np.nan themselves (see the example below).

    Returns
    -------
    Pandas Series
        Series whose index is named "document" and whose cells are lists.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word'])
    >>> s = pd.Series([3, np.nan, 4], index=index)
    >>> hero.representation_series_to_flat_series(s, fill_missing_with=0.0)
    document
    doc0    [3.0, 0.0, nan]
    doc1    [0.0, 4.0, 0.0]
    dtype: object

    """
    # Pivot the feature level into columns: one row per document,
    # one column per feature, gaps filled with the requested value.
    per_document = s.unstack(fill_value=fill_missing_with)

    if index is not None:
        # Documents without any value in the Representation Series are
        # missing after unstacking; reindexing makes them "reappear".
        per_document = per_document.reindex(index, fill_value=fill_missing_with)

    # Collapse every row of the table into a plain Python list.
    flattened = pd.Series(per_document.values.tolist(), index=per_document.index)

    return flattened.rename_axis("document")


# Warning message for not-tokenized inputs
_not_tokenized_warning_message = (
"It seems like the given Pandas Series s is not tokenized. This function will"
Expand Down Expand Up @@ -91,49 +159,64 @@ def term_frequency(
return s


def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
def tfidf(
s: pd.Series, max_features=None, min_df=1, max_df=1.0, return_feature_names=False
) -> pd.Series.sparse:
"""
Represent a text-based Pandas Series using TF-IDF.
*Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
calculate the _relative importance_ of the words in a document, taking
into account the words' occurrences in other documents. It consists of two parts:
The *term frequency (tf)* tells us how frequently a term is present in a document,
so tf(document d, term t) = number of times t appears in d.
The *inverse document frequency (idf)* measures how _important_ or _characteristic_
a term is among the whole corpus (i.e. among all documents).
Thus, idf(term t) = log((1 + number of documents) / (1 + number of documents where t is present)) + 1.
Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).
Different from the `sklearn-implementation of tfidf <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`,
this function does *not* normalize the output in any way,
so the result is exactly what you
get applying the formula described above.
The input Series should already be tokenized. If not, it will
be tokenized before tfidf is calculated.
If working with big pandas Series, you might want to limit
the number of features through the max_features parameter.
Parameters
----------
s : Pandas Series
max_features : int, optional
Maximum number of features to keep.
min_df : int, optional. Default to 1.
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
return_features_names : Boolean. Default to False.
If True, return a tuple (*tfidf_series*, *features_names*)
s : Pandas Series (tokenized)
max_features : int, optional, default to None.
If not None, only the max_features most frequent tokens are used.
min_df : int, optional, default to 1.
When building the vocabulary, ignore terms that have a document
frequency (number of documents a term appears in) strictly lower than the given threshold.
max_df : int or double, optional, default to 1.0
When building the vocabulary, ignore terms that have a document
frequency (number of documents a term appears in) strictly higher than the given threshold. This argument makes it possible to remove corpus-specific stop words. When the argument is a float in [0.0, 1.0], the parameter represents a proportion of documents.
return_feature_names: Boolean, optional, default to False
Whether to return the feature (i.e. word) names with the output.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> s = hero.tokenize(s)
>>> hero.tfidf(s)
0 [0.5797386715376657, 0.8148024746671689, 0.0]
1 [0.5797386715376657, 0.0, 0.8148024746671689]
dtype: object
To return the *feature_names*:
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
>>> s = pd.Series(["Hi Bye", "Test Bye Bye"])
>>> s = hero.tokenize(s)
>>> hero.tfidf(s, return_feature_names=True)
(0 [0.5797386715376657, 0.8148024746671689, 0.0]
1 [0.5797386715376657, 0.0, 0.8148024746671689]
dtype: object, ['Sentence', 'one', 'two'])
(document
0 [1.0, 1.4054651081081644, 0.0]
1 [2.0, 0.0, 1.4054651081081644]
dtype: object, ['Bye', 'Hi', 'Test'])
"""

# TODO. In docstring show formula to compute TF-IDF and also avoid using sk-learn if possible.

# Check if input is tokenized. Else, print warning and tokenize.
if not isinstance(s.iloc[0], list):
warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
Expand All @@ -143,15 +226,35 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False)
use_idf=True,
max_features=max_features,
min_df=min_df,
max_df=max_df,
tokenizer=lambda x: x,
preprocessor=lambda x: x,
norm=None, # Disable l1/l2 normalization.
)

tfidf_vectors_csr = tfidf.fit_transform(s)

# Result from sklearn is in Compressed Sparse Row format.
# Pandas Sparse Series can only be initialized from Coordinate format.
tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

# Map word index to word name and keep original index of documents.
feature_names = tfidf.get_feature_names()
s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]]))

s_out.rename_axis(["document", "word"], inplace=True)

# NOTE: Currently: still convert to flat series instead of representation series.
# Will change to return representation series directly in Version 2.
s_out = representation_series_to_flat_series(
s_out, fill_missing_with=0.0, index=s.index
)
s = pd.Series(tfidf.fit_transform(s).toarray().tolist(), index=s.index)

if return_feature_names:
return (s, tfidf.get_feature_names())
return s_out, feature_names
else:
return s
return s_out


"""
Expand Down

0 comments on commit 1d4d5a0

Please sign in to comment.