jbesomi · henrifroese · Jul 16, 2020 · Jul 16, 2020 · Jul 16, 2020 · Jul 16, 2020
diff --git a/setup.cfg b/setup.cfg
@@ -38,6 +38,7 @@ install_requires =
     unidecode>=1.1.1
     gensim>=3.6.0
     matplotlib>=3.1.0
+    wrapt>=1.12.1
 # TODO pick the correct version.
 [options.extras_require]
 dev =

diff --git a/tests/test_indexes.py b/tests/test_indexes.py
@@ -9,6 +9,7 @@
 
 # Define valid inputs for different functions.
 s_text = pd.Series(["Test"], index=[5])
+s_text_list = pd.Series([["Test", "Test2"]], index=[5])
 s_numeric = pd.Series([5.0], index=[5])
 s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])
 
@@ -60,7 +61,7 @@
         representation.term_frequency,
         (preprocessing.tokenize(s_text),),
     ],
-    ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),)],
+    # ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),),],
     ["pca", representation.pca, (s_numeric_lists, 0)],
     ["nmf", representation.nmf, (s_numeric_lists,)],
     ["tsne", representation.tsne, (s_numeric_lists,)],

diff --git a/tests/test_nan.py b/tests/test_nan.py
@@ -0,0 +1,97 @@
+import pandas as pd
+import numpy as np
+from texthero import nlp, visualization, preprocessing, representation
+
+from . import PandasTestCase
+import unittest
+import string
+from parameterized import parameterized
+
+# Define valid inputs for different functions.
+s_nan = pd.Series(["test1", np.NaN, "test2", pd.NA])
+s_numeric_and_nan_lists = pd.Series([[5.0, 5.0], [6.0, 6.0], np.nan])
+
+# Define all test cases. Every test case is a list of
+# [name of test case, function to test, tuple of valid input for the function].
+# The first element of the valid input has to be the Pandas Series we want to
+# test, containing at least one np.nan. If a function needs something different,
+# a separate test case has to be implemented in the class below.
+# The tests are run by AbstractNaNTest below through the
+# @parameterized.expand decorator.
+# The names are expanded automatically, so e.g. "named_entities" creates a
+# test case named like test_ignores_nan_0_named_entities (index is inserted).
+
+test_cases_nlp = [
+    ["named_entities", nlp.named_entities, (s_nan,)],
+    ["noun_chunks", nlp.noun_chunks, (s_nan,)],
+]
+
+test_cases_preprocessing = [
+    ["lowercase", preprocessing.lowercase, (s_nan,)],
+    ["replace_digits", preprocessing.replace_digits, (s_nan, "")],
+    ["remove_digits", preprocessing.remove_digits, (s_nan,)],
+    ["replace_punctuation", preprocessing.replace_punctuation, (s_nan, "")],
+    ["remove_punctuation", preprocessing.remove_punctuation, (s_nan,)],
+    ["remove_diacritics", preprocessing.remove_diacritics, (s_nan,)],
+    ["remove_whitespace", preprocessing.remove_whitespace, (s_nan,)],
+    ["replace_stopwords", preprocessing.replace_stopwords, (s_nan, "")],
+    ["remove_stopwords", preprocessing.remove_stopwords, (s_nan,)],
+    ["stem", preprocessing.stem, (s_nan,)],
+    ["remove_round_brackets", preprocessing.remove_round_brackets, (s_nan,)],
+    ["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_nan,)],
+    ["remove_square_brackets", preprocessing.remove_square_brackets, (s_nan,)],
+    ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_nan,)],
+    ["remove_brackets", preprocessing.remove_brackets, (s_nan,)],
+    ["remove_html_tags", preprocessing.remove_html_tags, (s_nan,)],
+    ["tokenize", preprocessing.tokenize, (s_nan,)],
+    ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_nan,)],
+    ["replace_urls", preprocessing.replace_urls, (s_nan, "")],
+    ["remove_urls", preprocessing.remove_urls, (s_nan,)],
+    ["replace_tags", preprocessing.replace_tags, (s_nan, "")],
+    ["remove_tags", preprocessing.remove_tags, (s_nan,)],
+]
+
+test_cases_representation = [
+    [
+        "term_frequency",
+        representation.term_frequency,
+        (preprocessing.tokenize(s_nan),),
+    ],
+    # ["tfidf", representation.tfidf, (preprocessing.tokenize(s_nan),)],
+    ["pca", representation.pca, (s_numeric_and_nan_lists, 0)],
+    ["nmf", representation.nmf, (s_numeric_and_nan_lists,)],
+    ["tsne", representation.tsne, (s_numeric_and_nan_lists,)],
+    ["kmeans", representation.kmeans, (s_numeric_and_nan_lists, 1)],
+    ["dbscan", representation.dbscan, (s_numeric_and_nan_lists,)],
+    ["meanshift", representation.meanshift, (s_numeric_and_nan_lists,)],
+]
+
+test_cases_visualization = []
+
+test_cases = (
+    test_cases_nlp
+    + test_cases_preprocessing
+    + test_cases_representation
+    + test_cases_visualization
+)
+
+
+class AbstractNaNTest(PandasTestCase):
+    """
+    Class for np.nan test cases. For every case in
+    test_cases, checks that the result is missing exactly
+    where the input Series is missing. Functions needing
+    other inputs require manually written tests.
+
+    """
+
+    """
+    Tests defined in test_cases above.
+    """
+    # valid_input[0] is the input Series; `name` only labels the generated test.
+    @parameterized.expand(test_cases)
+    def test_ignores_nan(self, name, test_function, valid_input):
+        s = valid_input[0]
+        result_s = test_function(*valid_input)
+        t_same = pd.Series(s.values)
+        self.assertTrue(result_s.isna().equals(t_same.isna()))
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
@@ -44,17 +44,13 @@ def test_noun_chunks(self):
 
     def test_count_sentences(self):
         s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
-        s_true = pd.Series(3)
+        s_true = pd.Series(3, dtype=object)
         self.assertEqual(nlp.count_sentences(s), s_true)
 
     def test_count_sentences_numeric(self):
         s = pd.Series([13.0, 42.0])
         self.assertRaises(TypeError, nlp.count_sentences, s)
 
-    def test_count_sentences_missing_value(self):
-        s = pd.Series(["Test.", np.nan])
-        self.assertRaises(TypeError, nlp.count_sentences, s)
-
     def test_count_sentences_index(self):
         s = pd.Series(["Test"], index=[5])
         counted_sentences_s = nlp.count_sentences(s)

diff --git a/texthero/_helper.py b/texthero/_helper.py
@@ -0,0 +1,89 @@
+"""
+Useful helper functions for the texthero library.
+"""
+
+import functools
+import wrapt
+import numpy as np
+
+
+"""
+Decorators.
+"""
+
+
+def handle_nans(wrapped=None, input_only=False):
+    """
+    Decorator to make a function not change NaN values.
+
+    The Series (first positional argument) is copied, its NaN
+    cells are dropped, and the function only sees the rest.
+
+    The function must take as first input a Series s,
+    manipulate that Series (e.g. removing diacritics)
+    and return the Series s alone or first in a tuple.
+
+    Parameters
+    ----------
+    wrapped : callable, optional
+        The function to decorate. It is None when the decorator
+        is used with arguments, e.g. @handle_nans(input_only=True).
+    input_only : bool, default False
+        Set to True when the output returned by the function
+        is _not_ the input Series with (some) cells changed
+        (e.g. top_words); the output is then returned as is.
+
+
+    Examples
+    --------
+    >>> from texthero._helper import *
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> @handle_nans
+    ... def replace_a_with_b(s):
+    ...     return s.str.replace("a", "b")
+    >>> s_with_nan = pd.Series(["Test a", np.nan])
+    >>> replace_a_with_b(s_with_nan)
+    0    Test b
+    1       NaN
+    dtype: object
+    """
+    if wrapped is None:
+        return functools.partial(handle_nans, input_only=input_only)
+
+    @wrapt.decorator
+    def wrapper(wrapped, instance, args, kwargs):
+        # The Series must be passed positionally; a keyword-only call has no args[0].
+        # Despite its name, nan_mask is True for the cells that are NOT NaN.
+        s = args[0]
+        nan_mask = ~s.isna()
+        # Work on a copy so that writing results back leaves the caller's Series untouched.
+        s_result = s.copy()
+        s_result_without_nans = s_result[nan_mask]
+
+        # Change input Series so the function will only work on the non-nan fields.
+        args = (
+            (s_result_without_nans,) + args[1:]
+            if args[1:]
+            else (s_result_without_nans,)
+        )
+
+        # Execute the function and get the result.
+        output = wrapped(*args, **kwargs)
+
+        # Only refill NaN cells when the output is the (modified) input Series.
+        if not input_only:
+            # Write the first output (the Series) into the non-NaN cells of the copy.
+            if not isinstance(output, tuple):
+                output = (output,)
+            s_result[nan_mask] = output[0]
+
+            # Recover index name if set.
+            if output[0].index.name:
+                s_result.index.name = output[0].index.name
+
+            # A single result is returned bare; extra outputs keep the tuple form.
+            output = (s_result,) + output[1:] if output[1:] else s_result
+
+        return output
+
+    return wrapper(wrapped)
diff --git a/texthero/nlp.py b/texthero/nlp.py
@@ -5,7 +5,10 @@
 import spacy
 import pandas as pd
 
+from texthero._helper import handle_nans
 
+
+@handle_nans
 def named_entities(s, package="spacy"):
     """
     Return named-entities.
@@ -57,6 +60,7 @@ def named_entities(s, package="spacy"):
     return pd.Series(entities, index=s.index)
 
 
+@handle_nans
 def noun_chunks(s):
     """
     Return noun chunks (noun phrases).
@@ -101,6 +105,7 @@ def noun_chunks(s):
     return pd.Series(noun_chunks, index=s.index)
 
 
+@handle_nans
 def count_sentences(s: pd.Series) -> pd.Series:
     """
     Count the number of sentences per cell in a Pandas Series.
@@ -117,7 +122,7 @@ def count_sentences(s: pd.Series) -> pd.Series:
     >>> hero.count_sentences(s)
     0    2
     1    3
-    dtype: int64
+    dtype: object
     """
     number_of_sentences = []