voronind · clabra · Jun 25, 2014 · Jun 25, 2014 · Jul 1, 2014
diff --git a/slugify/__init__.py b/slugify/__init__.py
@@ -8,8 +8,11 @@
 
 slugify_url = Slugify()
 slugify_url.to_lower = True
-slugify_url.stop_words = ('a', 'an', 'the')
-slugify_url.max_length = 200
+# Stop words are no longer stripped (previously: slugify_url.stop_words = ('a', 'an', 'the')).
+slugify_url.max_length = 100
+slugify_url.min_length = 25  # minimum length for a phrase cut at a punctuation mark
+slugify_url.separator = '-'
+slugify_url.extract_phrase = True  # slugify the phrase extracted by Slugify.phrase()
 
 slugify_filename = Slugify()
 slugify_filename.separator = '_'

diff --git a/slugify/main.py b/slugify/main.py
@@ -4,6 +4,7 @@
 
 from unidecode import unidecode
 import regex as re
+from django.utils.text import Truncator
 
 
 re.DEFAULT_VERSION = re.V1  # Version 1 behaviour: nested sets and set operations are supported
@@ -65,7 +66,10 @@ class Slugify(object):
     _stop_words = ()
 
     def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_words=(),
-                 to_lower=False, max_length=None, separator=u'-', capitalize=False):
+                 to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False, truncate_words=False):
+        """Init the following parameters, taking URL format recommendations into account:
+        to_lower = True, max_length = 2000, separator = '-'
+        """
 
         self.pretranslate = pretranslate
         self.translate = translate
@@ -74,8 +78,11 @@ def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_w
 
         self.to_lower = to_lower
         self.max_length = max_length
+        self.min_length = min_length
         self.separator = separator
         self.capitalize = capitalize
+        self.extract_phrase = extract_phrase
+        self.truncate_words = truncate_words # Is allowed cut words in the middle?
 
     def pretranslate_dict_to_function(self, convert_dict):
 
@@ -141,7 +148,45 @@ def sanitize(self, text):
             text = text.replace("'", '').strip()  # remove '
         return filter(None, self.sanitize_re.split(text))  # split by unwanted characters
 
+    def avoid_truncated_word(self, text):
+        """Cut text to self.max_length chars, dropping the last kept word when it differs from the original's last word; blank text or max_length None returns text as-is."""
+        words = text.split()
+        if not words or self.max_length is None:
+            return text  # nothing to cut, and Truncator.chars() needs a numeric limit
+        truncator = Truncator(text)
+        shortened = truncator.chars(self.max_length, '')  # '' = append no truncation suffix
+        kept_words = shortened.split()
+        if not kept_words:
+            return shortened  # no word survived the cut, so there is no last word to inspect
+        if kept_words[-1] != words[-1]:
+            # Last kept word differs from the original's last word: assume it was cut and drop it (words() uses its default truncation text).
+            shortened = truncator.words(len(kept_words) - 1)
+        return shortened
+
+    def phrase(self, text):
+        """Extract a meaningful leading fragment of text by cutting it at punctuation.
+
+        The text is first shortened by avoid_truncated_word(). Then '.', ';', ',' and ':' are tried in
+        that order; the fragment ending at the last occurrence of the mark is returned as soon as one
+        has at least min_length chars (None means no minimum). Otherwise the shortened text is returned.
+        """
+        text = self.avoid_truncated_word(text)  # must be cut here; it cannot wait until after the sanitize phase
+        min_length = self.min_length or 0
+        for mark in (u'.', u';', u',', u':'):
+            # Greedy '.*' anchored at the start reaches the LAST mark; '.' does not cross a newline.
+            found = re.match(u'.*' + re.escape(mark), text)
+            if found and len(found.group()) >= min_length:
+                return found.group()
+        return text
+
+
     def __call__(self, text, **kwargs):
+        """Usage:
+        slugify_url = Slugify()
+        slugify_url.extract_phrase = True  # set parameters
+        ...
+        slugify_url("Text to slugify")
+        """
 
         max_length = kwargs.get('max_length', self.max_length)
         separator = kwargs.get('separator', self.separator)
@@ -166,7 +211,12 @@ def __call__(self, text, **kwargs):
 
             text = u''.join(text_parts)
 
-        words = self.sanitize(text)
+        if self.extract_phrase:
+            text = self.phrase(text) # calls self.avoid_truncated_word()
+        elif not self.truncate_words: 
+            text = self.avoid_truncated_word(text)
+
+        words = self.sanitize(text) # leave only secure chars
         text = join_words(words, separator, max_length)
 
         if text and kwargs.get('capitalize', self.capitalize):

diff --git a/slugify/tests.py b/slugify/tests.py
@@ -54,7 +54,7 @@ def test_slugify_unicode(self):
 class PredefinedSlugifyTestCase(unittest.TestCase):
 
     def test_slugify_url(self):
-        self.assertEqual(slugify_url('The Über article'), 'uber-article')
+        self.assertEqual(slugify_url('The Über article'), 'the-uber-article')
 
     def test_slugify_filename(self):
         self.assertEqual(slugify_filename(u'Дrаft №2.txt'), u'Draft_2.txt')
@@ -254,5 +254,48 @@ def test_deprecated_get_slugify(self):
             self.assertIn("'slugify.get_slugify' is deprecated", str(warning[-1].message))
 
 
+class PhraseSlugifyTestCase(unittest.TestCase):
+    """Phrase-extraction results for the predefined slugify_url instance.
+
+    The expected slugs rely on the current configuration of slugify_url:
+        to_lower = True, max_length = 100, min_length = 25,
+        separator = '-', extract_phrase = True and no stop words.
+    """
+
+    def test_slugify_phrase_url(self):
+        """Each sample text must slugify to the slug listed next to it."""
+        samples = (
+            # long English sentence
+            ("Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested.",
+             "someone-must-have-slandered-josef-k-for-one-morning-without-having-done-anything-truly-wrong"),
+
+            # very long hyphenated word inside parentheses
+            ("The Miss Lonelyhearts of the New York Post-Dispatch (Are you in trouble?—Do-you-need-advice?—Write-to-Miss-Lonelyhearts-and-she-will-help-you) sat at his desk and stared at a piece of white cardboard. —Nathanael West, Miss Lonelyhearts",
+             "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in"),
+
+            # leading space, dashes and semicolons
+            (" I wish either my father or my mother, or indeed both of them, as they were in duty both equally bound to it, had minded what they were about when they begot me; had they duly considered how much depended upon what they were then doing;—that not only the production of a rational Being was concerned in it, but that possibly the happy formation and temperature of his body, perhaps his genius and the very cast of his mind;—and, for aught they knew to the contrary, even the fortunes of his whole house might take their turn from the humours and dispositions which were then uppermost:—Had they duly weighed and considered all this, and proceeded accordingly,—I am verily persuaded I should have made a quite different figure in the world, from that, in which the reader is likely to see me. —Laurence Sterne, Tristram Shandy (1759–1767)",
+             "i-wish-either-my-father-or-my-mother-or-indeed-both-of-them"),
+
+            # Spanish sentence with an accented character
+            ("En un lugar de la Mancha, de cuyo nombre no quiero acordarme, vivía un caballero.",
+             "en-un-lugar-de-la-mancha-de-cuyo-nombre-no-quiero-acordarme-vivia-un-caballero"),
+
+            # ':' with len(phrase) > min_length: cut at the punctuation mark
+            ("Este era el nombre del caballero: Don Quijote",
+             "este-era-el-nombre-del-caballero"),
+
+            # ':' with len(phrase) < min_length: don't cut at the punctuation mark
+            ("Su nombre: Don Quijote",
+             "su-nombre-don-quijote"),
+
+            # ';' with len(phrase) > min_length: cut at the punctuation mark
+            ("Este era el nombre del caballero; Don Quijote",
+             "este-era-el-nombre-del-caballero"),
+        )
+        # Checked in order; the first mismatch fails the test.
+        for text, expected in samples:
+            self.assertEqual(slugify_url(text), expected)
+
 if __name__ == '__main__':
     unittest.main()