From 7cd2a4348f94ecadf522605c67dcaa1ceb7459ff Mon Sep 17 00:00:00 2001
From: clabra <clabra@gmail.com>
Date: Wed, 25 Jun 2014 14:10:39 +0200
Subject: [PATCH 1/3] Phrase feature (especially useful for URLs): try to get a
 slug that is as meaningful as possible, using punctuation to extract a fragment

---
 slugify/__init__.py |  7 +++++--
 slugify/main.py     | 26 +++++++++++++++++++++++++-
 slugify/tests.py    | 18 +++++++++++++++++-
 3 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/slugify/__init__.py b/slugify/__init__.py
index 856def1..2e508d0 100644
--- a/slugify/__init__.py
+++ b/slugify/__init__.py
@@ -8,8 +8,11 @@
 
 slugify_url = Slugify()
 slugify_url.to_lower = True
-slugify_url.stop_words = ('a', 'an', 'the')
-slugify_url.max_length = 200
+#slugify_url.stop_words = ('a', 'an', 'the')
+slugify_url.max_length = 100
+slugify_url.min_length = 35
+slugify_url.separator = '-'
+slugify_url.extract_phrase = True
 
 slugify_filename = Slugify()
 slugify_filename.separator = '_'
diff --git a/slugify/main.py b/slugify/main.py
index 202fe4f..055d2e7 100644
--- a/slugify/main.py
+++ b/slugify/main.py
@@ -65,7 +65,10 @@ class Slugify(object):
     _stop_words = ()
 
     def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_words=(),
-                 to_lower=False, max_length=None, separator=u'-', capitalize=False):
+                 to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False):
+        """Init next parametesr taking in account URL format recommendations: 
+        to_lower = True, max_length = 2000, separator = '-'
+        """
 
         self.pretranslate = pretranslate
         self.translate = translate
@@ -74,8 +77,10 @@ def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_w
 
         self.to_lower = to_lower
         self.max_length = max_length
+        self.min_length = min_length
         self.separator = separator
         self.capitalize = capitalize
+        self.extract_phrase = extract_phrase
 
     def pretranslate_dict_to_function(self, convert_dict):
 
@@ -141,6 +146,23 @@ def sanitize(self, text):
             text = text.replace("'", '').strip()  # remove '
         return filter(None, self.sanitize_re.split(text))  # split by unwanted characters
 
+    def phrase(self, text): 
+        """Try to get an slug as most meaningful as possible using punctuation to extract fragment"""
+        text =  text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase
+        punctuation_marks = [u'\.', u';', u',', u':']
+        len_text = len(text)
+        for mark in punctuation_marks: 
+            r = re.compile(u".*%s" % (mark))
+            m = r.match(text)
+            if m: 
+                phrase = m.group()
+                len_phrase = len(phrase)
+                if len_phrase >= self.min_length and len_phrase < len_text: 
+                    text = phrase
+                    break
+        return text
+
+
     def __call__(self, text, **kwargs):
 
         max_length = kwargs.get('max_length', self.max_length)
@@ -166,6 +188,8 @@ def __call__(self, text, **kwargs):
 
             text = u''.join(text_parts)
 
+        if self.extract_phrase == True:
+            text = self.phrase(text)
         words = self.sanitize(text)
         text = join_words(words, separator, max_length)
 
diff --git a/slugify/tests.py b/slugify/tests.py
index 593d0c9..1763d5f 100644
--- a/slugify/tests.py
+++ b/slugify/tests.py
@@ -54,7 +54,7 @@ def test_slugify_unicode(self):
 class PredefinedSlugifyTestCase(unittest.TestCase):
 
     def test_slugify_url(self):
-        self.assertEqual(slugify_url('The Über article'), 'uber-article')
+        self.assertEqual(slugify_url('The Über article'), 'the-uber-article')
 
     def test_slugify_filename(self):
         self.assertEqual(slugify_filename(u'Дrаft №2.txt'), u'Draft_2.txt')
@@ -254,5 +254,21 @@ def test_deprecated_get_slugify(self):
             self.assertIn("'slugify.get_slugify' is deprecated", str(warning[-1].message))
 
 
+class PhraseSlugifyTestCase(unittest.TestCase):
+
+    def test_slugify_phrase_url(self):
+        text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested."
+        self.assertEqual(slugify_url(text),
+                "someone-must-have-slandered-josef-k")
+
+        text = "The Miss Lonelyhearts of the New York Post-Dispatch (Are you in trouble?—Do-you-need-advice?—Write-to-Miss-Lonelyhearts-and-she-will-help-you) sat at his desk and stared at a piece of white cardboard. —Nathanael West, Miss Lonelyhearts"
+        self.assertEqual(slugify_url(text),
+                "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in-trouble-do-you-need-advice-write")
+
+        text = " I wish either my father or my mother, or indeed both of them, as they were in duty both equally bound to it, had minded what they were about when they begot me; had they duly considered how much depended upon what they were then doing;—that not only the production of a rational Being was concerned in it, but that possibly the happy formation and temperature of his body, perhaps his genius and the very cast of his mind;—and, for aught they knew to the contrary, even the fortunes of his whole house might take their turn from the humours and dispositions which were then uppermost:—Had they duly weighed and considered all this, and proceeded accordingly,—I am verily persuaded I should have made a quite different figure in the world, from that, in which the reader is likely to see me. —Laurence Sterne, Tristram Shandy (1759–1767)"
+        self.assertEqual(slugify_url(text),
+                "i-wish-either-my-father-or-my-mother-or-indeed-both-of-them")
+
+
 if __name__ == '__main__':
     unittest.main()

From a10dad638a202811e671596a72aaf7e4fe6c42b5 Mon Sep 17 00:00:00 2001
From: clabra <clabra@gmail.com>
Date: Wed, 25 Jun 2014 14:31:19 +0200
Subject: [PATCH 2/3] More tests for Phrase feature

---
 slugify/__init__.py |  2 +-
 slugify/main.py     |  4 ++--
 slugify/tests.py    | 27 +++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/slugify/__init__.py b/slugify/__init__.py
index 2e508d0..1215eba 100644
--- a/slugify/__init__.py
+++ b/slugify/__init__.py
@@ -10,7 +10,7 @@
 slugify_url.to_lower = True
 #slugify_url.stop_words = ('a', 'an', 'the')
 slugify_url.max_length = 100
-slugify_url.min_length = 35
+slugify_url.min_length = 25
 slugify_url.separator = '-'
 slugify_url.extract_phrase = True
 
diff --git a/slugify/main.py b/slugify/main.py
index 055d2e7..8258bb6 100644
--- a/slugify/main.py
+++ b/slugify/main.py
@@ -149,7 +149,7 @@ def sanitize(self, text):
     def phrase(self, text): 
         """Try to get an slug as most meaningful as possible using punctuation to extract fragment"""
         text =  text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase
-        punctuation_marks = [u'\.', u';', u',', u':']
+        punctuation_marks = [u'\.', u';', u',', u'\:']
         len_text = len(text)
         for mark in punctuation_marks: 
             r = re.compile(u".*%s" % (mark))
@@ -157,7 +157,7 @@ def phrase(self, text):
             if m: 
                 phrase = m.group()
                 len_phrase = len(phrase)
-                if len_phrase >= self.min_length and len_phrase < len_text: 
+                if len_phrase >= self.min_length and len_phrase <= len_text:
                     text = phrase
                     break
         return text
diff --git a/slugify/tests.py b/slugify/tests.py
index 1763d5f..dfe779a 100644
--- a/slugify/tests.py
+++ b/slugify/tests.py
@@ -255,6 +255,16 @@ def test_deprecated_get_slugify(self):
 
 
 class PhraseSlugifyTestCase(unittest.TestCase):
+    """Tests results for current config of slugify_url
+
+    slugify_url = Slugify()
+    slugify_url.to_lower = True
+    #slugify_url.stop_words = ('a', 'an', 'the')
+    slugify_url.max_length = 100
+    slugify_url.min_length = 25
+    slugify_url.separator = '-'
+    slugify_url.extract_phrase = True
+    """
 
     def test_slugify_phrase_url(self):
         text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested."
@@ -269,6 +279,23 @@ def test_slugify_phrase_url(self):
         self.assertEqual(slugify_url(text),
                 "i-wish-either-my-father-or-my-mother-or-indeed-both-of-them")
 
+        text = "En un lugar de la Mancha, de cuyo nombre no quiero acordarme, vivía un caballero."
+        self.assertEqual(slugify_url(text),
+                "en-un-lugar-de-la-mancha-de-cuyo-nombre-no-quiero-acordarme-vivia-un-caballero")
+
+        # :
+        text = "Este era el nombre del caballero: Don Quijote"
+        self.assertEqual(slugify_url(text),
+                "este-era-el-nombre-del-caballero") # len (phrase) > min_length, cut at punctuation mark
+
+        text = "Su nombre: Don Quijote"
+        self.assertEqual(slugify_url(text),
+                "su-nombre-don-quijote") # len (phrase) < min_length, don't cut at punctuation mark
+
+        # ;
+        text = "Este era el nombre del caballero; Don Quijote"
+        self.assertEqual(slugify_url(text),
+                "este-era-el-nombre-del-caballero") # len (phrase) > min_length, cut at punctuation mark
 
 if __name__ == '__main__':
     unittest.main()

From d087e40ec6941335ee86fa3b457559dc8225e43a Mon Sep 17 00:00:00 2001
From: clabra <clabra@gmail.com>
Date: Tue, 1 Jul 2014 15:44:16 +0200
Subject: [PATCH 3/3] Add avoid_truncated_word feature to avoid cutting the last word

Introduces a Django dependency, since we use django.utils.text.Truncator to
truncate the text
---
 slugify/main.py  | 36 +++++++++++++++++++++++++++++++-----
 slugify/tests.py |  4 ++--
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/slugify/main.py b/slugify/main.py
index 8258bb6..4590889 100644
--- a/slugify/main.py
+++ b/slugify/main.py
@@ -4,6 +4,7 @@
 
 from unidecode import unidecode
 import regex as re
+from django.utils.text import Truncator
 
 
 re.DEFAULT_VERSION = re.V1  # Version 1 behaviour: nested sets and set operations are supported
@@ -65,7 +66,7 @@ class Slugify(object):
     _stop_words = ()
 
     def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_words=(),
-                 to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False):
+                 to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False, truncate_words=False):
         """Init next parametesr taking in account URL format recommendations: 
         to_lower = True, max_length = 2000, separator = '-'
         """
@@ -81,6 +82,7 @@ def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_w
         self.separator = separator
         self.capitalize = capitalize
         self.extract_phrase = extract_phrase
+        self.truncate_words = truncate_words # Is allowed cut words in the middle?
 
     def pretranslate_dict_to_function(self, convert_dict):
 
@@ -146,9 +148,24 @@ def sanitize(self, text):
             text = text.replace("'", '').strip()  # remove '
         return filter(None, self.sanitize_re.split(text))  # split by unwanted characters
 
+    def avoid_truncated_word(self, text): 
+        """Truncate in a way that text will be shorter than max_length and won't be cut in the middle of a word""" 
+        words = text.split()
+        if not words:
+            return text
+        truncator = Truncator(text)
+        last_word = text.split()[-1]
+        text = truncator.chars(self.max_length, '')
+        truncated_last_word = text.split()[-1]
+        if truncated_last_word !=  last_word: 
+            # last word is cut. So, remove it
+            num_words = len(text.split())
+            text = truncator.words(num_words - 1) 
+        return text
+
     def phrase(self, text): 
         """Try to get an slug as most meaningful as possible using punctuation to extract fragment"""
-        text =  text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase
+        text =  self.avoid_truncated_word(text)  # Note we have to cut text here, can't wait after sanitize phase
         punctuation_marks = [u'\.', u';', u',', u'\:']
         len_text = len(text)
         for mark in punctuation_marks: 
@@ -164,6 +181,12 @@ def phrase(self, text):
 
 
     def __call__(self, text, **kwargs):
+        """Use: 
+        slugify_url = Slugify()
+        slugify_url.extract_phrase = True # set parameters
+        ...
+        slugify("Text to slugify")
+        """
 
         max_length = kwargs.get('max_length', self.max_length)
         separator = kwargs.get('separator', self.separator)
@@ -188,9 +211,12 @@ def __call__(self, text, **kwargs):
 
             text = u''.join(text_parts)
 
-        if self.extract_phrase == True:
-            text = self.phrase(text)
-        words = self.sanitize(text)
+        if self.extract_phrase:
+            text = self.phrase(text) # calls self.avoid_truncated_word()
+        elif not self.truncate_words: 
+            text = self.avoid_truncated_word(text)
+
+        words = self.sanitize(text) # leave only secure chars
         text = join_words(words, separator, max_length)
 
         if text and kwargs.get('capitalize', self.capitalize):
diff --git a/slugify/tests.py b/slugify/tests.py
index dfe779a..8011fc2 100644
--- a/slugify/tests.py
+++ b/slugify/tests.py
@@ -269,11 +269,11 @@ class PhraseSlugifyTestCase(unittest.TestCase):
     def test_slugify_phrase_url(self):
         text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested."
         self.assertEqual(slugify_url(text),
-                "someone-must-have-slandered-josef-k")
+                "someone-must-have-slandered-josef-k-for-one-morning-without-having-done-anything-truly-wrong")
 
         text = "The Miss Lonelyhearts of the New York Post-Dispatch (Are you in trouble?—Do-you-need-advice?—Write-to-Miss-Lonelyhearts-and-she-will-help-you) sat at his desk and stared at a piece of white cardboard. —Nathanael West, Miss Lonelyhearts"
         self.assertEqual(slugify_url(text),
-                "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in-trouble-do-you-need-advice-write")
+                "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in")
 
         text = " I wish either my father or my mother, or indeed both of them, as they were in duty both equally bound to it, had minded what they were about when they begot me; had they duly considered how much depended upon what they were then doing;—that not only the production of a rational Being was concerned in it, but that possibly the happy formation and temperature of his body, perhaps his genius and the very cast of his mind;—and, for aught they knew to the contrary, even the fortunes of his whole house might take their turn from the humours and dispositions which were then uppermost:—Had they duly weighed and considered all this, and proceeded accordingly,—I am verily persuaded I should have made a quite different figure in the world, from that, in which the reader is likely to see me. —Laurence Sterne, Tristram Shandy (1759–1767)"
         self.assertEqual(slugify_url(text),