From 7cd2a4348f94ecadf522605c67dcaa1ceb7459ff Mon Sep 17 00:00:00 2001 From: clabra Date: Wed, 25 Jun 2014 14:10:39 +0200 Subject: [PATCH 1/3] Phrase feature (especially useful for URLs): try to get a slug that is as meaningful as possible, using punctuation to extract a fragment --- slugify/__init__.py | 7 +++++-- slugify/main.py | 26 +++++++++++++++++++++++++- slugify/tests.py | 18 +++++++++++++++++- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 856def1..2e508d0 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -8,8 +8,11 @@ slugify_url = Slugify() slugify_url.to_lower = True -slugify_url.stop_words = ('a', 'an', 'the') -slugify_url.max_length = 200 +#slugify_url.stop_words = ('a', 'an', 'the') +slugify_url.max_length = 100 +slugify_url.min_length = 35 +slugify_url.separator = '-' +slugify_url.extract_phrase = True slugify_filename = Slugify() slugify_filename.separator = '_' diff --git a/slugify/main.py b/slugify/main.py index 202fe4f..055d2e7 100644 --- a/slugify/main.py +++ b/slugify/main.py @@ -65,7 +65,10 @@ class Slugify(object): _stop_words = () def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_words=(), - to_lower=False, max_length=None, separator=u'-', capitalize=False): + to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False): + """Init next parametesr taking in account URL format recommendations: + to_lower = True, max_length = 2000, separator = '-' + """ self.pretranslate = pretranslate self.translate = translate @@ -74,8 +77,10 @@ def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_w self.to_lower = to_lower self.max_length = max_length + self.min_length = min_length self.separator = separator self.capitalize = capitalize + self.extract_phrase = extract_phrase def pretranslate_dict_to_function(self, convert_dict): @@ -141,6 +146,23 @@ def sanitize(self, text): text = 
text.replace("'", '').strip() # remove ' return filter(None, self.sanitize_re.split(text)) # split by unwanted characters + def phrase(self, text): + """Try to get an slug as most meaningful as possible using punctuation to extract fragment""" + text = text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase + punctuation_marks = [u'\.', u';', u',', u':'] + len_text = len(text) + for mark in punctuation_marks: + r = re.compile(u".*%s" % (mark)) + m = r.match(text) + if m: + phrase = m.group() + len_phrase = len(phrase) + if len_phrase >= self.min_length and len_phrase < len_text: + text = phrase + break + return text + + def __call__(self, text, **kwargs): max_length = kwargs.get('max_length', self.max_length) @@ -166,6 +188,8 @@ def __call__(self, text, **kwargs): text = u''.join(text_parts) + if self.extract_phrase == True: + text = self.phrase(text) words = self.sanitize(text) text = join_words(words, separator, max_length) diff --git a/slugify/tests.py b/slugify/tests.py index 593d0c9..1763d5f 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -54,7 +54,7 @@ def test_slugify_unicode(self): class PredefinedSlugifyTestCase(unittest.TestCase): def test_slugify_url(self): - self.assertEqual(slugify_url('The Über article'), 'uber-article') + self.assertEqual(slugify_url('The Über article'), 'the-uber-article') def test_slugify_filename(self): self.assertEqual(slugify_filename(u'Дrаft №2.txt'), u'Draft_2.txt') @@ -254,5 +254,21 @@ def test_deprecated_get_slugify(self): self.assertIn("'slugify.get_slugify' is deprecated", str(warning[-1].message)) +class PhraseSlugifyTestCase(unittest.TestCase): + + def test_slugify_phrase_url(self): + text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested." 
+ self.assertEqual(slugify_url(text), + "someone-must-have-slandered-josef-k") + + text = "The Miss Lonelyhearts of the New York Post-Dispatch (Are you in trouble?—Do-you-need-advice?—Write-to-Miss-Lonelyhearts-and-she-will-help-you) sat at his desk and stared at a piece of white cardboard. —Nathanael West, Miss Lonelyhearts" + self.assertEqual(slugify_url(text), + "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in-trouble-do-you-need-advice-write") + + text = " I wish either my father or my mother, or indeed both of them, as they were in duty both equally bound to it, had minded what they were about when they begot me; had they duly considered how much depended upon what they were then doing;—that not only the production of a rational Being was concerned in it, but that possibly the happy formation and temperature of his body, perhaps his genius and the very cast of his mind;—and, for aught they knew to the contrary, even the fortunes of his whole house might take their turn from the humours and dispositions which were then uppermost:—Had they duly weighed and considered all this, and proceeded accordingly,—I am verily persuaded I should have made a quite different figure in the world, from that, in which the reader is likely to see me. 
—Laurence Sterne, Tristram Shandy (1759–1767)" + self.assertEqual(slugify_url(text), + "i-wish-either-my-father-or-my-mother-or-indeed-both-of-them") + + if __name__ == '__main__': unittest.main() From a10dad638a202811e671596a72aaf7e4fe6c42b5 Mon Sep 17 00:00:00 2001 From: clabra Date: Wed, 25 Jun 2014 14:31:19 +0200 Subject: [PATCH 2/3] More tests for Phrase feature --- slugify/__init__.py | 2 +- slugify/main.py | 4 ++-- slugify/tests.py | 27 +++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 2e508d0..1215eba 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -10,7 +10,7 @@ slugify_url.to_lower = True #slugify_url.stop_words = ('a', 'an', 'the') slugify_url.max_length = 100 -slugify_url.min_length = 35 +slugify_url.min_length = 25 slugify_url.separator = '-' slugify_url.extract_phrase = True diff --git a/slugify/main.py b/slugify/main.py index 055d2e7..8258bb6 100644 --- a/slugify/main.py +++ b/slugify/main.py @@ -149,7 +149,7 @@ def sanitize(self, text): def phrase(self, text): """Try to get an slug as most meaningful as possible using punctuation to extract fragment""" text = text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase - punctuation_marks = [u'\.', u';', u',', u':'] + punctuation_marks = [u'\.', u';', u',', u'\:'] len_text = len(text) for mark in punctuation_marks: r = re.compile(u".*%s" % (mark)) @@ -157,7 +157,7 @@ def phrase(self, text): if m: phrase = m.group() len_phrase = len(phrase) - if len_phrase >= self.min_length and len_phrase < len_text: + if len_phrase >= self.min_length and len_phrase <= len_text: text = phrase break return text diff --git a/slugify/tests.py b/slugify/tests.py index 1763d5f..dfe779a 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -255,6 +255,16 @@ def test_deprecated_get_slugify(self): class PhraseSlugifyTestCase(unittest.TestCase): + """Tests results for current config of 
slugify_url + + slugify_url = Slugify() + slugify_url.to_lower = True + #slugify_url.stop_words = ('a', 'an', 'the') + slugify_url.max_length = 100 + slugify_url.min_length = 25 + slugify_url.separator = '-' + slugify_url.extract_phrase = True + """ def test_slugify_phrase_url(self): text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested." @@ -269,6 +279,23 @@ def test_slugify_phrase_url(self): self.assertEqual(slugify_url(text), "i-wish-either-my-father-or-my-mother-or-indeed-both-of-them") + text = "En un lugar de la Mancha, de cuyo nombre no quiero acordarme, vivía un caballero." + self.assertEqual(slugify_url(text), + "en-un-lugar-de-la-mancha-de-cuyo-nombre-no-quiero-acordarme-vivia-un-caballero") + + # : + text = "Este era el nombre del caballero: Don Quijote" + self.assertEqual(slugify_url(text), + "este-era-el-nombre-del-caballero") # len (phrase) > min_length, cut at punctuation mark + + text = "Su nombre: Don Quijote" + self.assertEqual(slugify_url(text), + "su-nombre-don-quijote") # len (phrase) < min_length, don't cut at punctuation mark + + # ; + text = "Este era el nombre del caballero; Don Quijote" + self.assertEqual(slugify_url(text), + "este-era-el-nombre-del-caballero") # len (phrase) > min_length, cut at punctuation mark if __name__ == '__main__': unittest.main() From d087e40ec6941335ee86fa3b457559dc8225e43a Mon Sep 17 00:00:00 2001 From: clabra Date: Tue, 1 Jul 2014 15:44:16 +0200 Subject: [PATCH 3/3] Add avoid_truncated_word feature to avoid cutting the last word. Introduces a Django dependency since we use django.utils.text methods to manage text --- slugify/main.py | 36 +++++++++++++++++++++++++++++++----- slugify/tests.py | 4 ++-- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/slugify/main.py b/slugify/main.py index 8258bb6..4590889 100644 --- a/slugify/main.py +++ b/slugify/main.py @@ -4,6 +4,7 @@ from unidecode import unidecode import regex as re +from django.utils.text 
import Truncator re.DEFAULT_VERSION = re.V1 # Version 1 behaviour: nested sets and set operations are supported @@ -65,7 +66,7 @@ class Slugify(object): _stop_words = () def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_words=(), - to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False): + to_lower=False, max_length=2000, min_length=25, separator=u'-', capitalize=False, extract_phrase=False, truncate_words=False): """Init next parametesr taking in account URL format recommendations: to_lower = True, max_length = 2000, separator = '-' """ @@ -81,6 +82,7 @@ def __init__(self, pretranslate=None, translate=unidecode, safe_chars='', stop_w self.separator = separator self.capitalize = capitalize self.extract_phrase = extract_phrase + self.truncate_words = truncate_words # Is allowed cut words in the middle? def pretranslate_dict_to_function(self, convert_dict): @@ -146,9 +148,24 @@ def sanitize(self, text): text = text.replace("'", '').strip() # remove ' return filter(None, self.sanitize_re.split(text)) # split by unwanted characters + def avoid_truncated_word(self, text): + """Truncate in a way that text will be shorter than max_length and won't be cut in the middle of a word""" + words = text.split() + if not words: + return text + truncator = Truncator(text) + last_word = text.split()[-1] + text = truncator.chars(self.max_length, '') + truncated_last_word = text.split()[-1] + if truncated_last_word != last_word: + # last word is cut. 
So, remove it + num_words = len(text.split()) + text = truncator.words(num_words - 1) + return text + def phrase(self, text): """Try to get an slug as most meaningful as possible using punctuation to extract fragment""" - text = text[:self.max_length] # Note we have to cut text here, can't wait after sanitize phase + text = self.avoid_truncated_word(text) # Note we have to cut text here, can't wait after sanitize phase punctuation_marks = [u'\.', u';', u',', u'\:'] len_text = len(text) for mark in punctuation_marks: @@ -164,6 +181,12 @@ def phrase(self, text): def __call__(self, text, **kwargs): + """Use: + slugify_url = Slugify() + slugify_url.extract_phrase = True # set parameters + ... + slugify("Text to slugify") + """ max_length = kwargs.get('max_length', self.max_length) separator = kwargs.get('separator', self.separator) @@ -188,9 +211,12 @@ def __call__(self, text, **kwargs): text = u''.join(text_parts) - if self.extract_phrase == True: - text = self.phrase(text) - words = self.sanitize(text) + if self.extract_phrase: + text = self.phrase(text) # calls self.avoid_truncated_word() + elif not self.truncate_words: + text = self.avoid_truncated_word(text) + + words = self.sanitize(text) # leave only secure chars text = join_words(words, separator, max_length) if text and kwargs.get('capitalize', self.capitalize): diff --git a/slugify/tests.py b/slugify/tests.py index dfe779a..8011fc2 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -269,11 +269,11 @@ class PhraseSlugifyTestCase(unittest.TestCase): def test_slugify_phrase_url(self): text = "Someone must have slandered Josef K., for one morning, without having done anything truly wrong, he was arrested." 
self.assertEqual(slugify_url(text), - "someone-must-have-slandered-josef-k") + "someone-must-have-slandered-josef-k-for-one-morning-without-having-done-anything-truly-wrong") text = "The Miss Lonelyhearts of the New York Post-Dispatch (Are you in trouble?—Do-you-need-advice?—Write-to-Miss-Lonelyhearts-and-she-will-help-you) sat at his desk and stared at a piece of white cardboard. —Nathanael West, Miss Lonelyhearts" self.assertEqual(slugify_url(text), - "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in-trouble-do-you-need-advice-write") + "the-miss-lonelyhearts-of-the-new-york-post-dispatch-are-you-in") text = " I wish either my father or my mother, or indeed both of them, as they were in duty both equally bound to it, had minded what they were about when they begot me; had they duly considered how much depended upon what they were then doing;—that not only the production of a rational Being was concerned in it, but that possibly the happy formation and temperature of his body, perhaps his genius and the very cast of his mind;—and, for aught they knew to the contrary, even the fortunes of his whole house might take their turn from the humours and dispositions which were then uppermost:—Had they duly weighed and considered all this, and proceeded accordingly,—I am verily persuaded I should have made a quite different figure in the world, from that, in which the reader is likely to see me. —Laurence Sterne, Tristram Shandy (1759–1767)" self.assertEqual(slugify_url(text),