Merge branch 'release/v1.5'

deanishe · May 11, 2014 · 1b52ea7 · 1b52ea7
2 parents 4bdaa1c + e7ea627
commit 1b52ea7
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 106 deletions.
diff --git a/TODO b/TODO
@@ -13,6 +13,7 @@ web.py:
 workflow.py:
 	- automatically add `lib` and/or `packages` subdirectory to `sys.path`?
 	- move "magic" args to class-level dictionary, so authors can add their own.
+	- optimise `filter` by memoising expensive operations, e.g. `pattern`/`search` generation @done(14-05-11 23:13)
 	ui.py:
 	Possibly provide access to dialog boxes, notifications w/out having to use AppleScript
 	- Yes/no dialog

diff --git a/alfred-workflow.zip b/alfred-workflow.zip
diff --git a/doc/howto.rst b/doc/howto.rst
@@ -250,9 +250,10 @@ description of the algorithm and match flags).
 
 **Note:** By default, :meth:`Workflow.filter() <workflow.workflow.Workflow.filter>`
 will match and return anything that contains all the characters in ``query``
-in the same order, regardless of case. It's very likely that you'll want to set
-the standard a little higher. See :ref:`restricting-results` for info on how
-to do that.
+in the same order, regardless of case. Not only can this lead to unacceptable
+performance when working with thousands of results, but it's also very likely
+that you'll want to set the standard a little higher.
+See :ref:`restricting-results` for info on how to do that.
 
 To use :meth:`Workflow.filter() <workflow.workflow.Workflow.filter>`, pass it
 a query, a list of items to filter and sort, and if your list contains items
@@ -376,15 +377,17 @@ You can set match rules using bitwise operators, so ``|`` to combine them or
 
 **Note:** ``MATCH_ALLCHARS`` is particularly slow and provides the
 worst matches. You should consider excluding it, especially if you're calling
-:meth:`Workflow.filter() <workflow.workflow.Workflow.filter>` with > 5000 items.
+:meth:`Workflow.filter() <workflow.workflow.Workflow.filter>` with more than a
+few hundred items or expect multi-word queries.
 
 Diacritic folding
 -----------------
 
-By default, :meth:`Workflow.filter() <workflow.workflow.Workflow.filter>` will fold non-ASCII characters
-to ASCII equivalents (e.g. *é* -> *e*, *ü* -> *u*) if the ``query`` contains
-only ASCII characters. This behaviour can be turned off by passing
-``fold_diacritics=False`` to :meth:`Workflow.filter() <workflow.workflow.Workflow.filter>`.
+By default, :meth:`Workflow.filter() <workflow.workflow.Workflow.filter>`
+will fold non-ASCII characters to ASCII equivalents (e.g. *é* -> *e*, *ü* -> *u*)
+if the ``query`` contains only ASCII characters. This behaviour can be turned
+off by passing ``fold_diacritics=False`` to
+:meth:`Workflow.filter() <workflow.workflow.Workflow.filter>`.
 
 **Note:** To keep the library small, only a subset of European languages are
 supported. The `Unidecode <https://pypi.python.org/pypi/Unidecode>`_ library

diff --git a/workflow/__init__.py b/workflow/__init__.py
@@ -108,7 +108,7 @@ def main(wf):
 
 """
 
-__version__ = '1.4.4'
+__version__ = '1.5'
 
 from .workflow import Workflow, PasswordNotFound, KeychainError
 from .workflow import (ICON_ERROR, ICON_WARNING, ICON_NOTE, ICON_INFO,

diff --git a/workflow/background.py b/workflow/background.py
@@ -159,7 +159,7 @@ def _background(stdin='/dev/null', stdout='/dev/null',
         pid = os.fork()
         if pid > 0:
             sys.exit(0)  # Exit first parent.
-    except OSError, e:
+    except OSError as e:
         log.critical("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
         sys.exit(1)
     # Decouple from parent environment.
@@ -171,7 +171,7 @@ def _background(stdin='/dev/null', stdout='/dev/null',
         pid = os.fork()
         if pid > 0:
             sys.exit(0)  # Exit second parent.
-    except OSError, e:
+    except OSError as e:
         log.critical("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
         sys.exit(1)
     # Now I am a daemon!
@@ -201,7 +201,7 @@ def run_in_background(name, args, **kwargs):
     """
 
     if is_running(name):
-        log.info('Task `{}` is already running')
+        log.info('Task `{}` is already running'.format(name))
         return
 
     argcache = _arg_cache(name)
@@ -211,7 +211,9 @@ def run_in_background(name, args, **kwargs):
         pickle.dump({'args': args, 'kwargs': kwargs}, file)
 
     # Call this script
-    retcode = subprocess.call(['/usr/bin/python', __file__, name])
+    cmd = ['/usr/bin/python', __file__, name]
+    log.debug('Calling {!r} ...'.format(cmd))
+    retcode = subprocess.call(cmd)
     if retcode:  # pragma: no cover
         log.error('Failed to call task in background')
     else:

diff --git a/workflow/workflow.py b/workflow/workflow.py
@@ -625,6 +625,7 @@ def __init__(self, default_settings=None, input_encoding='utf-8',
         self._info_loaded = False
         self._logger = None
         self._items = []
+        self._search_pattern_cache = {}
         if libraries:
             sys.path = libraries + sys.path
 
@@ -1074,114 +1075,41 @@ def filter(self, query, items, key=lambda x: x, ascending=False,
 
         """
 
-        results = {}
-        query = query.lower()
-        queryset = set(query)
+        # Remove preceding/trailing spaces
+        query = query.strip()
 
         # Use user override if there is one
         fold_diacritics = self.settings.get('__workflows_diacritic_folding',
                                             fold_diacritics)
 
-        if not isascii(query):
-            fold_diacritics = False
-
-        # Build pattern: include all characters
-        pattern = []
-        for c in query:
-            # pattern.append('[^{0}]*{0}'.format(re.escape(c)))
-            pattern.append('.*?{0}'.format(re.escape(c)))
-        pattern = ''.join(pattern)
-        search = re.compile(pattern, re.IGNORECASE).search
-        # print('filter: searching %d items' % len(items))
+        results = {}
 
         for i, item in enumerate(items):
-            rule = None
+            skip = False
             score = 0
-            value = key(item)
-
-            if fold_diacritics:
-                value = self.fold_to_ascii(value)
-
-            # pre-filter any items that do not contain all characters
-            # of ``query`` to save on running several more expensive tests
-            if not queryset <= set(value.lower()):
+            words = [s.strip() for s in query.split(' ')]
+            value = key(item).strip()
+            if value == '':
                 continue
+            for word in words:
+                if word == '':
+                    continue
+                s, r = self._filter_item(value, word, match_on,
+                                         fold_diacritics)
+
+                if not s:  # Skip items that don't match part of the query
+                    skip = True
+                score += s
 
-            # item starts with query
-            if (match_on & MATCH_STARTSWITH and
-                    value.lower().startswith(query)):
-                score = 100.0 - (len(value) / len(query))
-                rule = MATCH_STARTSWITH
-
-            if not score and match_on & MATCH_CAPITALS:
-                # query matches capitalised letters in item,
-                # e.g. of = OmniFocus
-                initials = ''.join([c for c in value if c in INITIALS])
-                if initials.lower().startswith(query):
-                    score = 100.0 - (len(initials) / len(query))
-                    rule = MATCH_CAPITALS
-
-            if not score:
-                if (match_on & MATCH_ATOM or
-                        match_on & MATCH_INITIALS_CONTAIN or
-                        match_on & MATCH_INITIALS_STARTSWITH):
-                    # split the item into "atoms", i.e. words separated by
-                    # spaces or other non-word characters
-                    atoms = [s.lower() for s in split_on_delimiters(value)]
-                    # print('atoms : %s  -->  %s' % (value, atoms))
-                    # initials of the atoms
-                    initials = ''.join([s[0] for s in atoms if s])
-
-                if match_on & MATCH_ATOM:
-                    # is `query` one of the atoms in item?
-                    # similar to substring, but scores more highly, as it's
-                    # a word within the item
-                    if query in atoms:
-                        score = 100.0 - (len(value) / len(query))
-                        rule = MATCH_ATOM
-
-            if not score:
-                # `query` matches start (or all) of the initials of the
-                # atoms, e.g. ``himym`` matches "How I Met Your Mother"
-                # *and* "how i met your mother" (the ``capitals`` rule only
-                # matches the former)
-                if (match_on & MATCH_INITIALS_STARTSWITH and
-                        initials.startswith(query)):
-                    score = 100.0 - (len(initials) / len(query))
-                    rule = MATCH_INITIALS_STARTSWITH
-
-                # `query` is a substring of initials, e.g. ``doh`` matches
-                # "The Dukes of Hazzard"
-                elif (match_on & MATCH_INITIALS_CONTAIN and
-                        query in initials):
-                    score = 95.0 - (len(initials) / len(query))
-                    rule = MATCH_INITIALS_CONTAIN
-
-            if not score:
-                # `query` is a substring of item
-                if match_on & MATCH_SUBSTRING and query in value.lower():
-                        score = 90.0 - (len(value) / len(query))
-                        rule = MATCH_SUBSTRING
-
-            if not score:
-                # finally, assign a score based on how close together the
-                # characters in `query` are in item.
-                if match_on & MATCH_ALLCHARS:
-                    match = search(value)
-                    if match:
-                        score = 100.0 / ((1 + match.start()) *
-                                         (match.end() - match.start() + 1))
-                        rule = MATCH_ALLCHARS
-
-            if min_score and score < min_score:
+            if skip:
                 continue
 
-            if score > 0:
+            if score:
                 # use "reversed" `score` (i.e. highest becomes lowest) and
                 # `value` as sort key. This means items with the same score
                 # will be sorted in alphabetical not reverse alphabetical order
-                results[(100.0 / score, value.lower(), i)] = (item, score,
-                                                              rule)
+                results[(100.0 / score, value.lower(), score)] = (item, score,
+                                                                  r)
 
         # sort on keys, then discard the keys
         keys = sorted(results.keys(), reverse=ascending)
@@ -1190,12 +1118,125 @@ def filter(self, query, items, key=lambda x: x, ascending=False,
         if max_results and len(results) > max_results:
             results = results[:max_results]
 
+        if min_score:
+            results = [r for r in results if r[1] > min_score]
+
         # return list of ``(item, score, rule)``
         if include_score:
             return results
         # just return list of items
         return [t[0] for t in results]
 
+    def _filter_item(self, value, query, match_on, fold_diacritics):
+        """Filter ``value`` against ``query`` using rules ``match_on``
+
+        :returns: ``(score, rule)``
+
+        """
+
+        query = query.lower()
+        queryset = set(query)
+
+        if not isascii(query):
+            fold_diacritics = False
+
+        rule = None
+        score = 0
+
+        if fold_diacritics:
+            value = self.fold_to_ascii(value)
+
+        # pre-filter any items that do not contain all characters
+        # of ``query`` to save on running several more expensive tests
+        if not queryset <= set(value.lower()):
+            return (0, None)
+
+        # item starts with query
+        if (match_on & MATCH_STARTSWITH and
+                value.lower().startswith(query)):
+            score = 100.0 - (len(value) / len(query))
+            rule = MATCH_STARTSWITH
+
+        if not score and match_on & MATCH_CAPITALS:
+            # query matches capitalised letters in item,
+            # e.g. of = OmniFocus
+            initials = ''.join([c for c in value if c in INITIALS])
+            if initials.lower().startswith(query):
+                score = 100.0 - (len(initials) / len(query))
+                rule = MATCH_CAPITALS
+
+        if not score:
+            if (match_on & MATCH_ATOM or
+                    match_on & MATCH_INITIALS_CONTAIN or
+                    match_on & MATCH_INITIALS_STARTSWITH):
+                # split the item into "atoms", i.e. words separated by
+                # spaces or other non-word characters
+                atoms = [s.lower() for s in split_on_delimiters(value)]
+                # print('atoms : %s  -->  %s' % (value, atoms))
+                # initials of the atoms
+                initials = ''.join([s[0] for s in atoms if s])
+
+            if match_on & MATCH_ATOM:
+                # is `query` one of the atoms in item?
+                # similar to substring, but scores more highly, as it's
+                # a word within the item
+                if query in atoms:
+                    score = 100.0 - (len(value) / len(query))
+                    rule = MATCH_ATOM
+
+        if not score:
+            # `query` matches start (or all) of the initials of the
+            # atoms, e.g. ``himym`` matches "How I Met Your Mother"
+            # *and* "how i met your mother" (the ``capitals`` rule only
+            # matches the former)
+            if (match_on & MATCH_INITIALS_STARTSWITH and
+                    initials.startswith(query)):
+                score = 100.0 - (len(initials) / len(query))
+                rule = MATCH_INITIALS_STARTSWITH
+
+            # `query` is a substring of initials, e.g. ``doh`` matches
+            # "The Dukes of Hazzard"
+            elif (match_on & MATCH_INITIALS_CONTAIN and
+                    query in initials):
+                score = 95.0 - (len(initials) / len(query))
+                rule = MATCH_INITIALS_CONTAIN
+
+        if not score:
+            # `query` is a substring of item
+            if match_on & MATCH_SUBSTRING and query in value.lower():
+                    score = 90.0 - (len(value) / len(query))
+                    rule = MATCH_SUBSTRING
+
+        if not score:
+            # finally, assign a score based on how close together the
+            # characters in `query` are in item.
+            if match_on & MATCH_ALLCHARS:
+                search = self._search_for_query(query)
+                match = search(value)
+                if match:
+                    score = 100.0 / ((1 + match.start()) *
+                                     (match.end() - match.start() + 1))
+                    rule = MATCH_ALLCHARS
+
+        if score > 0:
+            return (score, rule)
+        return (0, None)
+
+    def _search_for_query(self, query):
+        if query in self._search_pattern_cache:
+            return self._search_pattern_cache[query]
+
+        # Build pattern: include all characters
+        pattern = []
+        for c in query:
+            # pattern.append('[^{0}]*{0}'.format(re.escape(c)))
+            pattern.append('.*?{0}'.format(re.escape(c)))
+        pattern = ''.join(pattern)
+        search = re.compile(pattern, re.IGNORECASE).search
+
+        self._search_pattern_cache[query] = search
+        return search
+
     def run(self, func):
         """Call `func` to run your workflow