virantha · benjsec · Oct 12, 2017 · Oct 12, 2017 · Oct 12, 2017 · Oct 12, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,14 @@
 language: python
 python:
     - "2.7"
+    - "3.5"
+    - "3.6"
+before_install:
+    - sudo apt-get -qq update
+    - sudo apt-get install -y tesseract-ocr ghostscript imagemagick
 install: 
-    - "pip install -r requirements.txt --use-mirrors"
-    - "pip install pytest mock --use-mirrors"
+    - "pip install -r requirements.txt"
+    - "pip install pytest mock"
     - "pip install ."
 script: 
-    - "python setup.py test"
+    - "pytest test"
diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py
@@ -21,24 +21,37 @@
 import itertools
 from functools import wraps
 
-from version import __version__
+from .version import __version__
 from PIL import Image
 import yaml
 
 import multiprocessing
-# Replace the Popen routine to allow win32 pyinstaller to build
-from multiprocessing import forking
-from pypdfocr_multiprocessing import _Popen
+
+""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systems
+
+    https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
+"""
+try:
+    # Python 3.4+
+    if sys.platform.startswith('win'):
+        import multiprocessing.popen_spawn_win32 as forking
+    else:
+        import multiprocessing.popen_fork as forking
+except ImportError:
+    import multiprocessing.forking as forking
+
+from .pypdfocr_multiprocessing import _Popen
 forking.Popen = _Popen
 
-from pypdfocr_pdf import PyPdf
-from pypdfocr_tesseract import PyTesseract
-from pypdfocr_gs import PyGs
-from pypdfocr_watcher import PyPdfWatcher
-from pypdfocr_pdffiler import PyPdfFiler
-from pypdfocr_filer_dirs import PyFilerDirs
-from pypdfocr_filer_evernote import PyFilerEvernote
-from pypdfocr_preprocess import PyPreprocess
+from .pypdfocr_pdf import PyPdf
+from .pypdfocr_tesseract import PyTesseract
+from .pypdfocr_gs import PyGs
+from .pypdfocr_watcher import PyPdfWatcher
+from .pypdfocr_pdffiler import PyPdfFiler
+from .pypdfocr_filer_dirs import PyFilerDirs
+from .pypdfocr_filer_evernote import ENABLED as evernote_enabled
+from .pypdfocr_filer_evernote import PyFilerEvernote
+from .pypdfocr_preprocess import PyPreprocess
 
 def error(text):
     print("ERROR: %s" % text)
@@ -49,12 +62,14 @@ def retry(count=5, exc_type = Exception):
     def decorator(func):
         @wraps(func)
         def result(*args, **kwargs):
+            err = None
             for _ in range(count):
                 try:
                     return func(*args, **kwargs)
-                except exc_type:
-                    pass
-                raise
+                except exc_type as e:
+                    err = e
+            else:
+                raise err
         return result
     return decorator
 
@@ -161,11 +176,11 @@ def get_options(self, argv):
         filing_group = p.add_argument_group(title="Filing optinos")
         filing_group.add_argument('-f', '--file', action='store_true',
             default=False, dest='enable_filing', help='Enable filing of converted PDFs')
-        #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
+        # filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
         filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x),
              dest='configfile', help='Configuration file for defaults and PDF filing')
         filing_group.add_argument('-e', '--evernote', action='store_true',
-            default=False, dest='enable_evernote', help='Enable filing to Evernote')
+            default=False, dest='enable_evernote', help='Enable filing to Evernote.')
         filing_group.add_argument('-n', action='store_true',
             default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')
 
@@ -204,7 +219,11 @@ def get_options(self, argv):
             logging.debug("Read in configuration file")
             logging.debug(self.config)
 
-        if args.enable_evernote:
+        # Evernote filing does not work in py3
+        if args.enable_evernote and not evernote_enabled:
+            print("Warning: Evernote filing disabled, could not find evernote API. Evernote not available in py3.")
+            self.enable_evernote = False
+        elif args.enable_evernote:
             self.enable_evernote = True
         else:
             self.enable_evernote = False
@@ -367,11 +386,11 @@ def run_conversion(self, pdf_filename):
             time.sleep(1)
             if not self.debug:
                 # Need to clean up the original image files before preprocessing
-                if locals().has_key("fns"): # Have to check if this was set before exception raised
+                if "fns" in locals(): # Have to check if this was set before exception raised
                     logging.info("Cleaning up %s" % fns)
                     self._clean_up_files(fns)
 
-                if locals().has_key("preprocess_imagefilenames"):  # Have to check if this was set before exception raised
+                if "preprocess_imagefilenames" in locals():  # Have to check if this was set before exception raised
                     logging.info("Cleaning up %s" % preprocess_imagefilenames)
                     self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
                     for ext in [".hocr", ".html", ".txt"]:
@@ -467,7 +486,7 @@ def go(self, argv):
                 except KeyboardInterrupt:
                     break
                 except Exception as e:
-                    print traceback.print_exc(e)
+                    traceback.print_exc()  # prints the active traceback itself; its first arg is `limit`, not the exception
                     py_watcher.stop()
 
         else:

diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py
@@ -16,7 +16,7 @@
 import os
 import shutil
 
-from pypdfocr_filer import PyFiler
+from .pypdfocr_filer import PyFiler
 
 """
     Implementation of a filer class 

diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py
@@ -19,17 +19,21 @@
 import time
 import sys
 
-from pypdfocr_filer import PyFiler
+from .pypdfocr_filer import PyFiler
 
 import functools
 
-from evernote.api.client import EvernoteClient
-import evernote.edam.type.ttypes as Types
-import evernote.edam.userstore.constants as UserStoreConstants
-from evernote.edam.error.ttypes import EDAMUserException
-from evernote.edam.error.ttypes import EDAMSystemException
-from evernote.edam.error.ttypes import EDAMNotFoundException
-from evernote.edam.error.ttypes import EDAMErrorCode
+try:
+    from evernote.api.client import EvernoteClient
+    import evernote.edam.type.ttypes as Types
+    import evernote.edam.userstore.constants as UserStoreConstants
+    from evernote.edam.error.ttypes import EDAMUserException
+    from evernote.edam.error.ttypes import EDAMSystemException
+    from evernote.edam.error.ttypes import EDAMNotFoundException
+    from evernote.edam.error.ttypes import EDAMErrorCode
+    ENABLED = True
+except ImportError:
+    ENABLED = False
 
 
 """

diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py
@@ -92,21 +92,21 @@ def _find_windows_gs(self):
             listing = os.listdir('.')
 
             # Find all possible gs* sub-directories
-	    listing = [x for x in listing if x.startswith('gs')]
+            listing = [x for x in listing if x.startswith('gs')]
 
             # TODO: Make this a natural sort
             listing.sort(reverse=True)
-	    for bindir in listing:
-		binpath = os.path.join(bindir,'bin')
-		if not os.path.exists(binpath): continue
-		os.chdir(binpath)
+            for bindir in listing:
+                binpath = os.path.join(bindir,'bin')
+                if not os.path.exists(binpath): continue
+                os.chdir(binpath)
                 # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
-		gswin = glob.glob('gswin*c.exe')
-		if len(gswin) == 0:
-		    continue
-		gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
-		os.chdir(cwd)
-		return gs
+                gswin = glob.glob('gswin*c.exe')
+                if len(gswin) == 0:
+                    continue
+                gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
+                os.chdir(cwd)
+                return gs
 
         if not gs:
             error(self.msgs['GS_MISSING_BINARY'])
@@ -171,10 +171,10 @@ def _run_gs(self, options, output_filename, pdf_filename):
         try:
             cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
             logging.info(cmd)        
-            out = subprocess.check_output(cmd, shell=True)
+            out = subprocess.check_output(cmd, shell=True, universal_newlines=True)
 
         except subprocess.CalledProcessError as e:
-            print e.output
+            print(e.output)
             if "undefined in .getdeviceparams" in e.output:
                 error(self.msgs['GS_OUTDATED'])
             else:

diff --git a/pypdfocr/pypdfocr_multiprocessing.py b/pypdfocr/pypdfocr_multiprocessing.py
@@ -13,19 +13,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys, os, multiprocessing.forking
 import logging
+import os
+import sys
 
 """ Special work-around to support multiprocessing and pyinstaller --onefile on windows systms
 
     https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
 """
+try:
+    # Python 3.4+
+    if sys.platform.startswith('win'):
+        import multiprocessing.popen_spawn_win32 as forking
+    else:
+        import multiprocessing.popen_fork as forking
+except ImportError:
+    import multiprocessing.forking as forking
 
-import multiprocessing.forking as forking
-import os
-import sys
 
-class _Popen(multiprocessing.forking.Popen):
+class _Popen(forking.Popen):
     def __init__(self, *args, **kw):
         if hasattr(sys, 'frozen'):
             # We have to set original _MEIPASS2 value from sys._MEIPASS

diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py
@@ -31,7 +31,6 @@
 import tempfile
 import glob
 
-import cStringIO
 import base64
 import zlib
 import math
@@ -52,7 +51,7 @@
 from reportlab.lib.enums import TA_LEFT
 from reportlab.platypus.paragraph import Paragraph
 
-from pypdfocr_util import Retry
+from .pypdfocr_util import Retry
 from functools import partial
 
 class RotatedPara(Paragraph):
@@ -152,10 +151,11 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
         all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
         merger = PdfFileMerger()
         for text_pdf_filename in text_pdf_filenames:
-            merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
+            with open(text_pdf_filename, 'rb') as f:
+                merger.append(PdfFileReader(f))
         merger.write(all_text_filename)
         merger.close()
-	del merger
+        del merger
 
 
         writer = PdfFileWriter()

diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py
@@ -18,15 +18,14 @@
     on keywords
 """
 
-from sets import Set    
 import sys, os
 import re
 import logging
 import shutil
 
 from PyPDF2 import PdfFileReader
-from pypdfocr_filer import PyFiler
-from pypdfocr_filer_dirs import PyFilerDirs
+from .pypdfocr_filer import PyFiler
+from .pypdfocr_filer_dirs import PyFilerDirs
 
 class PyPdfFiler(object):
     def __init__(self, filer):
@@ -36,15 +35,15 @@ def __init__(self, filer):
 
         # Whether to fall back on filename for matching keywords against
         # if there is no match in the text
-        self.file_using_filename = False 
+        self.file_using_filename = False
 
     def iter_pdf_page_text(self, filename):
         self.filename = filename
         reader = PdfFileReader(filename)
         logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
         for pgnum in range(reader.getNumPages()):
             text = reader.getPage(pgnum).extractText()
-            text = text.encode('ascii', 'ignore')
+            # Keep text as str: text.encode('ascii', 'ignore') yields bytes on py3, which breaks the str replace() below
             text = text.replace('\n', ' ')
             yield text
 
@@ -56,10 +55,10 @@ def _get_matching_folder(self, pdfText):
                 if s in searchText:
                     logging.info("Matched keyword '%s'" % s)
                     return folder
-        # No match found, so return 
+        # No match found, so return
         return None
 
-    def file_original (self, original_filename):
+    def file_original(self, original_filename):
         return self.filer.file_original(original_filename)
 
     def move_to_matching_folder(self, filename):
@@ -72,9 +71,9 @@ def move_to_matching_folder(self, filename):
 
         tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
         return tgt_file
-        
+
 if __name__ == '__main__':
     p = PyPdfFiler(PyFilerDirs())
     for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
-        print (page_text)
+        print(page_text)
 
diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py
@@ -28,7 +28,7 @@
 import signal
 
 from multiprocessing import Pool
-from pypdfocr_interrupts import init_worker
+from .pypdfocr_interrupts import init_worker
 
 # Ugly hack to pass in object method to the multiprocessing library
 # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
@@ -58,7 +58,7 @@ def cmd(self, cmd_list):
             logging.debug(out)
             return out
         except subprocess.CalledProcessError as e:
-            print e.output
+            print(e.output)
             self._warn("Could not run command %s" % cmd_list)
 
 
@@ -102,14 +102,14 @@ def preprocess(self, in_filenames):
             logging.info("Starting preprocessing parallel execution")
             preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
             pool.close()
-        except KeyboardInterrupt or Exception:
+        except (KeyboardInterrupt, Exception):
             print("Caught keyboard interrupt... terminating")
             pool.terminate()
             #sys,exit(-1)
             raise
         finally:
             pool.join()
-            logging.info ("Completed preprocessing")
+            logging.info("Completed preprocessing")
 
         return preprocessed_filenames