From d8a6067fc20f78f27468ad2c2a1a4f7d6d22148a Mon Sep 17 00:00:00 2001 From: Ben S Date: Mon, 6 Nov 2017 08:44:10 +0000 Subject: [PATCH] Python 3.5 and 3.6 compatibility This update adds compatibility with py3, whilst retaining all functionality in py2. The evernote library is not yet py3 compatible, so it is not used in the py3 version. It is, however, still retained when using py2. --- .travis.yml | 11 ++- pypdfocr/pypdfocr.py | 61 +++++++++------ pypdfocr/pypdfocr_filer_dirs.py | 2 +- pypdfocr/pypdfocr_filer_evernote.py | 20 +++-- pypdfocr/pypdfocr_gs.py | 26 +++---- pypdfocr/pypdfocr_multiprocessing.py | 16 ++-- pypdfocr/pypdfocr_pdf.py | 8 +- pypdfocr/pypdfocr_pdffiler.py | 17 ++--- pypdfocr/pypdfocr_preprocess.py | 8 +- pypdfocr/pypdfocr_tesseract.py | 52 ++++--------- pypdfocr/pypdfocr_watcher.py | 37 ++++----- requirements.txt | 2 +- test/test_evernote.py | 30 ++++++-- test/test_gs.py | 7 +- test/test_option_parsing.py | 46 ++++++++++-- test/test_pdf_filer.py | 17 +++-- test/test_pypdfocr.py | 107 +++++++++++++++++---------- test/test_tesseract.py | 8 +- test/test_watcher.py | 25 +++---- 19 files changed, 288 insertions(+), 212 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5da555e..06dd8be 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,14 @@ language: python python: - "2.7" + - "3.5" + - "3.6" +before_install: + - sudo apt-get -qq update + - sudo apt-get install -y tesseract-ocr ghostscript imagemagick install: - - "pip install -r requirements.txt --use-mirrors" - - "pip install pytest mock --use-mirrors" + - "pip install -r requirements.txt" + - "pip install pytest mock" - "pip install ." 
script: - - "python setup.py test" + - "pytest test" diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 7ee7e9e..a95cb66 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -21,24 +21,37 @@ import itertools from functools import wraps -from version import __version__ +from .version import __version__ from PIL import Image import yaml import multiprocessing -# Replace the Popen routine to allow win32 pyinstaller to build -from multiprocessing import forking -from pypdfocr_multiprocessing import _Popen + +""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systms + + https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing +""" +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking + +from .pypdfocr_multiprocessing import _Popen forking.Popen = _Popen -from pypdfocr_pdf import PyPdf -from pypdfocr_tesseract import PyTesseract -from pypdfocr_gs import PyGs -from pypdfocr_watcher import PyPdfWatcher -from pypdfocr_pdffiler import PyPdfFiler -from pypdfocr_filer_dirs import PyFilerDirs -from pypdfocr_filer_evernote import PyFilerEvernote -from pypdfocr_preprocess import PyPreprocess +from .pypdfocr_pdf import PyPdf +from .pypdfocr_tesseract import PyTesseract +from .pypdfocr_gs import PyGs +from .pypdfocr_watcher import PyPdfWatcher +from .pypdfocr_pdffiler import PyPdfFiler +from .pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer_evernote import ENABLED as evernote_enabled +from .pypdfocr_filer_evernote import PyFilerEvernote +from .pypdfocr_preprocess import PyPreprocess def error(text): print("ERROR: %s" % text) @@ -49,12 +62,14 @@ def retry(count=5, exc_type = Exception): def decorator(func): @wraps(func) def result(*args, **kwargs): + err = None for _ in range(count): try: return func(*args, **kwargs) - 
except exc_type: - pass - raise + except exc_type as e: + err = e + else: + raise err return result return decorator @@ -161,11 +176,11 @@ def get_options(self, argv): filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') - #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), + # filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', - default=False, dest='enable_evernote', help='Enable filing to Evernote') + default=False, dest='enable_evernote', help='Enable filing to Evernote.') filing_group.add_argument('-n', action='store_true', default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') @@ -204,7 +219,11 @@ def get_options(self, argv): logging.debug("Read in configuration file") logging.debug(self.config) - if args.enable_evernote: + # Evernote filing does not work in py3 + if args.enable_evernote and not evernote_enabled: + print("Warning: Evernote filing disabled, could not find evernote API. 
Evernote not available in py3.") + self.enable_evernote = False + elif args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False @@ -367,11 +386,11 @@ def run_conversion(self, pdf_filename): time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing - if locals().has_key("fns"): # Have to check if this was set before exception raised + if "fns" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) - if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised + if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: @@ -467,7 +486,7 @@ def go(self, argv): except KeyboardInterrupt: break except Exception as e: - print traceback.print_exc(e) + print(traceback.print_exc(e)) py_watcher.stop() else: diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py index dc19330..c7dc73f 100644 --- a/pypdfocr/pypdfocr_filer_dirs.py +++ b/pypdfocr/pypdfocr_filer_dirs.py @@ -16,7 +16,7 @@ import os import shutil -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler """ Implementation of a filer class diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 80ec115..9064415 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -19,17 +19,21 @@ import time import sys -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler import functools -from evernote.api.client import EvernoteClient -import evernote.edam.type.ttypes as Types -import evernote.edam.userstore.constants as UserStoreConstants -from evernote.edam.error.ttypes import 
EDAMUserException -from evernote.edam.error.ttypes import EDAMSystemException -from evernote.edam.error.ttypes import EDAMNotFoundException -from evernote.edam.error.ttypes import EDAMErrorCode +try: + from evernote.api.client import EvernoteClient + import evernote.edam.type.ttypes as Types + import evernote.edam.userstore.constants as UserStoreConstants + from evernote.edam.error.ttypes import EDAMUserException + from evernote.edam.error.ttypes import EDAMSystemException + from evernote.edam.error.ttypes import EDAMNotFoundException + from evernote.edam.error.ttypes import EDAMErrorCode + ENABLED = True +except ImportError: + ENABLED = False """ diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..1477847 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -92,21 +92,21 @@ def _find_windows_gs(self): listing = os.listdir('.') # Find all possible gs* sub-directories - listing = [x for x in listing if x.startswith('gs')] + listing = [x for x in listing if x.startswith('gs')] # TODO: Make this a natural sort listing.sort(reverse=True) - for bindir in listing: - binpath = os.path.join(bindir,'bin') - if not os.path.exists(binpath): continue - os.chdir(binpath) + for bindir in listing: + binpath = os.path.join(bindir,'bin') + if not os.path.exists(binpath): continue + os.chdir(binpath) # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version) - gswin = glob.glob('gswin*c.exe') - if len(gswin) == 0: - continue - gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) - os.chdir(cwd) - return gs + gswin = glob.glob('gswin*c.exe') + if len(gswin) == 0: + continue + gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) 
+ os.chdir(cwd) + return gs if not gs: error(self.msgs['GS_MISSING_BINARY']) @@ -171,10 +171,10 @@ def _run_gs(self, options, output_filename, pdf_filename): try: cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename) logging.info(cmd) - out = subprocess.check_output(cmd, shell=True) + out = subprocess.check_output(cmd, shell=True, universal_newlines=True) except subprocess.CalledProcessError as e: - print e.output + print(e.output) if "undefined in .getdeviceparams" in e.output: error(self.msgs['GS_OUTDATED']) else: diff --git a/pypdfocr/pypdfocr_multiprocessing.py b/pypdfocr/pypdfocr_multiprocessing.py index 3666268..253bd55 100644 --- a/pypdfocr/pypdfocr_multiprocessing.py +++ b/pypdfocr/pypdfocr_multiprocessing.py @@ -13,19 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys, os, multiprocessing.forking import logging +import os +import sys """ Special work-around to support multiprocessing and pyinstaller --onefile on windows systms https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing """ +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking -import multiprocessing.forking as forking -import os -import sys -class _Popen(multiprocessing.forking.Popen): +class _Popen(forking.Popen): def __init__(self, *args, **kw): if hasattr(sys, 'frozen'): # We have to set original _MEIPASS2 value from sys._MEIPASS diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index bdc1f86..8438b38 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -31,7 +31,6 @@ import tempfile import glob -import cStringIO import base64 import zlib import math @@ -52,7 +51,7 @@ from reportlab.lib.enums import TA_LEFT from 
reportlab.platypus.paragraph import Paragraph -from pypdfocr_util import Retry +from .pypdfocr_util import Retry from functools import partial class RotatedPara(Paragraph): @@ -152,10 +151,11 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename)) merger = PdfFileMerger() for text_pdf_filename in text_pdf_filenames: - merger.append(PdfFileReader(file(text_pdf_filename, 'rb'))) + with open(text_pdf_filename, 'rb') as f: + merger.append(PdfFileReader(f)) merger.write(all_text_filename) merger.close() - del merger + del merger writer = PdfFileWriter() diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 1bb23f5..1ab4247 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -18,15 +18,14 @@ on keywords """ -from sets import Set import sys, os import re import logging import shutil from PyPDF2 import PdfFileReader -from pypdfocr_filer import PyFiler -from pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer import PyFiler +from .pypdfocr_filer_dirs import PyFilerDirs class PyPdfFiler(object): def __init__(self, filer): @@ -36,7 +35,7 @@ def __init__(self, filer): # Whether to fall back on filename for matching keywords against # if there is no match in the text - self.file_using_filename = False + self.file_using_filename = False def iter_pdf_page_text(self, filename): self.filename = filename @@ -44,7 +43,7 @@ def iter_pdf_page_text(self, filename): logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text @@ -56,10 +55,10 @@ def _get_matching_folder(self, pdfText): if s in searchText: logging.info("Matched keyword '%s'" % s) return folder - # No match found, so return + # No match found, 
so return return None - def file_original (self, original_filename): + def file_original(self, original_filename): return self.filer.file_original(original_filename) def move_to_matching_folder(self, filename): @@ -72,9 +71,9 @@ def move_to_matching_folder(self, filename): tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) return tgt_file - + if __name__ == '__main__': p = PyPdfFiler(PyFilerDirs()) for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): - print (page_text) + print(page_text) diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index e942cc3..34ed89a 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -28,7 +28,7 @@ import signal from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker # Ugly hack to pass in object method to the multiprocessing library # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90 @@ -58,7 +58,7 @@ def cmd(self, cmd_list): logging.debug(out) return out except subprocess.CalledProcessError as e: - print e.output + print(e.output) self._warn("Could not run command %s" % cmd_list) @@ -102,14 +102,14 @@ def preprocess(self, in_filenames): logging.info("Starting preprocessing parallel execution") preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns)) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... 
terminating") pool.terminate() #sys,exit(-1) raise finally: pool.join() - logging.info ("Completed preprocessing") + logging.info("Completed preprocessing") return preprocessed_filenames diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 8f246ee..1cfb9f2 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -23,10 +23,11 @@ import logging import subprocess import glob +from pkg_resources import parse_version from subprocess import CalledProcessError from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker def error(text): print("ERROR: %s" % text) @@ -79,50 +80,27 @@ def _is_version_uptodate(self): Make sure the version is current """ logging.info("Checking tesseract version") - cmd = '%s -v' % (self.binary) + cmd = "%s -v" % self.binary logging.info(cmd) try: - ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) + ret_output = subprocess.check_output( + cmd, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) except CalledProcessError: # Could not run tesseract error(self.msgs['TS_MISSING']) ver_str = '0.0.0' for line in ret_output.splitlines(): + print(line) if 'tesseract' in line: ver_str = line.split(' ')[1] - if ver_str.endswith('dev'): # Fix for version strings that end in 'dev' - ver_str = ver_str[:-3] - - # Iterate through the version dots - ver = [int(x) for x in ver_str.split('.')] - req = [int(x) for x in self.required.split('.')] - # Aargh, in windows 3.02.02 is reported as version 3.02 - # SFKM if str(os.name) == 'nt': - req = req[:2] - - version_good = False - for i,num in enumerate(req): - if len(ver) < i+1: - # This minor version number is not present in tesseract, so it must be - # lower than required. 
(3.02 < 3.02.01) - break - if ver[i]==num and len(ver) == i+1 and len(ver)==len(req): - # 3.02.02 == 3.02.02 - version_good = True - continue - if ver[i]>num: - # 4.0 > 3.02.02 - # 3.03.02 > 3.02.02 - version_good = True - break - if ver[i]= parse_version(req)), ver_str def _warn(self, msg): # pragma: no cover print("WARNING: %s" % msg) @@ -139,16 +117,16 @@ def make_hocr_from_pnms(self, fns): pool = Pool(processes=self.threads, initializer=init_worker) try: - hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) + hocr_filenames = pool.map(unwrap_self, list(zip([self]*len(fns), fns))) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... terminating") pool.terminate() raise finally: pool.join() - return zip(fns,hocr_filenames) + return list(zip(fns,hocr_filenames)) def make_hocr_from_pnm(self, img_filename): @@ -166,7 +144,7 @@ def make_hocr_from_pnm(self, img_filename): ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: # Could not run tesseract - print e.output + print(e.output) self._warn (self.msgs['TS_FAILED']) if os.path.isfile(hocr_filename): diff --git a/pypdfocr/pypdfocr_watcher.py b/pypdfocr/pypdfocr_watcher.py index f7ef556..ec94b04 100755 --- a/pypdfocr/pypdfocr_watcher.py +++ b/pypdfocr/pypdfocr_watcher.py @@ -25,17 +25,19 @@ class PyPdfWatcher(FileSystemEventHandler): Every few seconds pop-off queue and if timestamp older than 3 seconds, process the file else, push it back onto queue. 
""" - events = {} - events_lock = Lock() def __init__(self, monitor_dir, config): FileSystemEventHandler.__init__(self) + + self.events = {} + self.events_lock = Lock() self.monitor_dir = monitor_dir if not config: config = {} self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file + def start(self): self.observer = Observer() self.observer.schedule(self, self.monitor_dir) @@ -94,19 +96,19 @@ def check_for_new_pdf(self,ev_path): """ if ev_path.endswith(".pdf"): if not ev_path.endswith(("_ocr.pdf", "_test.pdf")): - PyPdfWatcher.events_lock.acquire() - if not ev_path in PyPdfWatcher.events: - PyPdfWatcher.events[ev_path] = time.time() + self.events_lock.acquire() + if not ev_path in self.events: + self.events[ev_path] = time.time() logging.info ("Adding %s to event queue" % ev_path) else: - if PyPdfWatcher.events[ev_path] == -1: + if self.events[ev_path] == -1: logging.info ( "%s removing from event queue" % (ev_path)) - del PyPdfWatcher.events[ev_path] + del self.events[ev_path] else: newTime = time.time() logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) - PyPdfWatcher.events[ev_path] = newTime - PyPdfWatcher.events_lock.release() + self.events[ev_path] = newTime + self.events_lock.release() @@ -133,19 +135,18 @@ def check_queue(self): :returns: Filename if available to process, otherwise None. 
""" now = time.time() - PyPdfWatcher.events_lock.acquire() - for monitored_file, timestamp in PyPdfWatcher.events.items(): - if timestamp == -1: - del PyPdfWatcher.events[monitored_file] - elif now - timestamp > self.scan_interval: + self.events_lock.acquire() + self.events = {file:ts for file, ts in self.events.items() if ts != -1} + for monitored_file, timestamp in self.events.items(): + if now - timestamp > self.scan_interval: logging.info("Processing new file %s" % (monitored_file)) # Remove this file from the dict - del PyPdfWatcher.events[monitored_file] + del self.events[monitored_file] monitored_file = self.rename_file_with_spaces(monitored_file) - PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler - PyPdfWatcher.events_lock.release() + self.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler + self.events_lock.release() return monitored_file - PyPdfWatcher.events_lock.release() + self.events_lock.release() return None diff --git a/requirements.txt b/requirements.txt index ae91a04..c086189 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ pillow>=2.2 reportlab>=2.7 watchdog>=0.6.0 pypdf2>=1.23 -evernote +evernote; python_version < '3' diff --git a/test/test_evernote.py b/test/test_evernote.py index d337ec4..9afbf02 100644 --- a/test/test_evernote.py +++ b/test/test_evernote.py @@ -2,13 +2,24 @@ import pypdfocr.pypdfocr_filer_evernote as P import pytest import os +import sys -import evernote.api.client -import evernote.edam.type.ttypes as Types +if sys.version_info.major == 2: + import evernote.api.client + import evernote.edam.type.ttypes as Types import hashlib from mock import patch, call + +def test_import(): + """Evernote filing enabled for py2 only""" + expect_enabled = sys.version_info.major == 2 + assert P.ENABLED == expect_enabled + + +@pytest.mark.skipif(sys.version_info.major>=3, + reason="Evernote 
API not compatible with py3.") class TestEvernote: def test_connecct(self): @@ -22,23 +33,25 @@ def test_connecct(self): def test_file_original(self, mock_move): with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs","test_recipe.pdf") + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs","test_recipe.pdf") # First, test code that does not move original p.file_original(filename) assert (not mock_move.called) # Now test moving - p.set_original_move_folder(os.path.join("temp", "original")) + p.set_original_move_folder(os.path.join(filepath, "temp", "original")) p.file_original(filename) - mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf")) + mock_move.assert_called_with(filename, os.path.join(filepath, "temp","original", "test_recipe_2.pdf")) @patch('os.remove') def test_move_to_folder(self, mock_remove): with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs", "test_recipe.pdf") - foldername = 'recipe' + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs", "test_recipe.pdf") + foldername = os.path.join(filepath, 'recipe') with pytest.raises(AssertionError): p.move_to_matching_folder(filename, foldername) p.set_target_folder('target') @@ -61,7 +74,8 @@ def test_create_note(self): p = P.PyFilerEvernote("TOKEN") notebook = Types.Notebook() notebook.name = "recipe" - filename = "pdfs/test_recipe.pdf" + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs/test_recipe.pdf") note = p._create_evernote_note(notebook, filename) xml = '' assert(note.content.startswith(xml)) diff --git a/test/test_gs.py b/test/test_gs.py index c092b4f..9bebb57 100644 --- a/test/test_gs.py +++ b/test/test_gs.py @@ -1,12 +1,9 @@ #from pypdfocr import PyPDFOCR as P -import 
pypdfocr.pypdfocr_gs as P +from pypdfocr import pypdfocr_gs as P import pytest import os -import hashlib - -from mock import patch, call -from pytest import skip +from mock import patch class TestGS: diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index b8ae055..c97e39d 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -1,5 +1,7 @@ -#from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P +import os +import sys + +from pypdfocr import pypdfocr as P import pytest @@ -37,11 +39,15 @@ def test_standalone_filing(self): self.p.get_options(opts) # Assert that it checks that the config file is present - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.enable_filing) assert(self.p.config) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_standalone_filing_evernote(self): # Check when evernote is enabled opts = ["blah.pdf"] @@ -50,7 +56,9 @@ def test_standalone_filing_evernote(self): with pytest.raises(SystemExit): self.p.get_options(opts) - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) # Enabling -e should turn on filing too assert(self.p.enable_filing) @@ -65,6 +73,21 @@ def test_standalone_filing_evernote(self): assert(self.p.config) assert(not self.p.watch) + @pytest.mark.skipif(sys.version_info.major==2, + reason="Evernote works on py2") + def test_evernote_disabled(self): + opts = ["blah.pdf"] + opts.append('-e') + # Assert that it checks that the config file is present + with pytest.raises(SystemExit): + self.p.get_options(opts) + + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + 
opts.append('--config={}'.format(conf_path)) + self.p.get_options(opts) + assert not self.p.enable_evernote + def test_standalone_watch_conflict(self): # When pdf file is specified, we don't want to allow watch option opts = ["blah.pdf", '-w'] @@ -80,23 +103,30 @@ def test_watch_filing(self): opts = ['-w temp'] self.p.get_options(opts) assert(self.p.watch_dir) - - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) assert(not self.p.enable_filing) assert(not self.p.enable_evernote) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_watch_filing_evernote(self): - opts = ['-w temp', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts = ['-w temp', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) assert(self.p.enable_filing) assert(self.p.enable_evernote) - opts = ['-w temp', '-f', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts = ['-w temp', '-f', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) diff --git a/test/test_pdf_filer.py b/test/test_pdf_filer.py index 9db7382..bdea966 100644 --- a/test/test_pdf_filer.py +++ b/test/test_pdf_filer.py @@ -1,12 +1,8 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P -import pytest +from pypdfocr import pypdfocr as P import os -import hashlib - from mock import patch, call -from pytest import skip class TestPDFFiler: @@ -19,7 +15,9 @@ def test_file_by_filename(self, mock_move): # Mock the move function so we don't actually end up filing p = P.PyPDFOCR() cwd = os.getcwd() - filename = 
os.path.join("pdfs", "test_super_long_keyword.pdf") + filename = os.path.join(os.path.dirname(__file__), + "pdfs", + "test_super_long_keyword.pdf") out_filename = filename.replace(".pdf", "_ocr.pdf") if os.path.exists(out_filename): @@ -27,7 +25,10 @@ def test_file_by_filename(self, mock_move): print("Current directory: %s" % os.getcwd()) #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"] - opts = [filename, "--config=test_pypdfocr_config_filename.yaml", "-f", "-n"] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_pypdfocr_config.yaml') + + opts = [filename, "--config={}".format(conf_path), "-f", "-n"] p.go(opts) assert(os.path.exists(out_filename)) @@ -38,4 +39,4 @@ def test_file_by_filename(self, mock_move): - + diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index f3c8db7..21d1c94 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -1,15 +1,11 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P +from pypdfocr import pypdfocr as P import pytest import os import logging from PyPDF2 import PdfFileReader -import smtplib -from mock import Mock from mock import patch, call -from mock import MagicMock -from mock import PropertyMock class TestPydfocr: @@ -23,34 +19,62 @@ def _iter_pdf(self, filename): logging.debug("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text - + + filepath = os.path.dirname(__file__) pdf_tests = [ - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"], - ]), - (".", os.path.join("temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ - ["asynchronous", "subject to", "20 Claims"], # Page 1 - ["FOREIGN PATENT" ], # Page 2 - ]), - (".", os.path.join("temp","target", 
"default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - ("pdfs", os.path.join("temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"], - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'], - ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join(filepath, "pdfs", "test_recipe.pdf"), + [ ["Simply Recipes"],]), + ( + filepath, + os.path.join(filepath, "temp","target","patents"), + os.path.join("pdfs","test_patent.pdf"), + [ + ["asynchronous", "subject to", "20 Claims"], # Page 1 + ["FOREIGN PATENT" ], # Page 2 + ]), + ( + filepath, + os.path.join(filepath, "temp","target", "default"), + os.path.join("pdfs","test_sherlock.pdf"), + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + os.path.join(filepath, "pdfs"), + os.path.join(filepath, "temp","target","default"), + "test_sherlock.pdf", + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join("..","test", "pdfs", "1.pdf"), + [ + ["Simply","Recipes"], + ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), + [ + ["Simply","Recipes", 'spinach'], + ]), ] - #@pytest.mark.skipif(True, reason="Just testing") + # @pytest.mark.skipif(True, reason="Just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests) def test_standalone(self, dirname, tgt_folder, filename, expected): """ - Test the single file conversion with no filing. + Test the single file conversion with no filing. 
Tests relative paths (".."), files in subirs, and files in current dir Checks for that _ocr file is created and keywords found in pdf. Modify :attribute:`pdf_tests` for changing keywords, etc @@ -61,9 +85,9 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): # First redo the unix-style paths, in case we're running on windows # Assume paths in unix-style - dirname = os.path.join(*(dirname.split("/"))) - tgt_folder = os.path.join(*(tgt_folder.split("/"))) - filename = os.path.join(*(filename.split("/"))) + # dirname = os.path.join(*(dirname.split("/"))) + # tgt_folder = os.path.join(*(tgt_folder.split("/"))) + # filename = os.path.join(*(filename.split("/"))) cwd = os.getcwd() @@ -77,12 +101,12 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - #@pytest.mark.skipif(True, reason="just testing") + # @pytest.mark.skipif(True, reason="just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", [pdf_tests[0]]) def test_standalone_email(self, dirname, tgt_folder, filename, expected): """ @@ -104,11 +128,11 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls instance = mock_smtp.return_value assert(instance.starttls.called) @@ -116,7 +140,10 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): assert(instance.sendmail.called) @patch('shutil.move') - @pytest.mark.parametrize("config", [("test_pypdfocr_config.yaml"), ("test_pypdfocr_config_no_move_original.yaml")]) + 
@pytest.mark.parametrize( + "config", + [(os.path.join(filepath, "test_pypdfocr_config.yaml")), + (os.path.join(filepath, "test_pypdfocr_config_no_move_original.yaml"))]) @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests[0:3]) def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filename, expected): """ @@ -146,18 +173,18 @@ def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filenam if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls calls = [call(out_filename, - os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] + os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] if not "no_move_original" in config: new_file_name = os.path.basename(filename).replace(".pdf", "_2.pdf") calls.append(call(filename, - os.path.abspath(os.path.join("temp","original", new_file_name)))) + os.path.abspath(os.path.join("test", "temp","original", new_file_name)))) mock_move.assert_has_calls(calls) def test_set_binaries(self): diff --git a/test/test_tesseract.py b/test/test_tesseract.py index c137248..5f10131 100644 --- a/test/test_tesseract.py +++ b/test/test_tesseract.py @@ -1,11 +1,9 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_tesseract as P +from pypdfocr import pypdfocr_tesseract as P import pytest import os -import hashlib - -from mock import patch, call +from mock import patch class TestTesseract: @@ -72,7 +70,7 @@ def test_tesseract_presence(self, capsys): def test_tesseract_version(self, capsys): p = P.PyTesseract({}) - p.required = "100" + p.required = "100.01" with pytest.raises(SystemExit): p.make_hocr_from_pnms("") out, err = capsys.readouterr() diff --git a/test/test_watcher.py b/test/test_watcher.py index 8470760..a7ae6ce 100644 --- 
a/test/test_watcher.py +++ b/test/test_watcher.py @@ -1,15 +1,11 @@ -#from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_watcher as P +from pypdfocr import pypdfocr_watcher as P import pytest -import evernote.api.client -import evernote.edam.type.ttypes as Types -import hashlib import time import os from collections import namedtuple -from mock import patch, call +from mock import patch class TestWatching: @@ -23,20 +19,20 @@ class TestWatching: @patch('shutil.move') @pytest.mark.parametrize(("filename, expected"), filenames) - def test_rename(self, mock_move, filename, expected): + def test_rename(self, mock_move, filename, expected, tmpdir): if expected == None: expected = filename - p = P.PyPdfWatcher('temp',{}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")),{}) # First, test code that does not move original ret = p.rename_file_with_spaces(filename) assert (ret==expected) - def test_check_for_new_pdf(self): + def test_check_for_new_pdf(self, tmpdir): - p = P.PyPdfWatcher('temp', {}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) p.check_for_new_pdf("blah_ocr.pdf") assert("blah_ocr.pdf" not in p.events) p.check_for_new_pdf("blah.pdf") @@ -49,8 +45,8 @@ def test_check_for_new_pdf(self): p.check_for_new_pdf("blah.pdf") assert(p.events['blah.pdf']-time.time() <=1) # Check that time stamp was updated - def test_events(self): - p = P.PyPdfWatcher('temp', {}) + def test_events(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) event = namedtuple('event', 'src_path, dest_path') @@ -63,8 +59,9 @@ def test_events(self): p.on_modified(event(src_path='temp_recipe3.pdf', dest_path=None)) assert('temp_recipe3.pdf' in p.events) - def test_check_queue(self): - p = P.PyPdfWatcher('temp', {}) + def test_check_queue(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) + assert p.events == {} now = time.time() p.events['blah.pdf'] = now f = p.check_queue()