diff --git a/.travis.yml b/.travis.yml index 5da555e..06dd8be 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,14 @@ language: python python: - "2.7" + - "3.5" + - "3.6" +before_install: + - sudo apt-get -qq update + - sudo apt-get install -y tesseract-ocr ghostscript imagemagick install: - - "pip install -r requirements.txt --use-mirrors" - - "pip install pytest mock --use-mirrors" + - "pip install -r requirements.txt" + - "pip install pytest mock" - "pip install ." script: - - "python setup.py test" + - "pytest test" diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 7ee7e9e..a95cb66 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -21,24 +21,37 @@ import itertools from functools import wraps -from version import __version__ +from .version import __version__ from PIL import Image import yaml import multiprocessing -# Replace the Popen routine to allow win32 pyinstaller to build -from multiprocessing import forking -from pypdfocr_multiprocessing import _Popen + +""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systms + + https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing +""" +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking + +from .pypdfocr_multiprocessing import _Popen forking.Popen = _Popen -from pypdfocr_pdf import PyPdf -from pypdfocr_tesseract import PyTesseract -from pypdfocr_gs import PyGs -from pypdfocr_watcher import PyPdfWatcher -from pypdfocr_pdffiler import PyPdfFiler -from pypdfocr_filer_dirs import PyFilerDirs -from pypdfocr_filer_evernote import PyFilerEvernote -from pypdfocr_preprocess import PyPreprocess +from .pypdfocr_pdf import PyPdf +from .pypdfocr_tesseract import PyTesseract +from .pypdfocr_gs import PyGs +from .pypdfocr_watcher import PyPdfWatcher +from .pypdfocr_pdffiler import PyPdfFiler +from .pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer_evernote import ENABLED as evernote_enabled +from .pypdfocr_filer_evernote import PyFilerEvernote +from .pypdfocr_preprocess import PyPreprocess def error(text): print("ERROR: %s" % text) @@ -49,12 +62,14 @@ def retry(count=5, exc_type = Exception): def decorator(func): @wraps(func) def result(*args, **kwargs): + err = None for _ in range(count): try: return func(*args, **kwargs) - except exc_type: - pass - raise + except exc_type as e: + err = e + else: + raise err return result return decorator @@ -161,11 +176,11 @@ def get_options(self, argv): filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') - #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), + # filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', - default=False, dest='enable_evernote', help='Enable filing to Evernote') + default=False, dest='enable_evernote', help='Enable filing to Evernote.') filing_group.add_argument('-n', action='store_true', default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') @@ -204,7 +219,11 @@ def get_options(self, argv): logging.debug("Read in configuration file") logging.debug(self.config) - if args.enable_evernote: + # Evernote filing does not work in py3 + if args.enable_evernote and not evernote_enabled: + print("Warning: Evernote filing disabled, could not find evernote API. Evernote not available in py3.") + self.enable_evernote = False + elif args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False @@ -367,11 +386,11 @@ def run_conversion(self, pdf_filename): time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing - if locals().has_key("fns"): # Have to check if this was set before exception raised + if "fns" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) - if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised + if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: @@ -467,7 +486,7 @@ def go(self, argv): except KeyboardInterrupt: break except Exception as e: - print traceback.print_exc(e) + print(traceback.print_exc(e)) py_watcher.stop() else: diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py index dc19330..c7dc73f 100644 --- a/pypdfocr/pypdfocr_filer_dirs.py +++ b/pypdfocr/pypdfocr_filer_dirs.py @@ -16,7 +16,7 @@ import os import shutil -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler """ Implementation of a filer class diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 80ec115..9064415 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -19,17 +19,21 @@ import time import sys -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler import functools -from evernote.api.client import EvernoteClient -import evernote.edam.type.ttypes as Types -import evernote.edam.userstore.constants as UserStoreConstants -from evernote.edam.error.ttypes import EDAMUserException -from evernote.edam.error.ttypes import EDAMSystemException -from evernote.edam.error.ttypes import EDAMNotFoundException -from evernote.edam.error.ttypes import EDAMErrorCode +try: + from evernote.api.client import EvernoteClient + import evernote.edam.type.ttypes as Types + import evernote.edam.userstore.constants as UserStoreConstants + from evernote.edam.error.ttypes import EDAMUserException + from evernote.edam.error.ttypes import EDAMSystemException + from evernote.edam.error.ttypes import EDAMNotFoundException + from evernote.edam.error.ttypes import EDAMErrorCode + ENABLED = True +except ImportError: + ENABLED = False """ diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..1477847 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -92,21 +92,21 @@ def _find_windows_gs(self): listing = os.listdir('.') # Find all possible gs* sub-directories - listing = [x for x in listing if x.startswith('gs')] + listing = [x for x in listing if x.startswith('gs')] # TODO: Make this a natural sort listing.sort(reverse=True) - for bindir in listing: - binpath = os.path.join(bindir,'bin') - if not os.path.exists(binpath): continue - os.chdir(binpath) + for bindir in listing: + binpath = os.path.join(bindir,'bin') + if not os.path.exists(binpath): continue + os.chdir(binpath) # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version) - gswin = glob.glob('gswin*c.exe') - if len(gswin) == 0: - continue - gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) - os.chdir(cwd) - return gs + gswin = glob.glob('gswin*c.exe') + if len(gswin) == 0: + continue + gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) + os.chdir(cwd) + return gs if not gs: error(self.msgs['GS_MISSING_BINARY']) @@ -171,10 +171,10 @@ def _run_gs(self, options, output_filename, pdf_filename): try: cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename) logging.info(cmd) - out = subprocess.check_output(cmd, shell=True) + out = subprocess.check_output(cmd, shell=True, universal_newlines=True) except subprocess.CalledProcessError as e: - print e.output + print(e.output) if "undefined in .getdeviceparams" in e.output: error(self.msgs['GS_OUTDATED']) else: diff --git a/pypdfocr/pypdfocr_multiprocessing.py b/pypdfocr/pypdfocr_multiprocessing.py index 3666268..253bd55 100644 --- a/pypdfocr/pypdfocr_multiprocessing.py +++ b/pypdfocr/pypdfocr_multiprocessing.py @@ -13,19 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys, os, multiprocessing.forking import logging +import os +import sys """ Special work-around to support multiprocessing and pyinstaller --onefile on windows systms https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing """ +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking -import multiprocessing.forking as forking -import os -import sys -class _Popen(multiprocessing.forking.Popen): +class _Popen(forking.Popen): def __init__(self, *args, **kw): if hasattr(sys, 'frozen'): # We have to set original _MEIPASS2 value from sys._MEIPASS diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index bdc1f86..8438b38 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -31,7 +31,6 @@ import tempfile import glob -import cStringIO import base64 import zlib import math @@ -52,7 +51,7 @@ from reportlab.lib.enums import TA_LEFT from reportlab.platypus.paragraph import Paragraph -from pypdfocr_util import Retry +from .pypdfocr_util import Retry from functools import partial class RotatedPara(Paragraph): @@ -152,10 +151,11 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename)) merger = PdfFileMerger() for text_pdf_filename in text_pdf_filenames: - merger.append(PdfFileReader(file(text_pdf_filename, 'rb'))) + with open(text_pdf_filename, 'rb') as f: + merger.append(PdfFileReader(f)) merger.write(all_text_filename) merger.close() - del merger + del merger writer = PdfFileWriter() diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 1bb23f5..1ab4247 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -18,15 +18,14 @@ on keywords """ -from sets import Set import sys, os import re import logging import shutil from PyPDF2 import PdfFileReader -from pypdfocr_filer import PyFiler -from pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer import PyFiler +from .pypdfocr_filer_dirs import PyFilerDirs class PyPdfFiler(object): def __init__(self, filer): @@ -36,7 +35,7 @@ def __init__(self, filer): # Whether to fall back on filename for matching keywords against # if there is no match in the text - self.file_using_filename = False + self.file_using_filename = False def iter_pdf_page_text(self, filename): self.filename = filename @@ -44,7 +43,7 @@ def iter_pdf_page_text(self, filename): logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text @@ -56,10 +55,10 @@ def _get_matching_folder(self, pdfText): if s in searchText: logging.info("Matched keyword '%s'" % s) return folder - # No match found, so return + # No match found, so return return None - def file_original (self, original_filename): + def file_original(self, original_filename): return self.filer.file_original(original_filename) def move_to_matching_folder(self, filename): @@ -72,9 +71,9 @@ def move_to_matching_folder(self, filename): tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) return tgt_file - + if __name__ == '__main__': p = PyPdfFiler(PyFilerDirs()) for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): - print (page_text) + print(page_text) diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index e942cc3..34ed89a 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -28,7 +28,7 @@ import signal from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker # Ugly hack to pass in object method to the multiprocessing library # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90 @@ -58,7 +58,7 @@ def cmd(self, cmd_list): logging.debug(out) return out except subprocess.CalledProcessError as e: - print e.output + print(e.output) self._warn("Could not run command %s" % cmd_list) @@ -102,14 +102,14 @@ def preprocess(self, in_filenames): logging.info("Starting preprocessing parallel execution") preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns)) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... terminating") pool.terminate() #sys,exit(-1) raise finally: pool.join() - logging.info ("Completed preprocessing") + logging.info("Completed preprocessing") return preprocessed_filenames diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 8f246ee..1cfb9f2 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -23,10 +23,11 @@ import logging import subprocess import glob +from pkg_resources import parse_version from subprocess import CalledProcessError from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker def error(text): print("ERROR: %s" % text) @@ -79,50 +80,27 @@ def _is_version_uptodate(self): Make sure the version is current """ logging.info("Checking tesseract version") - cmd = '%s -v' % (self.binary) + cmd = "%s -v" % self.binary logging.info(cmd) try: - ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) + ret_output = subprocess.check_output( + cmd, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) except CalledProcessError: # Could not run tesseract error(self.msgs['TS_MISSING']) ver_str = '0.0.0' for line in ret_output.splitlines(): + print(line) if 'tesseract' in line: ver_str = line.split(' ')[1] - if ver_str.endswith('dev'): # Fix for version strings that end in 'dev' - ver_str = ver_str[:-3] - - # Iterate through the version dots - ver = [int(x) for x in ver_str.split('.')] - req = [int(x) for x in self.required.split('.')] - # Aargh, in windows 3.02.02 is reported as version 3.02 - # SFKM if str(os.name) == 'nt': - req = req[:2] - - version_good = False - for i,num in enumerate(req): - if len(ver) < i+1: - # This minor version number is not present in tesseract, so it must be - # lower than required. (3.02 < 3.02.01) - break - if ver[i]==num and len(ver) == i+1 and len(ver)==len(req): - # 3.02.02 == 3.02.02 - version_good = True - continue - if ver[i]>num: - # 4.0 > 3.02.02 - # 3.03.02 > 3.02.02 - version_good = True - break - if ver[i]= parse_version(req)), ver_str def _warn(self, msg): # pragma: no cover print("WARNING: %s" % msg) @@ -139,16 +117,16 @@ def make_hocr_from_pnms(self, fns): pool = Pool(processes=self.threads, initializer=init_worker) try: - hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) + hocr_filenames = pool.map(unwrap_self, list(zip([self]*len(fns), fns))) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... terminating") pool.terminate() raise finally: pool.join() - return zip(fns,hocr_filenames) + return list(zip(fns,hocr_filenames)) def make_hocr_from_pnm(self, img_filename): @@ -166,7 +144,7 @@ def make_hocr_from_pnm(self, img_filename): ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: # Could not run tesseract - print e.output + print(e.output) self._warn (self.msgs['TS_FAILED']) if os.path.isfile(hocr_filename): diff --git a/pypdfocr/pypdfocr_watcher.py b/pypdfocr/pypdfocr_watcher.py index f7ef556..ec94b04 100755 --- a/pypdfocr/pypdfocr_watcher.py +++ b/pypdfocr/pypdfocr_watcher.py @@ -25,17 +25,19 @@ class PyPdfWatcher(FileSystemEventHandler): Every few seconds pop-off queue and if timestamp older than 3 seconds, process the file else, push it back onto queue. """ - events = {} - events_lock = Lock() def __init__(self, monitor_dir, config): FileSystemEventHandler.__init__(self) + + self.events = {} + self.events_lock = Lock() self.monitor_dir = monitor_dir if not config: config = {} self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file + def start(self): self.observer = Observer() self.observer.schedule(self, self.monitor_dir) @@ -94,19 +96,19 @@ def check_for_new_pdf(self,ev_path): """ if ev_path.endswith(".pdf"): if not ev_path.endswith(("_ocr.pdf", "_test.pdf")): - PyPdfWatcher.events_lock.acquire() - if not ev_path in PyPdfWatcher.events: - PyPdfWatcher.events[ev_path] = time.time() + self.events_lock.acquire() + if not ev_path in self.events: + self.events[ev_path] = time.time() logging.info ("Adding %s to event queue" % ev_path) else: - if PyPdfWatcher.events[ev_path] == -1: + if self.events[ev_path] == -1: logging.info ( "%s removing from event queue" % (ev_path)) - del PyPdfWatcher.events[ev_path] + del self.events[ev_path] else: newTime = time.time() logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) - PyPdfWatcher.events[ev_path] = newTime - PyPdfWatcher.events_lock.release() + self.events[ev_path] = newTime + self.events_lock.release() @@ -133,19 +135,18 @@ def check_queue(self): :returns: Filename if available to process, otherwise None. """ now = time.time() - PyPdfWatcher.events_lock.acquire() - for monitored_file, timestamp in PyPdfWatcher.events.items(): - if timestamp == -1: - del PyPdfWatcher.events[monitored_file] - elif now - timestamp > self.scan_interval: + self.events_lock.acquire() + self.events = {file:ts for file, ts in self.events.items() if ts != -1} + for monitored_file, timestamp in self.events.items(): + if now - timestamp > self.scan_interval: logging.info("Processing new file %s" % (monitored_file)) # Remove this file from the dict - del PyPdfWatcher.events[monitored_file] + del self.events[monitored_file] monitored_file = self.rename_file_with_spaces(monitored_file) - PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler - PyPdfWatcher.events_lock.release() + self.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler + self.events_lock.release() return monitored_file - PyPdfWatcher.events_lock.release() + self.events_lock.release() return None diff --git a/requirements.txt b/requirements.txt index ae91a04..c086189 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ pillow>=2.2 reportlab>=2.7 watchdog>=0.6.0 pypdf2>=1.23 -evernote +evernote; python_version < '3' diff --git a/test/test_evernote.py b/test/test_evernote.py index d337ec4..9afbf02 100644 --- a/test/test_evernote.py +++ b/test/test_evernote.py @@ -2,13 +2,24 @@ import pypdfocr.pypdfocr_filer_evernote as P import pytest import os +import sys -import evernote.api.client -import evernote.edam.type.ttypes as Types +if sys.version_info.major == 2: + import evernote.api.client + import evernote.edam.type.ttypes as Types import hashlib from mock import patch, call + +def test_import(): + """Evernote filing enabled for py2 only""" + expect_enabled = sys.version_info.major == 2 + assert P.ENABLED == expect_enabled + + +@pytest.mark.skipif(sys.version_info.major>=3, + reason="Evernote API not compatible with py3.") class TestEvernote: def test_connecct(self): @@ -22,23 +33,25 @@ def test_connecct(self): def test_file_original(self, mock_move): with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs","test_recipe.pdf") + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs","test_recipe.pdf") # First, test code that does not move original p.file_original(filename) assert (not mock_move.called) # Now test moving - p.set_original_move_folder(os.path.join("temp", "original")) + p.set_original_move_folder(os.path.join(filepath, "temp", "original")) p.file_original(filename) - mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf")) + mock_move.assert_called_with(filename, os.path.join(filepath, "temp","original", "test_recipe_2.pdf")) @patch('os.remove') def test_move_to_folder(self, mock_remove): with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs", "test_recipe.pdf") - foldername = 'recipe' + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs", "test_recipe.pdf") + foldername = os.path.join(filepath, 'recipe') with pytest.raises(AssertionError): p.move_to_matching_folder(filename, foldername) p.set_target_folder('target') @@ -61,7 +74,8 @@ def test_create_note(self): p = P.PyFilerEvernote("TOKEN") notebook = Types.Notebook() notebook.name = "recipe" - filename = "pdfs/test_recipe.pdf" + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs/test_recipe.pdf") note = p._create_evernote_note(notebook, filename) xml = '' assert(note.content.startswith(xml)) diff --git a/test/test_gs.py b/test/test_gs.py index c092b4f..9bebb57 100644 --- a/test/test_gs.py +++ b/test/test_gs.py @@ -1,12 +1,9 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_gs as P +from pypdfocr import pypdfocr_gs as P import pytest import os -import hashlib - -from mock import patch, call -from pytest import skip +from mock import patch class TestGS: diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index b8ae055..c97e39d 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -1,5 +1,7 @@ -#from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P +import os +import sys + +from pypdfocr import pypdfocr as P import pytest @@ -37,11 +39,15 @@ def test_standalone_filing(self): self.p.get_options(opts) # Assert that it checks that the config file is present - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.enable_filing) assert(self.p.config) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_standalone_filing_evernote(self): # Check when evernote is enabled opts = ["blah.pdf"] @@ -50,7 +56,9 @@ def test_standalone_filing_evernote(self): with pytest.raises(SystemExit): self.p.get_options(opts) - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) # Enabling -e should turn on filing too assert(self.p.enable_filing) @@ -65,6 +73,21 @@ def test_standalone_filing_evernote(self): assert(self.p.config) assert(not self.p.watch) + @pytest.mark.skipif(sys.version_info.major==2, + reason="Evernote works on py2") + def test_evernote_disabled(self): + opts = ["blah.pdf"] + opts.append('-e') + # Assert that it checks that the config file is present + with pytest.raises(SystemExit): + self.p.get_options(opts) + + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) + self.p.get_options(opts) + assert not self.p.enable_evernote + def test_standalone_watch_conflict(self): # When pdf file is specified, we don't want to allow watch option opts = ["blah.pdf", '-w'] @@ -80,23 +103,30 @@ def test_watch_filing(self): opts = ['-w temp'] self.p.get_options(opts) assert(self.p.watch_dir) - - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) assert(not self.p.enable_filing) assert(not self.p.enable_evernote) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_watch_filing_evernote(self): - opts = ['-w temp', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts = ['-w temp', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) assert(self.p.enable_filing) assert(self.p.enable_evernote) - opts = ['-w temp', '-f', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts = ['-w temp', '-f', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) diff --git a/test/test_pdf_filer.py b/test/test_pdf_filer.py index 9db7382..bdea966 100644 --- a/test/test_pdf_filer.py +++ b/test/test_pdf_filer.py @@ -1,12 +1,8 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P -import pytest +from pypdfocr import pypdfocr as P import os -import hashlib - from mock import patch, call -from pytest import skip class TestPDFFiler: @@ -19,7 +15,9 @@ def test_file_by_filename(self, mock_move): # Mock the move function so we don't actually end up filing p = P.PyPDFOCR() cwd = os.getcwd() - filename = os.path.join("pdfs", "test_super_long_keyword.pdf") + filename = os.path.join(os.path.dirname(__file__), + "pdfs", + "test_super_long_keyword.pdf") out_filename = filename.replace(".pdf", "_ocr.pdf") if os.path.exists(out_filename): @@ -27,7 +25,10 @@ def test_file_by_filename(self, mock_move): print("Current directory: %s" % os.getcwd()) #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"] - opts = [filename, "--config=test_pypdfocr_config_filename.yaml", "-f", "-n"] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_pypdfocr_config.yaml') + + opts = [filename, "--config={}".format(conf_path), "-f", "-n"] p.go(opts) assert(os.path.exists(out_filename)) @@ -38,4 +39,4 @@ def test_file_by_filename(self, mock_move): - + diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index f3c8db7..21d1c94 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -1,15 +1,11 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P +from pypdfocr import pypdfocr as P import pytest import os import logging from PyPDF2 import PdfFileReader -import smtplib -from mock import Mock from mock import patch, call -from mock import MagicMock -from mock import PropertyMock class TestPydfocr: @@ -23,34 +19,62 @@ def _iter_pdf(self, filename): logging.debug("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text - + + filepath = os.path.dirname(__file__) pdf_tests = [ - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"], - ]), - (".", os.path.join("temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ - ["asynchronous", "subject to", "20 Claims"], # Page 1 - ["FOREIGN PATENT" ], # Page 2 - ]), - (".", os.path.join("temp","target", "default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - ("pdfs", os.path.join("temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"], - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'], - ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join(filepath, "pdfs", "test_recipe.pdf"), + [ ["Simply Recipes"],]), + ( + filepath, + os.path.join(filepath, "temp","target","patents"), + os.path.join("pdfs","test_patent.pdf"), + [ + ["asynchronous", "subject to", "20 Claims"], # Page 1 + ["FOREIGN PATENT" ], # Page 2 + ]), + ( + filepath, + os.path.join(filepath, "temp","target", "default"), + os.path.join("pdfs","test_sherlock.pdf"), + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + os.path.join(filepath, "pdfs"), + os.path.join(filepath, "temp","target","default"), + "test_sherlock.pdf", + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join("..","test", "pdfs", "1.pdf"), + [ + ["Simply","Recipes"], + ]), + ( + filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), + [ + ["Simply","Recipes", 'spinach'], + ]), ] - #@pytest.mark.skipif(True, reason="Just testing") + # @pytest.mark.skipif(True, reason="Just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests) def test_standalone(self, dirname, tgt_folder, filename, expected): """ - Test the single file conversion with no filing. + Test the single file conversion with no filing. Tests relative paths (".."), files in subirs, and files in current dir Checks for that _ocr file is created and keywords found in pdf. Modify :attribute:`pdf_tests` for changing keywords, etc @@ -61,9 +85,9 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): # First redo the unix-style paths, in case we're running on windows # Assume paths in unix-style - dirname = os.path.join(*(dirname.split("/"))) - tgt_folder = os.path.join(*(tgt_folder.split("/"))) - filename = os.path.join(*(filename.split("/"))) + # dirname = os.path.join(*(dirname.split("/"))) + # tgt_folder = os.path.join(*(tgt_folder.split("/"))) + # filename = os.path.join(*(filename.split("/"))) cwd = os.getcwd() @@ -77,12 +101,12 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - #@pytest.mark.skipif(True, reason="just testing") + # @pytest.mark.skipif(True, reason="just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", [pdf_tests[0]]) def test_standalone_email(self, dirname, tgt_folder, filename, expected): """ @@ -104,11 +128,11 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls instance = mock_smtp.return_value assert(instance.starttls.called) @@ -116,7 +140,10 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): assert(instance.sendmail.called) @patch('shutil.move') - @pytest.mark.parametrize("config", [("test_pypdfocr_config.yaml"), ("test_pypdfocr_config_no_move_original.yaml")]) + @pytest.mark.parametrize( + "config", + [(os.path.join(filepath, "test_pypdfocr_config.yaml")), + (os.path.join(filepath, "test_pypdfocr_config_no_move_original.yaml"))]) @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests[0:3]) def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filename, expected): """ @@ -146,18 +173,18 @@ def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filenam if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls calls = [call(out_filename, - os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] + os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] if not "no_move_original" in config: new_file_name = os.path.basename(filename).replace(".pdf", "_2.pdf") calls.append(call(filename, - os.path.abspath(os.path.join("temp","original", new_file_name)))) + os.path.abspath(os.path.join("test", "temp","original", new_file_name)))) mock_move.assert_has_calls(calls) def test_set_binaries(self): diff --git a/test/test_tesseract.py b/test/test_tesseract.py index c137248..5f10131 100644 --- a/test/test_tesseract.py +++ b/test/test_tesseract.py @@ -1,11 +1,9 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_tesseract as P +from pypdfocr import pypdfocr_tesseract as P import pytest import os -import hashlib - -from mock import patch, call +from mock import patch class TestTesseract: @@ -72,7 +70,7 @@ def test_tesseract_presence(self, capsys): def test_tesseract_version(self, capsys): p = P.PyTesseract({}) - p.required = "100" + p.required = "100.01" with pytest.raises(SystemExit): p.make_hocr_from_pnms("") out, err = capsys.readouterr() diff --git a/test/test_watcher.py b/test/test_watcher.py index 8470760..a7ae6ce 100644 --- a/test/test_watcher.py +++ b/test/test_watcher.py @@ -1,15 +1,11 @@ -#from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_watcher as P +from pypdfocr import pypdfocr_watcher as P import pytest -import evernote.api.client -import evernote.edam.type.ttypes as Types -import hashlib import time import os from collections import namedtuple -from mock import patch, call +from mock import patch class TestWatching: @@ -23,20 +19,20 @@ class TestWatching: @patch('shutil.move') @pytest.mark.parametrize(("filename, expected"), filenames) - def test_rename(self, mock_move, filename, expected): + def test_rename(self, mock_move, filename, expected, tmpdir): if expected == None: expected = filename - p = P.PyPdfWatcher('temp',{}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")),{}) # First, test code that does not move original ret = p.rename_file_with_spaces(filename) assert (ret==expected) - def test_check_for_new_pdf(self): + def test_check_for_new_pdf(self, tmpdir): - p = P.PyPdfWatcher('temp', {}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) p.check_for_new_pdf("blah_ocr.pdf") assert("blah_ocr.pdf" not in p.events) p.check_for_new_pdf("blah.pdf") @@ -49,8 +45,8 @@ def test_check_for_new_pdf(self): p.check_for_new_pdf("blah.pdf") assert(p.events['blah.pdf']-time.time() <=1) # Check that time stamp was updated - def test_events(self): - p = P.PyPdfWatcher('temp', {}) + def test_events(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) event = namedtuple('event', 'src_path, dest_path') @@ -63,8 +59,9 @@ def test_events(self): p.on_modified(event(src_path='temp_recipe3.pdf', dest_path=None)) assert('temp_recipe3.pdf' in p.events) - def test_check_queue(self): - p = P.PyPdfWatcher('temp', {}) + def test_check_queue(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) + assert p.events == {} now = time.time() p.events['blah.pdf'] = now f = p.check_queue()