From 765c9e899edc6b2032424d50fef42a64b4990bc8 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:05:59 +0100 Subject: [PATCH 01/31] Remove Evernote from requirements, not currently available for py3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ae91a04..232bfd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ pillow>=2.2 reportlab>=2.7 watchdog>=0.6.0 pypdf2>=1.23 -evernote +# evernote From 8ffb08c0f66ce6af986e14a11a62128d655fb651 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:12:23 +0100 Subject: [PATCH 02/31] Comment out Evernote tests Evernote module not compatible with py3 yes. --- test/test_evernote.py | 220 +++++++++++++++++++++--------------------- 1 file changed, 110 insertions(+), 110 deletions(-) diff --git a/test/test_evernote.py b/test/test_evernote.py index d337ec4..1e08f58 100644 --- a/test/test_evernote.py +++ b/test/test_evernote.py @@ -1,123 +1,123 @@ -#from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_filer_evernote as P -import pytest -import os - -import evernote.api.client -import evernote.edam.type.ttypes as Types -import hashlib - -from mock import patch, call - -class TestEvernote: - - def test_connecct(self): - # Tricky mocking. 
Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: - p = P.PyFilerEvernote("TOKEN") - inst = mock_evernote_client.return_value - assert(inst.get_user_store.called) - - @patch('shutil.move') - def test_file_original(self, mock_move): - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: - p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs","test_recipe.pdf") - - # First, test code that does not move original - p.file_original(filename) - assert (not mock_move.called) - - # Now test moving - p.set_original_move_folder(os.path.join("temp", "original")) - p.file_original(filename) - mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf")) - - @patch('os.remove') - def test_move_to_folder(self, mock_remove): - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: - p = P.PyFilerEvernote("TOKEN") - filename = os.path.join("pdfs", "test_recipe.pdf") - foldername = 'recipe' - with pytest.raises(AssertionError): - p.move_to_matching_folder(filename, foldername) - p.set_target_folder('target') - with pytest.raises(AssertionError): - p.move_to_matching_folder(filename, foldername) - p.set_default_folder('default') - p.move_to_matching_folder(filename, None) - p.move_to_matching_folder(filename, foldername) +# #from pypdfocr import PyPDFOCR as P +# import pypdfocr.pypdfocr_filer_evernote as P +# import pytest +# import os + +# import evernote.api.client +# import evernote.edam.type.ttypes as Types +# import hashlib + +# from mock import patch, call + +# class TestEvernote: + +# def test_connecct(self): +# # Tricky mocking. 
Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# inst = mock_evernote_client.return_value +# assert(inst.get_user_store.called) + +# @patch('shutil.move') +# def test_file_original(self, mock_move): +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# filename = os.path.join("pdfs","test_recipe.pdf") + +# # First, test code that does not move original +# p.file_original(filename) +# assert (not mock_move.called) + +# # Now test moving +# p.set_original_move_folder(os.path.join("temp", "original")) +# p.file_original(filename) +# mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf")) + +# @patch('os.remove') +# def test_move_to_folder(self, mock_remove): +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# filename = os.path.join("pdfs", "test_recipe.pdf") +# foldername = 'recipe' +# with pytest.raises(AssertionError): +# p.move_to_matching_folder(filename, foldername) +# p.set_target_folder('target') +# with pytest.raises(AssertionError): +# p.move_to_matching_folder(filename, foldername) +# p.set_default_folder('default') +# p.move_to_matching_folder(filename, None) +# p.move_to_matching_folder(filename, foldername) - mock_client = mock_evernote_client.return_value - assert(mock_client.get_note_store.called) - assert(mock_client.get_note_store.return_value.createNote.called) - mock_remove.assert_called_with(filename) +# mock_client = mock_evernote_client.return_value +# assert(mock_client.get_note_store.called) +# assert(mock_client.get_note_store.return_value.createNote.called) +# mock_remove.assert_called_with(filename) - def test_create_note(self): - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as 
mock_evernote_client: - p = P.PyFilerEvernote("TOKEN") - notebook = Types.Notebook() - notebook.name = "recipe" - filename = "pdfs/test_recipe.pdf" - note = p._create_evernote_note(notebook, filename) - xml = '' - assert(note.content.startswith(xml)) +# def test_create_note(self): +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# notebook = Types.Notebook() +# notebook.name = "recipe" +# filename = "pdfs/test_recipe.pdf" +# note = p._create_evernote_note(notebook, filename) +# xml = '' +# assert(note.content.startswith(xml)) - md5 = hashlib.md5() - with open(filename,'rb') as f: - pdf_bytes = f.read() - md5.update(pdf_bytes) +# md5 = hashlib.md5() +# with open(filename,'rb') as f: +# pdf_bytes = f.read() +# md5.update(pdf_bytes) - md5hash = md5.hexdigest() +# md5hash = md5.hexdigest() - assert(md5hash in note.content) - assert(note.resources[0].data.bodyHash == md5hash) - - - def test_check_notebook(self): - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: - p = P.PyFilerEvernote("TOKEN") - p._check_and_make_notebook("new_notebook") - # Let's assert that we tried to create a new notebook - mock_client = mock_evernote_client.return_value - assert(mock_client.get_note_store.called) - create_func = mock_client.get_note_store.return_value.createNotebook - update_func = mock_client.get_note_store.return_value.updateNotebook - assert(create_func.called) - assert(not update_func.called) - notebook = create_func.call_args[0][0] - assert(notebook.name == 'new_notebook') - - # Now, let's setup a value for the notebooks, so we test the code for - # a "pre-exisiting" notebook - test_notebook = Types.Notebook() - test_notebook.name = "new_notebook" - mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook] - p._check_and_make_notebook("new_notebook") - - # Now check that the code to update a notebook stack is correct - 
test_notebook.stack = "new_stack" - update_func = mock_client.get_note_store.return_value.updateNotebook - p.set_target_folder("Boogie") - p._check_and_make_notebook("new_notebook") - # Check that the update call was called with correct arguments - assert(update_func.called) - notebook = update_func.call_args[0][0] - assert(notebook.stack == 'Boogie') +# assert(md5hash in note.content) +# assert(note.resources[0].data.bodyHash == md5hash) + + +# def test_check_notebook(self): +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# p._check_and_make_notebook("new_notebook") +# # Let's assert that we tried to create a new notebook +# mock_client = mock_evernote_client.return_value +# assert(mock_client.get_note_store.called) +# create_func = mock_client.get_note_store.return_value.createNotebook +# update_func = mock_client.get_note_store.return_value.updateNotebook +# assert(create_func.called) +# assert(not update_func.called) +# notebook = create_func.call_args[0][0] +# assert(notebook.name == 'new_notebook') + +# # Now, let's setup a value for the notebooks, so we test the code for +# # a "pre-exisiting" notebook +# test_notebook = Types.Notebook() +# test_notebook.name = "new_notebook" +# mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook] +# p._check_and_make_notebook("new_notebook") + +# # Now check that the code to update a notebook stack is correct +# test_notebook.stack = "new_stack" +# update_func = mock_client.get_note_store.return_value.updateNotebook +# p.set_target_folder("Boogie") +# p._check_and_make_notebook("new_notebook") +# # Check that the update call was called with correct arguments +# assert(update_func.called) +# notebook = update_func.call_args[0][0] +# assert(notebook.stack == 'Boogie') - def test_add_folder_target(self): - with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: - p = 
P.PyFilerEvernote("TOKEN") - p.add_folder_target("folder1", ["target1", "target2"]) - with pytest.raises(AssertionError): - p.add_folder_target("folder1", ["target1", "target2"]) - p.add_folder_target("folder2", ["target1", "target2"]) - assert("folder1" in p.folder_targets.keys()) - assert("folder2" in p.folder_targets.keys()) +# def test_add_folder_target(self): +# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: +# p = P.PyFilerEvernote("TOKEN") +# p.add_folder_target("folder1", ["target1", "target2"]) +# with pytest.raises(AssertionError): +# p.add_folder_target("folder1", ["target1", "target2"]) +# p.add_folder_target("folder2", ["target1", "target2"]) +# assert("folder1" in p.folder_targets.keys()) +# assert("folder2" in p.folder_targets.keys()) From 822582c4652d7e034691c7f636a206362548a60a Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:16:17 +0100 Subject: [PATCH 03/31] Update print to function calls for py3 compatibility --- test/test_pypdfocr.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index f3c8db7..faf917f 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -77,8 +77,8 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) @@ -104,8 +104,8 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): if len(expected) > i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) @@ -146,8 +146,8 @@ def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filenam if len(expected) > 
i: for keyword in expected[i]: assert(keyword in t) - print ("\n----------------------\nPage %d\n" % i) - print t + print("\n----------------------\nPage %d\n" % i) + print(t) os.remove(out_filename) os.chdir(cwd) From dffacba2cdfc989028e1927e79faf7b926f20901 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:20:01 +0100 Subject: [PATCH 04/31] Remove unused imports from test files --- test/test_gs.py | 5 +---- test/test_pdf_filer.py | 4 ---- test/test_pypdfocr.py | 4 ---- test/test_tesseract.py | 4 +--- test/test_watcher.py | 6 +----- 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/test/test_gs.py b/test/test_gs.py index c092b4f..94d1130 100644 --- a/test/test_gs.py +++ b/test/test_gs.py @@ -3,10 +3,7 @@ import pytest import os -import hashlib - -from mock import patch, call -from pytest import skip +from mock import patch class TestGS: diff --git a/test/test_pdf_filer.py b/test/test_pdf_filer.py index 9db7382..823466c 100644 --- a/test/test_pdf_filer.py +++ b/test/test_pdf_filer.py @@ -1,12 +1,8 @@ #from pypdfocr import PyPDFOCR as P import pypdfocr.pypdfocr as P -import pytest import os -import hashlib - from mock import patch, call -from pytest import skip class TestPDFFiler: diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index faf917f..ec58d05 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -5,11 +5,7 @@ import logging from PyPDF2 import PdfFileReader -import smtplib -from mock import Mock from mock import patch, call -from mock import MagicMock -from mock import PropertyMock class TestPydfocr: diff --git a/test/test_tesseract.py b/test/test_tesseract.py index c137248..cbcd87e 100644 --- a/test/test_tesseract.py +++ b/test/test_tesseract.py @@ -3,9 +3,7 @@ import pytest import os -import hashlib - -from mock import patch, call +from mock import patch class TestTesseract: diff --git a/test/test_watcher.py b/test/test_watcher.py index 8470760..1d3507e 100644 --- a/test/test_watcher.py +++ 
b/test/test_watcher.py @@ -1,15 +1,11 @@ -#from pypdfocr import PyPDFOCR as P import pypdfocr.pypdfocr_watcher as P import pytest -import evernote.api.client -import evernote.edam.type.ttypes as Types -import hashlib import time import os from collections import namedtuple -from mock import patch, call +from mock import patch class TestWatching: From 53fc18c16dd73ce9eded61a2e30db71a03106215 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:26:57 +0100 Subject: [PATCH 05/31] Replace tabs with spaces --- pypdfocr/pypdfocr_gs.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..d20f63a 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -92,21 +92,21 @@ def _find_windows_gs(self): listing = os.listdir('.') # Find all possible gs* sub-directories - listing = [x for x in listing if x.startswith('gs')] + listing = [x for x in listing if x.startswith('gs')] # TODO: Make this a natural sort listing.sort(reverse=True) - for bindir in listing: - binpath = os.path.join(bindir,'bin') - if not os.path.exists(binpath): continue - os.chdir(binpath) + for bindir in listing: + binpath = os.path.join(bindir,'bin') + if not os.path.exists(binpath): continue + os.chdir(binpath) # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version) - gswin = glob.glob('gswin*c.exe') - if len(gswin) == 0: - continue - gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) - os.chdir(cwd) - return gs + gswin = glob.glob('gswin*c.exe') + if len(gswin) == 0: + continue + gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?) 
+ os.chdir(cwd) + return gs if not gs: error(self.msgs['GS_MISSING_BINARY']) From 2a09669d3f890774696963fe043af78d43ec642a Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:29:38 +0100 Subject: [PATCH 06/31] Replace tabs with spaces --- pypdfocr/pypdfocr_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index bdc1f86..7aa6ab5 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -155,7 +155,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): merger.append(PdfFileReader(file(text_pdf_filename, 'rb'))) merger.write(all_text_filename) merger.close() - del merger + del merger writer = PdfFileWriter() From ad019dd27f6a1f871f2df8b7d5161565e14437bd Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:35:58 +0100 Subject: [PATCH 07/31] Update print to function calls for py3 compatibility --- pypdfocr/pypdfocr.py | 2 +- pypdfocr/pypdfocr_gs.py | 2 +- pypdfocr/pypdfocr_pdffiler.py | 4 ++-- pypdfocr/pypdfocr_preprocess.py | 6 +++--- pypdfocr/pypdfocr_tesseract.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 7ee7e9e..bb217d2 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -467,7 +467,7 @@ def go(self, argv): except KeyboardInterrupt: break except Exception as e: - print traceback.print_exc(e) + print(traceback.print_exc(e)) py_watcher.stop() else: diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index d20f63a..21ff3df 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -174,7 +174,7 @@ def _run_gs(self, options, output_filename, pdf_filename): out = subprocess.check_output(cmd, shell=True) except subprocess.CalledProcessError as e: - print e.output + print(e.output) if "undefined in .getdeviceparams" in e.output: error(self.msgs['GS_OUTDATED']) else: diff --git a/pypdfocr/pypdfocr_pdffiler.py 
b/pypdfocr/pypdfocr_pdffiler.py index 1bb23f5..1b54f8b 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -59,7 +59,7 @@ def _get_matching_folder(self, pdfText): # No match found, so return return None - def file_original (self, original_filename): + def file_original(self, original_filename): return self.filer.file_original(original_filename) def move_to_matching_folder(self, filename): @@ -76,5 +76,5 @@ def move_to_matching_folder(self, filename): if __name__ == '__main__': p = PyPdfFiler(PyFilerDirs()) for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): - print (page_text) + print(page_text) diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index e942cc3..13ad7cd 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -58,7 +58,7 @@ def cmd(self, cmd_list): logging.debug(out) return out except subprocess.CalledProcessError as e: - print e.output + print(e.output) self._warn("Could not run command %s" % cmd_list) @@ -102,14 +102,14 @@ def preprocess(self, in_filenames): logging.info("Starting preprocessing parallel execution") preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns)) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... 
terminating") pool.terminate() #sys,exit(-1) raise finally: pool.join() - logging.info ("Completed preprocessing") + logging.info("Completed preprocessing") return preprocessed_filenames diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 8f246ee..8d0d0ef 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -166,7 +166,7 @@ def make_hocr_from_pnm(self, img_filename): ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: # Could not run tesseract - print e.output + print(e.output) self._warn (self.msgs['TS_FAILED']) if os.path.isfile(hocr_filename): From bba9c8f6ca4279342b652d2d03abcd2404c34128 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 13:56:13 +0100 Subject: [PATCH 08/31] Update imports for py3 * Py3 requires absolute imports * Multiprocessing module has been reorganized for py3.4+ (see https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing) --- pypdfocr/pypdfocr.py | 36 ++++++++++++++++++---------- pypdfocr/pypdfocr_filer_dirs.py | 2 +- pypdfocr/pypdfocr_filer_evernote.py | 2 +- pypdfocr/pypdfocr_multiprocessing.py | 16 +++++++++---- pypdfocr/pypdfocr_pdf.py | 2 +- pypdfocr/pypdfocr_pdffiler.py | 4 ++-- pypdfocr/pypdfocr_preprocess.py | 2 +- pypdfocr/pypdfocr_tesseract.py | 2 +- 8 files changed, 42 insertions(+), 24 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index bb217d2..690862b 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -21,24 +21,36 @@ import itertools from functools import wraps -from version import __version__ +from pypdfocr.version import __version__ from PIL import Image import yaml import multiprocessing -# Replace the Popen routine to allow win32 pyinstaller to build -from multiprocessing import forking -from pypdfocr_multiprocessing import _Popen + +""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systms + 
+ https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing +""" +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking + +from pypdfocr.pypdfocr_multiprocessing import _Popen forking.Popen = _Popen -from pypdfocr_pdf import PyPdf -from pypdfocr_tesseract import PyTesseract -from pypdfocr_gs import PyGs -from pypdfocr_watcher import PyPdfWatcher -from pypdfocr_pdffiler import PyPdfFiler -from pypdfocr_filer_dirs import PyFilerDirs -from pypdfocr_filer_evernote import PyFilerEvernote -from pypdfocr_preprocess import PyPreprocess +from pypdfocr.pypdfocr_pdf import PyPdf +from pypdfocr.pypdfocr_tesseract import PyTesseract +from pypdfocr.pypdfocr_gs import PyGs +from pypdfocr.pypdfocr_watcher import PyPdfWatcher +from pypdfocr.pypdfocr_pdffiler import PyPdfFiler +from pypdfocr.pypdfocr_filer_dirs import PyFilerDirs +from pypdfocr.pypdfocr_filer_evernote import PyFilerEvernote +from pypdfocr.pypdfocr_preprocess import PyPreprocess def error(text): print("ERROR: %s" % text) diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py index dc19330..e6d1505 100644 --- a/pypdfocr/pypdfocr_filer_dirs.py +++ b/pypdfocr/pypdfocr_filer_dirs.py @@ -16,7 +16,7 @@ import os import shutil -from pypdfocr_filer import PyFiler +from pypdfocr.pypdfocr_filer import PyFiler """ Implementation of a filer class diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 80ec115..8389398 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -19,7 +19,7 @@ import time import sys -from pypdfocr_filer import PyFiler +from pypdfocr.pypdfocr_filer import PyFiler import functools diff --git a/pypdfocr/pypdfocr_multiprocessing.py b/pypdfocr/pypdfocr_multiprocessing.py index 3666268..253bd55 100644 --- 
a/pypdfocr/pypdfocr_multiprocessing.py +++ b/pypdfocr/pypdfocr_multiprocessing.py @@ -13,19 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys, os, multiprocessing.forking import logging +import os +import sys """ Special work-around to support multiprocessing and pyinstaller --onefile on windows systms https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing """ +try: + # Python 3.4+ + if sys.platform.startswith('win'): + import multiprocessing.popen_spawn_win32 as forking + else: + import multiprocessing.popen_fork as forking +except ImportError: + import multiprocessing.forking as forking -import multiprocessing.forking as forking -import os -import sys -class _Popen(multiprocessing.forking.Popen): +class _Popen(forking.Popen): def __init__(self, *args, **kw): if hasattr(sys, 'frozen'): # We have to set original _MEIPASS2 value from sys._MEIPASS diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index 7aa6ab5..b4a9085 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -52,7 +52,7 @@ from reportlab.lib.enums import TA_LEFT from reportlab.platypus.paragraph import Paragraph -from pypdfocr_util import Retry +from pypdfocr.pypdfocr_util import Retry from functools import partial class RotatedPara(Paragraph): diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 1b54f8b..646fc64 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -25,8 +25,8 @@ import shutil from PyPDF2 import PdfFileReader -from pypdfocr_filer import PyFiler -from pypdfocr_filer_dirs import PyFilerDirs +from pypdfocr.pypdfocr_filer import PyFiler +from pypdfocr.pypdfocr_filer_dirs import PyFilerDirs class PyPdfFiler(object): def __init__(self, filer): diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index 13ad7cd..4775089 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ 
b/pypdfocr/pypdfocr_preprocess.py @@ -28,7 +28,7 @@ import signal from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from pypdfocr.pypdfocr_interrupts import init_worker # Ugly hack to pass in object method to the multiprocessing library # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90 diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 8d0d0ef..7d615dc 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -26,7 +26,7 @@ from subprocess import CalledProcessError from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from pypdfocr.pypdfocr_interrupts import init_worker def error(text): print("ERROR: %s" % text) From 82c9649d31ca70d64b9ad091b05fe033017ef6b5 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 14:03:11 +0100 Subject: [PATCH 09/31] Comment out evernote imports, not working for py3 at this time --- pypdfocr/pypdfocr_filer_evernote.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 8389398..666b874 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -23,13 +23,13 @@ import functools -from evernote.api.client import EvernoteClient -import evernote.edam.type.ttypes as Types -import evernote.edam.userstore.constants as UserStoreConstants -from evernote.edam.error.ttypes import EDAMUserException -from evernote.edam.error.ttypes import EDAMSystemException -from evernote.edam.error.ttypes import EDAMNotFoundException -from evernote.edam.error.ttypes import EDAMErrorCode +# from evernote.api.client import EvernoteClient +# import evernote.edam.type.ttypes as Types +# import evernote.edam.userstore.constants as UserStoreConstants +# from evernote.edam.error.ttypes import EDAMUserException +# from evernote.edam.error.ttypes import EDAMSystemException +# from 
evernote.edam.error.ttypes import EDAMNotFoundException +# from evernote.edam.error.ttypes import EDAMErrorCode """ From 116bd7fe1150149cfe307983f144aa4b21cb714b Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 14:04:37 +0100 Subject: [PATCH 10/31] Remove unused imports that didn't work in py3 --- pypdfocr/pypdfocr_pdf.py | 1 - pypdfocr/pypdfocr_pdffiler.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index b4a9085..5a737cb 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -31,7 +31,6 @@ import tempfile import glob -import cStringIO import base64 import zlib import math diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 646fc64..14d2a7b 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -18,7 +18,6 @@ on keywords """ -from sets import Set import sys, os import re import logging From c1a629663cd7ac9fdef59aaa040e9365fad94cfd Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 14:52:44 +0100 Subject: [PATCH 11/31] Update tesseract module for py3 compatability * Use DistUtils.StrictVersion for version checking, makes it easier * Update except listings to py3 format * Force encoding for subprocess output to avoid having bytestrings in py3 --- pypdfocr/pypdfocr_tesseract.py | 38 +++++++--------------------------- test/test_tesseract.py | 2 +- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 7d615dc..aba973c 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -23,6 +23,7 @@ import logging import subprocess import glob +from distutils.version import StrictVersion from subprocess import CalledProcessError from multiprocessing import Pool @@ -82,7 +83,8 @@ def _is_version_uptodate(self): cmd = '%s -v' % (self.binary) logging.info(cmd) try: - ret_output = subprocess.check_output(cmd, shell=True, 
stderr=subprocess.STDOUT) + ret_output = subprocess.check_output( + cmd, shell=True, encoding="utf-8", stderr=subprocess.STDOUT) except CalledProcessError: # Could not run tesseract error(self.msgs['TS_MISSING']) @@ -93,36 +95,12 @@ def _is_version_uptodate(self): ver_str = line.split(' ')[1] if ver_str.endswith('dev'): # Fix for version strings that end in 'dev' ver_str = ver_str[:-3] - - # Iterate through the version dots - ver = [int(x) for x in ver_str.split('.')] - req = [int(x) for x in self.required.split('.')] - # Aargh, in windows 3.02.02 is reported as version 3.02 - # SFKM if str(os.name) == 'nt': - req = req[:2] - - version_good = False - for i,num in enumerate(req): - if len(ver) < i+1: - # This minor version number is not present in tesseract, so it must be - # lower than required. (3.02 < 3.02.01) - break - if ver[i]==num and len(ver) == i+1 and len(ver)==len(req): - # 3.02.02 == 3.02.02 - version_good = True - continue - if ver[i]>num: - # 4.0 > 3.02.02 - # 3.03.02 > 3.02.02 - version_good = True - break - if ver[i]= StrictVersion(req)), ver_str def _warn(self, msg): # pragma: no cover print("WARNING: %s" % msg) @@ -141,7 +119,7 @@ def make_hocr_from_pnms(self, fns): try: hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) pool.close() - except KeyboardInterrupt or Exception: + except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... 
terminating") pool.terminate() raise diff --git a/test/test_tesseract.py b/test/test_tesseract.py index cbcd87e..04757f4 100644 --- a/test/test_tesseract.py +++ b/test/test_tesseract.py @@ -70,7 +70,7 @@ def test_tesseract_presence(self, capsys): def test_tesseract_version(self, capsys): p = P.PyTesseract({}) - p.required = "100" + p.required = "100.01" with pytest.raises(SystemExit): p.make_hocr_from_pnms("") out, err = capsys.readouterr() From 8a411452e1b1b0edee59c55bad6bf9c4bacc7127 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 19:06:10 +0100 Subject: [PATCH 12/31] Make path to test config files more absolute --- test/test_option_parsing.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index b8ae055..0def2f4 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -1,4 +1,5 @@ -#from pypdfocr import PyPDFOCR as P +import os + import pypdfocr.pypdfocr as P import pytest @@ -37,7 +38,9 @@ def test_standalone_filing(self): self.p.get_options(opts) # Assert that it checks that the config file is present - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.enable_filing) assert(self.p.config) @@ -50,7 +53,9 @@ def test_standalone_filing_evernote(self): with pytest.raises(SystemExit): self.p.get_options(opts) - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) # Enabling -e should turn on filing too assert(self.p.enable_filing) @@ -80,8 +85,9 @@ def test_watch_filing(self): opts = ['-w temp'] self.p.get_options(opts) assert(self.p.watch_dir) - - opts.append('--config=test_option_config.yaml') + conf_path = os.path.join( 
+ os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) @@ -96,7 +102,10 @@ def test_watch_filing_evernote(self): assert(self.p.enable_filing) assert(self.p.enable_evernote) - opts = ['-w temp', '-f', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) + opts = ['-w temp', '-f', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) From 1752f65625ea88d571ec9a11bc4f397a029faad4 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 19:45:22 +0100 Subject: [PATCH 13/31] Make path to test config files more absolute --- test/test_option_parsing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index 0def2f4..3b16bf3 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -95,7 +95,9 @@ def test_watch_filing(self): assert(not self.p.enable_evernote) def test_watch_filing_evernote(self): - opts = ['-w temp', '-e', '--config=test_option_config.yaml'] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_option_config.yaml') + opts = ['-w temp', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) assert(self.p.config) @@ -104,7 +106,6 @@ def test_watch_filing_evernote(self): conf_path = os.path.join( os.path.dirname(__file__), 'test_option_config.yaml') - opts.append('--config={}'.format(conf_path)) opts = ['-w temp', '-f', '-e', '--config={}'.format(conf_path)] self.p.get_options(opts) assert(self.p.watch) From 78fe57fce19bf0302e7678553e2ef264816d9fe3 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 19:48:24 +0100 Subject: [PATCH 14/31] Re-raise original error message With a bare `raise` not being acceptable in py3, just raising 
an exception of the same type lost much of the useful information, making debugging hard. Now the original error is preserved, and re-raised appropriately. --- pypdfocr/pypdfocr.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 690862b..cce86cf 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -61,12 +61,14 @@ def retry(count=5, exc_type = Exception): def decorator(func): @wraps(func) def result(*args, **kwargs): + err = None for _ in range(count): try: return func(*args, **kwargs) - except exc_type: - pass - raise + except exc_type as e: + err = e + else: + raise err return result return decorator @@ -173,7 +175,7 @@ def get_options(self, argv): filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') - #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), + # filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', From dbdf457f7f320547234830e73e34849c88e85260 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 19:56:52 +0100 Subject: [PATCH 15/31] Update dict key checking to py3 compatible syntax --- pypdfocr/pypdfocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index cce86cf..8dcf181 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -381,11 +381,11 @@ def run_conversion(self, pdf_filename): time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing - if locals().has_key("fns"): # Have to check if this was set before exception raised 
+ if "fns" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) - if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised + if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: From 4790d2e0357aae2639852f3f7dcd913bb815c78e Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 20:08:51 +0100 Subject: [PATCH 16/31] Wrap zip function in list calls for py3 compatibility --- pypdfocr/pypdfocr_tesseract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index aba973c..5452446 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -117,7 +117,7 @@ def make_hocr_from_pnms(self, fns): pool = Pool(processes=self.threads, initializer=init_worker) try: - hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) + hocr_filenames = pool.map(unwrap_self, list(zip([self]*len(fns), fns))) pool.close() except (KeyboardInterrupt, Exception): print("Caught keyboard interrupt... 
terminating") @@ -126,7 +126,7 @@ def make_hocr_from_pnms(self, fns): finally: pool.join() - return zip(fns,hocr_filenames) + return list(zip(fns,hocr_filenames)) def make_hocr_from_pnm(self, img_filename): From 9a0d318856639fb95333bcbc04728578d94e985e Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 20:18:25 +0100 Subject: [PATCH 17/31] More updates for py3 compatibility * Use context manager for opening files, to make sure they are closed properly * Different (better) string conversions now py3 is unicode default --- pypdfocr/pypdfocr_pdf.py | 3 ++- pypdfocr/pypdfocr_pdffiler.py | 8 ++++---- test/test_pdf_filer.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index 5a737cb..a8ae5d3 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -151,7 +151,8 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename)) merger = PdfFileMerger() for text_pdf_filename in text_pdf_filenames: - merger.append(PdfFileReader(file(text_pdf_filename, 'rb'))) + with open(text_pdf_filename, 'rb') as f: + merger.append(PdfFileReader(f)) merger.write(all_text_filename) merger.close() del merger diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 14d2a7b..7b1c05d 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -35,7 +35,7 @@ def __init__(self, filer): # Whether to fall back on filename for matching keywords against # if there is no match in the text - self.file_using_filename = False + self.file_using_filename = False def iter_pdf_page_text(self, filename): self.filename = filename @@ -43,7 +43,7 @@ def iter_pdf_page_text(self, filename): logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 
'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text @@ -55,7 +55,7 @@ def _get_matching_folder(self, pdfText): if s in searchText: logging.info("Matched keyword '%s'" % s) return folder - # No match found, so return + # No match found, so return return None def file_original(self, original_filename): @@ -71,7 +71,7 @@ def move_to_matching_folder(self, filename): tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) return tgt_file - + if __name__ == '__main__': p = PyPdfFiler(PyFilerDirs()) for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): diff --git a/test/test_pdf_filer.py b/test/test_pdf_filer.py index 823466c..c85c1fa 100644 --- a/test/test_pdf_filer.py +++ b/test/test_pdf_filer.py @@ -15,7 +15,9 @@ def test_file_by_filename(self, mock_move): # Mock the move function so we don't actually end up filing p = P.PyPDFOCR() cwd = os.getcwd() - filename = os.path.join("pdfs", "test_super_long_keyword.pdf") + filename = os.path.join(os.path.dirname(__file__), + "pdfs", + "test_super_long_keyword.pdf") out_filename = filename.replace(".pdf", "_ocr.pdf") if os.path.exists(out_filename): @@ -23,7 +25,10 @@ def test_file_by_filename(self, mock_move): print("Current directory: %s" % os.getcwd()) #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"] - opts = [filename, "--config=test_pypdfocr_config_filename.yaml", "-f", "-n"] + conf_path = os.path.join( + os.path.dirname(__file__), 'test_pypdfocr_config.yaml') + + opts = [filename, "--config={}".format(conf_path), "-f", "-n"] p.go(opts) assert(os.path.exists(out_filename)) @@ -34,4 +39,4 @@ def test_file_by_filename(self, mock_move): - + From 2617352b1314ae8834a36a8683877f3611aa10a4 Mon Sep 17 00:00:00 2001 From: Ben S Date: Thu, 12 Oct 2017 20:52:36 +0100 Subject: [PATCH 18/31] Update test paths to be absolute instead of relative when testing --- test/test_pypdfocr.py | 82 ++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 25 
deletions(-) diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index ec58d05..e0ad274 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -19,34 +19,63 @@ def _iter_pdf(self, filename): logging.debug("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) for pgnum in range(reader.getNumPages()): text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') + # text = text.encode('ascii', 'ignore') text = text.replace('\n', ' ') yield text - + + filepath = os.path.dirname(__file__) pdf_tests = [ - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"], - ]), - (".", os.path.join("temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ - ["asynchronous", "subject to", "20 Claims"], # Page 1 - ["FOREIGN PATENT" ], # Page 2 - ]), - (".", os.path.join("temp","target", "default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - ("pdfs", os.path.join("temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1 - ["hundreds of times" ], # Page 2 - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"], - ]), - (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'], - ]), + ( + # Test the single file conversion with no filing. 
+ filepath, + os.path.join(filepath, "temp","target","recipe"), + os.path.join(filepath, "pdfs", "test_recipe.pdf"), + [ ["Simply Recipes"],]), + ( + filepath, + os.path.join("temp","target","patents"), + os.path.join("pdfs","test_patent.pdf"), + [ + ["asynchronous", "subject to", "20 Claims"], # Page 1 + ["FOREIGN PATENT" ], # Page 2 + ]), + ( + filepath, + os.path.join("temp","target", "default"), + os.path.join("pdfs","test_sherlock.pdf"), + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + os.path.join(filepath, "pdfs"), + os.path.join("temp","target","default"), + "test_sherlock.pdf", + [ + ["Bohemia", "Trincomalee"], # Page 1 + ["hundreds of times" ], # Page 2 + ]), + ( + filepath, + os.path.join("temp","target","recipe"), + os.path.join("..","test", "pdfs", "1.pdf"), + [ + ["Simply","Recipes"], + ]), + ( + filepath, + os.path.join("temp","target","recipe"), + os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), + [ + ["Simply","Recipes", 'spinach'], + ]), ] #@pytest.mark.skipif(True, reason="Just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests) def test_standalone(self, dirname, tgt_folder, filename, expected): """ - Test the single file conversion with no filing. + Test the single file conversion with no filing. Tests relative paths (".."), files in subirs, and files in current dir Checks for that _ocr file is created and keywords found in pdf. 
Modify :attribute:`pdf_tests` for changing keywords, etc @@ -57,9 +86,9 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): # First redo the unix-style paths, in case we're running on windows # Assume paths in unix-style - dirname = os.path.join(*(dirname.split("/"))) - tgt_folder = os.path.join(*(tgt_folder.split("/"))) - filename = os.path.join(*(filename.split("/"))) + # dirname = os.path.join(*(dirname.split("/"))) + # tgt_folder = os.path.join(*(tgt_folder.split("/"))) + # filename = os.path.join(*(filename.split("/"))) cwd = os.getcwd() @@ -104,7 +133,7 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls instance = mock_smtp.return_value assert(instance.starttls.called) @@ -112,7 +141,10 @@ def test_standalone_email(self, dirname, tgt_folder, filename, expected): assert(instance.sendmail.called) @patch('shutil.move') - @pytest.mark.parametrize("config", [("test_pypdfocr_config.yaml"), ("test_pypdfocr_config_no_move_original.yaml")]) + @pytest.mark.parametrize( + "config", + [(os.path.join(filepath, "test_pypdfocr_config.yaml")), + (os.path.join(filepath, "test_pypdfocr_config_no_move_original.yaml"))]) @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests[0:3]) def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filename, expected): """ @@ -146,7 +178,7 @@ def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filenam print(t) os.remove(out_filename) os.chdir(cwd) - + # Assert the smtp calls calls = [call(out_filename, os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] From 7a9198d0ed416a08360266808029f8127e863d48 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 10:44:09 +0100 Subject: [PATCH 19/31] More paths made absolute to allow tests to be run more easily and from anywhere --- test/test_pypdfocr.py | 21 ++++++++++----------- 1 file changed, 
10 insertions(+), 11 deletions(-) diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index e0ad274..740e1f1 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -26,22 +26,21 @@ def _iter_pdf(self, filename): filepath = os.path.dirname(__file__) pdf_tests = [ ( - # Test the single file conversion with no filing. filepath, os.path.join(filepath, "temp","target","recipe"), os.path.join(filepath, "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"],]), ( filepath, - os.path.join("temp","target","patents"), + os.path.join(filepath, "temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ ["asynchronous", "subject to", "20 Claims"], # Page 1 ["FOREIGN PATENT" ], # Page 2 - ]), + ]), ( filepath, - os.path.join("temp","target", "default"), + os.path.join(filepath, "temp","target", "default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1 @@ -49,7 +48,7 @@ def _iter_pdf(self, filename): ]), ( os.path.join(filepath, "pdfs"), - os.path.join("temp","target","default"), + os.path.join(filepath, "temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1 @@ -57,21 +56,21 @@ def _iter_pdf(self, filename): ]), ( filepath, - os.path.join("temp","target","recipe"), + os.path.join(filepath, "temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"], ]), ( filepath, - os.path.join("temp","target","recipe"), + os.path.join(filepath, "temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'], ]), ] - #@pytest.mark.skipif(True, reason="Just testing") + # @pytest.mark.skipif(True, reason="Just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests) def test_standalone(self, dirname, tgt_folder, filename, expected): """ @@ -107,7 +106,7 @@ def test_standalone(self, dirname, tgt_folder, filename, expected): os.remove(out_filename) os.chdir(cwd) - 
#@pytest.mark.skipif(True, reason="just testing") + # @pytest.mark.skipif(True, reason="just testing") @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", [pdf_tests[0]]) def test_standalone_email(self, dirname, tgt_folder, filename, expected): """ @@ -181,11 +180,11 @@ def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filenam # Assert the smtp calls calls = [call(out_filename, - os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] + os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] if not "no_move_original" in config: new_file_name = os.path.basename(filename).replace(".pdf", "_2.pdf") calls.append(call(filename, - os.path.abspath(os.path.join("temp","original", new_file_name)))) + os.path.abspath(os.path.join("test", "temp","original", new_file_name)))) mock_move.assert_has_calls(calls) def test_set_binaries(self): From 68e6de1d40bac7fbf2444c7773be202f61331040 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 11:11:39 +0100 Subject: [PATCH 20/31] Make queue checking more robust in watcher * Use tmpdir in tests to ensure a new directory is used for every test and cleaned up afterwards. * Use `self` instead of class name when addressing class attributes * Define class attributes in `__init__` to prevent data-leaks between multiple instances of the class. * Move queue purging out of loop to avoid "Dictionary changed size..." errors. --- pypdfocr/pypdfocr_watcher.py | 37 ++++++++++++++++++------------------ test/test_watcher.py | 17 +++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/pypdfocr/pypdfocr_watcher.py b/pypdfocr/pypdfocr_watcher.py index f7ef556..d23d028 100755 --- a/pypdfocr/pypdfocr_watcher.py +++ b/pypdfocr/pypdfocr_watcher.py @@ -25,17 +25,19 @@ class PyPdfWatcher(FileSystemEventHandler): Every few seconds pop-off queue and if timestamp older than 3 seconds, process the file else, push it back onto queue. 
""" - events = {} - events_lock = Lock() def __init__(self, monitor_dir, config): FileSystemEventHandler.__init__(self) + + self.events = {} + self.events_lock = Lock() self.monitor_dir = monitor_dir if not config: config = {} self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file + def start(self): self.observer = Observer() self.observer.schedule(self, self.monitor_dir) @@ -94,19 +96,19 @@ def check_for_new_pdf(self,ev_path): """ if ev_path.endswith(".pdf"): if not ev_path.endswith(("_ocr.pdf", "_test.pdf")): - PyPdfWatcher.events_lock.acquire() - if not ev_path in PyPdfWatcher.events: - PyPdfWatcher.events[ev_path] = time.time() + self.events_lock.acquire() + if not ev_path in self.events: + self.events[ev_path] = time.time() logging.info ("Adding %s to event queue" % ev_path) else: - if PyPdfWatcher.events[ev_path] == -1: + if self.events[ev_path] == -1: logging.info ( "%s removing from event queue" % (ev_path)) - del PyPdfWatcher.events[ev_path] + del self.events[ev_path] else: newTime = time.time() logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) - PyPdfWatcher.events[ev_path] = newTime - PyPdfWatcher.events_lock.release() + self.events[ev_path] = newTime + self.events_lock.release() @@ -133,19 +135,18 @@ def check_queue(self): :returns: Filename if available to process, otherwise None. 
""" now = time.time() - PyPdfWatcher.events_lock.acquire() - for monitored_file, timestamp in PyPdfWatcher.events.items(): - if timestamp == -1: - del PyPdfWatcher.events[monitored_file] - elif now - timestamp > self.scan_interval: + self.events_lock.acquire() + self.events = {file:ts for file, ts in self.events.items() if ts != -1} + for monitored_file, timestamp in self.events.items(): + if now - timestamp > self.scan_interval: logging.info("Processing new file %s" % (monitored_file)) # Remove this file from the dict - del PyPdfWatcher.events[monitored_file] + del self.events[monitored_file] monitored_file = self.rename_file_with_spaces(monitored_file) - PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler - PyPdfWatcher.events_lock.release() + self.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler + self.events_lock.release() return monitored_file - PyPdfWatcher.events_lock.release() + self.events_lock.release() return None diff --git a/test/test_watcher.py b/test/test_watcher.py index 1d3507e..62f64d2 100644 --- a/test/test_watcher.py +++ b/test/test_watcher.py @@ -19,20 +19,20 @@ class TestWatching: @patch('shutil.move') @pytest.mark.parametrize(("filename, expected"), filenames) - def test_rename(self, mock_move, filename, expected): + def test_rename(self, mock_move, filename, expected, tmpdir): if expected == None: expected = filename - p = P.PyPdfWatcher('temp',{}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")),{}) # First, test code that does not move original ret = p.rename_file_with_spaces(filename) assert (ret==expected) - def test_check_for_new_pdf(self): + def test_check_for_new_pdf(self, tmpdir): - p = P.PyPdfWatcher('temp', {}) + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) p.check_for_new_pdf("blah_ocr.pdf") assert("blah_ocr.pdf" not in p.events) p.check_for_new_pdf("blah.pdf") @@ -45,8 +45,8 @@ def 
test_check_for_new_pdf(self): p.check_for_new_pdf("blah.pdf") assert(p.events['blah.pdf']-time.time() <=1) # Check that time stamp was updated - def test_events(self): - p = P.PyPdfWatcher('temp', {}) + def test_events(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) event = namedtuple('event', 'src_path, dest_path') @@ -59,8 +59,9 @@ def test_events(self): p.on_modified(event(src_path='temp_recipe3.pdf', dest_path=None)) assert('temp_recipe3.pdf' in p.events) - def test_check_queue(self): - p = P.PyPdfWatcher('temp', {}) + def test_check_queue(self, tmpdir): + p = P.PyPdfWatcher(str(tmpdir.mkdir("tmp")), {}) + assert p.events == {} now = time.time() p.events['blah.pdf'] = now f = p.check_queue() From 3721af7f3b8a91f0111d9ebe3662cf1b0961e5a5 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 11:28:28 +0100 Subject: [PATCH 21/31] Update for py3 compatibility. * Include py3.5 and py3.6 testing in travis. * Run pytest directly instead of through `setup.py` and fabric. At this time fabric is py2 only, so an updated `runtests.py` can't be generated. --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5da555e..361eac3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,11 @@ language: python python: - "2.7" + - "3.5" + - "3.6" install: - "pip install -r requirements.txt --use-mirrors" - "pip install pytest mock --use-mirrors" - "pip install ." script: - - "python setup.py test" + - "pytest test" From 337a0cfa36ed65739f2aebbf9f7530778f1d5d40 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 11:37:07 +0100 Subject: [PATCH 22/31] Remove --use-mirrors flag from pip install commands This flag is no longer supported, and has been removed from current versions of pip. See https://github.com/travis-ci/docs-travis-ci-com/pull/10 for more details. 
--- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 361eac3..648cea4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,8 @@ python: - "3.5" - "3.6" install: - - "pip install -r requirements.txt --use-mirrors" - - "pip install pytest mock --use-mirrors" + - "pip install -r requirements.txt" + - "pip install pytest mock" - "pip install ." script: - "pytest test" From 43df00ecf619235fd7e11c539bf20c184a14dc55 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 12:04:01 +0100 Subject: [PATCH 23/31] Install dependencies in travis build environment - how was this working before? --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index 648cea4..d0282b4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,9 @@ python: - "2.7" - "3.5" - "3.6" +before-install: + - sudo apt -qq update + - sudo apt install -y tesseract-ocr ghostscript imagemagick install: - "pip install -r requirements.txt" - "pip install pytest mock" From c86a43055b72c6bbea6bd694bb4aa06304d17e61 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 16:18:48 +0100 Subject: [PATCH 24/31] Use parse_version instead of StrictVersion to maintain py2 compatibility --- pypdfocr/pypdfocr_tesseract.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 5452446..c8a66f9 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -23,11 +23,11 @@ import logging import subprocess import glob -from distutils.version import StrictVersion +from pkg_resources import parse_version from subprocess import CalledProcessError from multiprocessing import Pool -from pypdfocr.pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker def error(text): print("ERROR: %s" % text) @@ -80,11 +80,11 @@ def _is_version_uptodate(self): Make sure the version is current """ 
logging.info("Checking tesseract version") - cmd = '%s -v' % (self.binary) + cmd = [self.binary, '-v'] logging.info(cmd) try: ret_output = subprocess.check_output( - cmd, shell=True, encoding="utf-8", stderr=subprocess.STDOUT) + cmd, shell=True, stderr=subprocess.STDOUT) except CalledProcessError: # Could not run tesseract error(self.msgs['TS_MISSING']) @@ -93,14 +93,12 @@ def _is_version_uptodate(self): for line in ret_output.splitlines(): if 'tesseract' in line: ver_str = line.split(' ')[1] - if ver_str.endswith('dev'): # Fix for version strings that end in 'dev' - ver_str = ver_str[:-3] # Aargh, in windows 3.02.02 is reported as version 3.02 if str(os.name) == 'nt': req = self.required[:-3] else: req = self.required - return (StrictVersion(ver_str) >= StrictVersion(req)), ver_str + return (parse_version(ver_str) >= parse_version(req)), ver_str def _warn(self, msg): # pragma: no cover print("WARNING: %s" % msg) From 397ee107bc6659be7743f8014d102a7bf825656f Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 16:27:56 +0100 Subject: [PATCH 25/31] Use universal newlines to ensure output is string not bytes in both py2 and py3 --- pypdfocr/pypdfocr_tesseract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index c8a66f9..23783c0 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -84,7 +84,7 @@ def _is_version_uptodate(self): logging.info(cmd) try: ret_output = subprocess.check_output( - cmd, shell=True, stderr=subprocess.STDOUT) + cmd, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) except CalledProcessError: # Could not run tesseract error(self.msgs['TS_MISSING']) From 9c8a44cd3b7336832e1b8eacff1b3ce3da16b8c8 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 16:45:01 +0100 Subject: [PATCH 26/31] Use explicit relative imports for best py2 & py3 support It seems that py2 wasn't happy with purely absolute imports, whilst 
py3 isn't happy with purely relative imports. Using explicit relative imports seems to be working in both py2 and py3. --- pypdfocr/pypdfocr.py | 20 ++++++++++---------- pypdfocr/pypdfocr_filer_dirs.py | 2 +- pypdfocr/pypdfocr_filer_evernote.py | 2 +- pypdfocr/pypdfocr_pdf.py | 2 +- pypdfocr/pypdfocr_pdffiler.py | 4 ++-- pypdfocr/pypdfocr_preprocess.py | 2 +- pypdfocr/pypdfocr_tesseract.py | 4 +++- test/test_gs.py | 2 +- test/test_option_parsing.py | 2 +- test/test_pdf_filer.py | 2 +- test/test_pypdfocr.py | 2 +- test/test_tesseract.py | 2 +- test/test_watcher.py | 2 +- 13 files changed, 25 insertions(+), 23 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 8dcf181..cd9fb9e 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -21,7 +21,7 @@ import itertools from functools import wraps -from pypdfocr.version import __version__ +from .version import __version__ from PIL import Image import yaml @@ -40,17 +40,17 @@ except ImportError: import multiprocessing.forking as forking -from pypdfocr.pypdfocr_multiprocessing import _Popen +from .pypdfocr_multiprocessing import _Popen forking.Popen = _Popen -from pypdfocr.pypdfocr_pdf import PyPdf -from pypdfocr.pypdfocr_tesseract import PyTesseract -from pypdfocr.pypdfocr_gs import PyGs -from pypdfocr.pypdfocr_watcher import PyPdfWatcher -from pypdfocr.pypdfocr_pdffiler import PyPdfFiler -from pypdfocr.pypdfocr_filer_dirs import PyFilerDirs -from pypdfocr.pypdfocr_filer_evernote import PyFilerEvernote -from pypdfocr.pypdfocr_preprocess import PyPreprocess +from .pypdfocr_pdf import PyPdf +from .pypdfocr_tesseract import PyTesseract +from .pypdfocr_gs import PyGs +from .pypdfocr_watcher import PyPdfWatcher +from .pypdfocr_pdffiler import PyPdfFiler +from .pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer_evernote import PyFilerEvernote +from .pypdfocr_preprocess import PyPreprocess def error(text): print("ERROR: %s" % text) diff --git a/pypdfocr/pypdfocr_filer_dirs.py 
b/pypdfocr/pypdfocr_filer_dirs.py index e6d1505..c7dc73f 100644 --- a/pypdfocr/pypdfocr_filer_dirs.py +++ b/pypdfocr/pypdfocr_filer_dirs.py @@ -16,7 +16,7 @@ import os import shutil -from pypdfocr.pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler """ Implementation of a filer class diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 666b874..34ce561 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -19,7 +19,7 @@ import time import sys -from pypdfocr.pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler import functools diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index a8ae5d3..8438b38 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -51,7 +51,7 @@ from reportlab.lib.enums import TA_LEFT from reportlab.platypus.paragraph import Paragraph -from pypdfocr.pypdfocr_util import Retry +from .pypdfocr_util import Retry from functools import partial class RotatedPara(Paragraph): diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 7b1c05d..dea839b 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -24,8 +24,8 @@ import shutil from PyPDF2 import PdfFileReader -from pypdfocr.pypdfocr_filer import PyFiler -from pypdfocr.pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer import PyFiler +from .pypdfocr_filer_dirs import PyFilerDirs class PyPdfFiler(object): def __init__(self, filer): diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index 4775089..34ed89a 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -28,7 +28,7 @@ import signal from multiprocessing import Pool -from pypdfocr.pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker # Ugly hack to pass in object method to the multiprocessing library # From 
http://www.rueckstiess.net/research/snippets/show/ca1d7d90 diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 23783c0..1cfb9f2 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -80,7 +80,7 @@ def _is_version_uptodate(self): Make sure the version is current """ logging.info("Checking tesseract version") - cmd = [self.binary, '-v'] + cmd = "%s -v" % self.binary logging.info(cmd) try: ret_output = subprocess.check_output( @@ -91,6 +91,7 @@ def _is_version_uptodate(self): ver_str = '0.0.0' for line in ret_output.splitlines(): + print(line) if 'tesseract' in line: ver_str = line.split(' ')[1] # Aargh, in windows 3.02.02 is reported as version 3.02 @@ -98,6 +99,7 @@ def _is_version_uptodate(self): req = self.required[:-3] else: req = self.required + print(ver_str) return (parse_version(ver_str) >= parse_version(req)), ver_str def _warn(self, msg): # pragma: no cover diff --git a/test/test_gs.py b/test/test_gs.py index 94d1130..9bebb57 100644 --- a/test/test_gs.py +++ b/test/test_gs.py @@ -1,5 +1,5 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_gs as P +from pypdfocr import pypdfocr_gs as P import pytest import os diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index 3b16bf3..5984dc5 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -1,6 +1,6 @@ import os -import pypdfocr.pypdfocr as P +from pypdfocr import pypdfocr as P import pytest diff --git a/test/test_pdf_filer.py b/test/test_pdf_filer.py index c85c1fa..bdea966 100644 --- a/test/test_pdf_filer.py +++ b/test/test_pdf_filer.py @@ -1,5 +1,5 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr as P +from pypdfocr import pypdfocr as P import os from mock import patch, call diff --git a/test/test_pypdfocr.py b/test/test_pypdfocr.py index 740e1f1..21d1c94 100644 --- a/test/test_pypdfocr.py +++ b/test/test_pypdfocr.py @@ -1,5 +1,5 @@ #from pypdfocr import PyPDFOCR as P -import 
pypdfocr.pypdfocr as P +from pypdfocr import pypdfocr as P import pytest import os import logging diff --git a/test/test_tesseract.py b/test/test_tesseract.py index 04757f4..5f10131 100644 --- a/test/test_tesseract.py +++ b/test/test_tesseract.py @@ -1,5 +1,5 @@ #from pypdfocr import PyPDFOCR as P -import pypdfocr.pypdfocr_tesseract as P +from pypdfocr import pypdfocr_tesseract as P import pytest import os diff --git a/test/test_watcher.py b/test/test_watcher.py index 62f64d2..a7ae6ce 100644 --- a/test/test_watcher.py +++ b/test/test_watcher.py @@ -1,4 +1,4 @@ -import pypdfocr.pypdfocr_watcher as P +from pypdfocr import pypdfocr_watcher as P import pytest import time From 604ff5e3461ec6fefc6fc8f415d7ec2b13da8fef Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 16:59:44 +0100 Subject: [PATCH 27/31] Also use universal newlines in other instances of subprocess --- pypdfocr/pypdfocr_gs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 21ff3df..1477847 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -171,7 +171,7 @@ def _run_gs(self, options, output_filename, pdf_filename): try: cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename) logging.info(cmd) - out = subprocess.check_output(cmd, shell=True) + out = subprocess.check_output(cmd, shell=True, universal_newlines=True) except subprocess.CalledProcessError as e: print(e.output) From a632f4aac200ebc4279a11e05a0b17d970a1efb7 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 17:00:24 +0100 Subject: [PATCH 28/31] Use apt-get - apt may be too new for the travis images --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d0282b4..9bb48c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,8 @@ python: - "3.5" - "3.6" before-install: - - sudo apt -qq update - - sudo apt install -y 
tesseract-ocr ghostscript imagemagick + - sudo apt-get -qq update + - sudo apt-get install -y tesseract-ocr ghostscript imagemagick install: - "pip install -r requirements.txt" - "pip install pytest mock" From ce79aa9ec8ba2c291bec8a44fe1106f1913e75a0 Mon Sep 17 00:00:00 2001 From: Ben S Date: Fri, 13 Oct 2017 17:01:29 +0100 Subject: [PATCH 29/31] Should be before_install, not before-install --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9bb48c7..06dd8be 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ python: - "2.7" - "3.5" - "3.6" -before-install: +before_install: - sudo apt-get -qq update - sudo apt-get install -y tesseract-ocr ghostscript imagemagick install: From 3ecf58b5874d782e22b9f07f905f18eb524290f1 Mon Sep 17 00:00:00 2001 From: Ben S Date: Wed, 18 Oct 2017 22:36:27 +0100 Subject: [PATCH 30/31] Re-enable evernote filing in python2 This reenables evernote support in python2 whilst keeping python3 functionality for all other aspects. 
--- pypdfocr/pypdfocr.py | 9 +- pypdfocr/pypdfocr_filer_evernote.py | 18 ++- requirements.txt | 2 +- test/test_evernote.py | 234 +++++++++++++++------------- test/test_option_parsing.py | 22 +++ 5 files changed, 165 insertions(+), 120 deletions(-) diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index cd9fb9e..a95cb66 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -49,6 +49,7 @@ from .pypdfocr_watcher import PyPdfWatcher from .pypdfocr_pdffiler import PyPdfFiler from .pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer_evernote import ENABLED as evernote_enabled from .pypdfocr_filer_evernote import PyFilerEvernote from .pypdfocr_preprocess import PyPreprocess @@ -179,7 +180,7 @@ def get_options(self, argv): filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', - default=False, dest='enable_evernote', help='Enable filing to Evernote') + default=False, dest='enable_evernote', help='Enable filing to Evernote.') filing_group.add_argument('-n', action='store_true', default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') @@ -218,7 +219,11 @@ def get_options(self, argv): logging.debug("Read in configuration file") logging.debug(self.config) - if args.enable_evernote: + # Evernote filing does not work in py3 + if args.enable_evernote and not evernote_enabled: + print("Warning: Evernote filing disabled, could not find evernote API. 
Evernote not available in py3.") + self.enable_evernote = False + elif args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 34ce561..9064415 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -23,13 +23,17 @@ import functools -# from evernote.api.client import EvernoteClient -# import evernote.edam.type.ttypes as Types -# import evernote.edam.userstore.constants as UserStoreConstants -# from evernote.edam.error.ttypes import EDAMUserException -# from evernote.edam.error.ttypes import EDAMSystemException -# from evernote.edam.error.ttypes import EDAMNotFoundException -# from evernote.edam.error.ttypes import EDAMErrorCode +try: + from evernote.api.client import EvernoteClient + import evernote.edam.type.ttypes as Types + import evernote.edam.userstore.constants as UserStoreConstants + from evernote.edam.error.ttypes import EDAMUserException + from evernote.edam.error.ttypes import EDAMSystemException + from evernote.edam.error.ttypes import EDAMNotFoundException + from evernote.edam.error.ttypes import EDAMErrorCode + ENABLED = True +except ImportError: + ENABLED = False """ diff --git a/requirements.txt b/requirements.txt index 232bfd7..c086189 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ pillow>=2.2 reportlab>=2.7 watchdog>=0.6.0 pypdf2>=1.23 -# evernote +evernote; python_version < '3' diff --git a/test/test_evernote.py b/test/test_evernote.py index 1e08f58..9afbf02 100644 --- a/test/test_evernote.py +++ b/test/test_evernote.py @@ -1,123 +1,137 @@ -# #from pypdfocr import PyPDFOCR as P -# import pypdfocr.pypdfocr_filer_evernote as P -# import pytest -# import os - -# import evernote.api.client -# import evernote.edam.type.ttypes as Types -# import hashlib - -# from mock import patch, call - -# class TestEvernote: - -# def test_connecct(self): -# # Tricky mocking. 
Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# inst = mock_evernote_client.return_value -# assert(inst.get_user_store.called) - -# @patch('shutil.move') -# def test_file_original(self, mock_move): -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# filename = os.path.join("pdfs","test_recipe.pdf") - -# # First, test code that does not move original -# p.file_original(filename) -# assert (not mock_move.called) - -# # Now test moving -# p.set_original_move_folder(os.path.join("temp", "original")) -# p.file_original(filename) -# mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf")) - -# @patch('os.remove') -# def test_move_to_folder(self, mock_remove): -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# filename = os.path.join("pdfs", "test_recipe.pdf") -# foldername = 'recipe' -# with pytest.raises(AssertionError): -# p.move_to_matching_folder(filename, foldername) -# p.set_target_folder('target') -# with pytest.raises(AssertionError): -# p.move_to_matching_folder(filename, foldername) -# p.set_default_folder('default') -# p.move_to_matching_folder(filename, None) -# p.move_to_matching_folder(filename, foldername) +#from pypdfocr import PyPDFOCR as P +import pypdfocr.pypdfocr_filer_evernote as P +import pytest +import os +import sys + +if sys.version_info.major == 2: + import evernote.api.client + import evernote.edam.type.ttypes as Types +import hashlib + +from mock import patch, call + + +def test_import(): + """Evernote filing enabled for py2 only""" + expect_enabled = sys.version_info.major == 2 + assert P.ENABLED == expect_enabled + + +@pytest.mark.skipif(sys.version_info.major>=3, + reason="Evernote API not compatible with 
py3.") +class TestEvernote: + + def test_connecct(self): + # Tricky mocking. Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + inst = mock_evernote_client.return_value + assert(inst.get_user_store.called) + + @patch('shutil.move') + def test_file_original(self, mock_move): + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs","test_recipe.pdf") + + # First, test code that does not move original + p.file_original(filename) + assert (not mock_move.called) + + # Now test moving + p.set_original_move_folder(os.path.join(filepath, "temp", "original")) + p.file_original(filename) + mock_move.assert_called_with(filename, os.path.join(filepath, "temp","original", "test_recipe_2.pdf")) + + @patch('os.remove') + def test_move_to_folder(self, mock_remove): + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs", "test_recipe.pdf") + foldername = os.path.join(filepath, 'recipe') + with pytest.raises(AssertionError): + p.move_to_matching_folder(filename, foldername) + p.set_target_folder('target') + with pytest.raises(AssertionError): + p.move_to_matching_folder(filename, foldername) + p.set_default_folder('default') + p.move_to_matching_folder(filename, None) + p.move_to_matching_folder(filename, foldername) -# mock_client = mock_evernote_client.return_value -# assert(mock_client.get_note_store.called) -# assert(mock_client.get_note_store.return_value.createNote.called) -# mock_remove.assert_called_with(filename) + mock_client = mock_evernote_client.return_value + assert(mock_client.get_note_store.called) + 
assert(mock_client.get_note_store.return_value.createNote.called) + mock_remove.assert_called_with(filename) -# def test_create_note(self): -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# notebook = Types.Notebook() -# notebook.name = "recipe" -# filename = "pdfs/test_recipe.pdf" -# note = p._create_evernote_note(notebook, filename) -# xml = '' -# assert(note.content.startswith(xml)) + def test_create_note(self): + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + notebook = Types.Notebook() + notebook.name = "recipe" + filepath = os.path.dirname(__file__) + filename = os.path.join(filepath, "pdfs/test_recipe.pdf") + note = p._create_evernote_note(notebook, filename) + xml = '' + assert(note.content.startswith(xml)) -# md5 = hashlib.md5() -# with open(filename,'rb') as f: -# pdf_bytes = f.read() -# md5.update(pdf_bytes) + md5 = hashlib.md5() + with open(filename,'rb') as f: + pdf_bytes = f.read() + md5.update(pdf_bytes) -# md5hash = md5.hexdigest() + md5hash = md5.hexdigest() -# assert(md5hash in note.content) -# assert(note.resources[0].data.bodyHash == md5hash) - - -# def test_check_notebook(self): -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# p._check_and_make_notebook("new_notebook") -# # Let's assert that we tried to create a new notebook -# mock_client = mock_evernote_client.return_value -# assert(mock_client.get_note_store.called) -# create_func = mock_client.get_note_store.return_value.createNotebook -# update_func = mock_client.get_note_store.return_value.updateNotebook -# assert(create_func.called) -# assert(not update_func.called) -# notebook = create_func.call_args[0][0] -# assert(notebook.name == 'new_notebook') - -# # Now, let's setup a value for the notebooks, so we test the code for -# # a "pre-exisiting" notebook -# 
test_notebook = Types.Notebook() -# test_notebook.name = "new_notebook" -# mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook] -# p._check_and_make_notebook("new_notebook") - -# # Now check that the code to update a notebook stack is correct -# test_notebook.stack = "new_stack" -# update_func = mock_client.get_note_store.return_value.updateNotebook -# p.set_target_folder("Boogie") -# p._check_and_make_notebook("new_notebook") -# # Check that the update call was called with correct arguments -# assert(update_func.called) -# notebook = update_func.call_args[0][0] -# assert(notebook.stack == 'Boogie') + assert(md5hash in note.content) + assert(note.resources[0].data.bodyHash == md5hash) + + + def test_check_notebook(self): + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + p._check_and_make_notebook("new_notebook") + # Let's assert that we tried to create a new notebook + mock_client = mock_evernote_client.return_value + assert(mock_client.get_note_store.called) + create_func = mock_client.get_note_store.return_value.createNotebook + update_func = mock_client.get_note_store.return_value.updateNotebook + assert(create_func.called) + assert(not update_func.called) + notebook = create_func.call_args[0][0] + assert(notebook.name == 'new_notebook') + + # Now, let's setup a value for the notebooks, so we test the code for + # a "pre-exisiting" notebook + test_notebook = Types.Notebook() + test_notebook.name = "new_notebook" + mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook] + p._check_and_make_notebook("new_notebook") + + # Now check that the code to update a notebook stack is correct + test_notebook.stack = "new_stack" + update_func = mock_client.get_note_store.return_value.updateNotebook + p.set_target_folder("Boogie") + p._check_and_make_notebook("new_notebook") + # Check that the update call was called with correct arguments + 
assert(update_func.called) + notebook = update_func.call_args[0][0] + assert(notebook.stack == 'Boogie') -# def test_add_folder_target(self): -# with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: -# p = P.PyFilerEvernote("TOKEN") -# p.add_folder_target("folder1", ["target1", "target2"]) -# with pytest.raises(AssertionError): -# p.add_folder_target("folder1", ["target1", "target2"]) -# p.add_folder_target("folder2", ["target1", "target2"]) -# assert("folder1" in p.folder_targets.keys()) -# assert("folder2" in p.folder_targets.keys()) + def test_add_folder_target(self): + with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client: + p = P.PyFilerEvernote("TOKEN") + p.add_folder_target("folder1", ["target1", "target2"]) + with pytest.raises(AssertionError): + p.add_folder_target("folder1", ["target1", "target2"]) + p.add_folder_target("folder2", ["target1", "target2"]) + assert("folder1" in p.folder_targets.keys()) + assert("folder2" in p.folder_targets.keys()) diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index 5984dc5..974f93d 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -1,4 +1,5 @@ import os +import sys from pypdfocr import pypdfocr as P import pytest @@ -45,6 +46,8 @@ def test_standalone_filing(self): assert(self.p.enable_filing) assert(self.p.config) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_standalone_filing_evernote(self): # Check when evernote is enabled opts = ["blah.pdf"] @@ -70,6 +73,23 @@ def test_standalone_filing_evernote(self): assert(self.p.config) assert(not self.p.watch) + @pytest.mark.skipif(sys.version_info.major==2, + reason="Evernote works on py2") + def test_evernote_disabled(self): + opts = ["blah.pdf"] + opts.append('-e') + # Assert that it checks that the config file is present + with pytest.raises(SystemExit): + self.p.get_options(opts) + + conf_path = os.path.join( + 
os.path.dirname(__file__), 'test_option_config.yaml') + opts.append('--config={}'.format(conf_path)) + self.p.get_options(opts) + # Enabling -e should turn on filing too + assert(self.p.enable_filing) + assert not self.p.enable_evernote + def test_standalone_watch_conflict(self): # When pdf file is specified, we don't want to allow watch option opts = ["blah.pdf", '-w'] @@ -94,6 +114,8 @@ def test_watch_filing(self): assert(not self.p.enable_filing) assert(not self.p.enable_evernote) + @pytest.mark.skipif(sys.version_info.major>2, + reason="Evernote disabled for py3") def test_watch_filing_evernote(self): conf_path = os.path.join( os.path.dirname(__file__), 'test_option_config.yaml') From fc5890b6e55d3c7c2e28bc6302bdb2c0b3091453 Mon Sep 17 00:00:00 2001 From: Ben Secretan Date: Mon, 23 Oct 2017 11:33:56 +0100 Subject: [PATCH 31/31] Don't check for filing being enabled by evernote flag if evernote is disabled --- test/test_option_parsing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_option_parsing.py b/test/test_option_parsing.py index 974f93d..c97e39d 100644 --- a/test/test_option_parsing.py +++ b/test/test_option_parsing.py @@ -86,8 +86,6 @@ def test_evernote_disabled(self): os.path.dirname(__file__), 'test_option_config.yaml') opts.append('--config={}'.format(conf_path)) self.p.get_options(opts) - # Enabling -e should turn on filing too - assert(self.p.enable_filing) assert not self.p.enable_evernote def test_standalone_watch_conflict(self):