Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
765c9e8
Remove Evernote from requirements, not currently available for py3
Oct 12, 2017
8ffb08c
Comment out Evernote tests
Oct 12, 2017
822582c
Update print to function calls for py3 compatibility
Oct 12, 2017
dffacba
Remove unused imports from test files
Oct 12, 2017
53fc18c
Replace tabs with spaces
Oct 12, 2017
2a09669
Replace tabs with spaces
Oct 12, 2017
ad019dd
Update print to function calls for py3 compatibility
Oct 12, 2017
bba9c8f
Update imports for py3
Oct 12, 2017
82c9649
Comment out evernote imports, not working for py3 at this time
Oct 12, 2017
116bd7f
Remove unused imports that didn't work in py3
Oct 12, 2017
c1a6296
Update tesseract module for py3 compatibility
Oct 12, 2017
8a41145
Make path to test config files more absolute
Oct 12, 2017
1752f65
Make path to test config files more absolute
Oct 12, 2017
78fe57f
Re-raise original error message
Oct 12, 2017
dbdf457
Update dict key checking to py3 compatible syntax
Oct 12, 2017
4790d2e
Wrap zip function in list calls for py3 compatibility
Oct 12, 2017
9a0d318
More updates for py3 compatibility
Oct 12, 2017
2617352
Update test paths to be absolute instead of relative when testing
Oct 12, 2017
7a9198d
More paths made absolute to allow tests to be run more easily and fro…
Oct 13, 2017
68e6de1
Make queue checking more robust in watcher
Oct 13, 2017
3721af7
Update for py3 compatibility.
Oct 13, 2017
337a0cf
Remove --use-mirrors flag from pip install commands
Oct 13, 2017
43df00e
Install dependencies in travis build environment - how was this worki…
Oct 13, 2017
c86a430
Use parse_version instead of StrictVersion to maintain py2 compatibility
Oct 13, 2017
397ee10
Use universal newlines to ensure output is string not bytes in both p…
Oct 13, 2017
9c8a44c
Use explicit relative imports for best py2 & py3 support
Oct 13, 2017
604ff5e
Also use universal newlines in other instances of subprocess
Oct 13, 2017
a632f4a
Use apt-get - apt may be too new for the travis images
Oct 13, 2017
ce79aa9
Should be before_install, not before-install
Oct 13, 2017
3ecf58b
Re-enable evernote filing in python2
Oct 18, 2017
fc5890b
Don't check for filing being enabled by evernote flag if evernote is …
Oct 23, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
language: python
python:
- "2.7"
- "3.5"
- "3.6"
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y tesseract-ocr ghostscript imagemagick
install:
- "pip install -r requirements.txt --use-mirrors"
- "pip install pytest mock --use-mirrors"
- "pip install -r requirements.txt"
- "pip install pytest mock"
- "pip install ."
script:
- "python setup.py test"
- "pytest test"
61 changes: 40 additions & 21 deletions pypdfocr/pypdfocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,37 @@
import itertools
from functools import wraps

from version import __version__
from .version import __version__
from PIL import Image
import yaml

import multiprocessing
# Replace the Popen routine to allow win32 pyinstaller to build
from multiprocessing import forking
from pypdfocr_multiprocessing import _Popen

""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systms

https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
"""
try:
# Python 3.4+
if sys.platform.startswith('win'):
import multiprocessing.popen_spawn_win32 as forking
else:
import multiprocessing.popen_fork as forking
except ImportError:
import multiprocessing.forking as forking

from .pypdfocr_multiprocessing import _Popen
forking.Popen = _Popen

from pypdfocr_pdf import PyPdf
from pypdfocr_tesseract import PyTesseract
from pypdfocr_gs import PyGs
from pypdfocr_watcher import PyPdfWatcher
from pypdfocr_pdffiler import PyPdfFiler
from pypdfocr_filer_dirs import PyFilerDirs
from pypdfocr_filer_evernote import PyFilerEvernote
from pypdfocr_preprocess import PyPreprocess
from .pypdfocr_pdf import PyPdf
from .pypdfocr_tesseract import PyTesseract
from .pypdfocr_gs import PyGs
from .pypdfocr_watcher import PyPdfWatcher
from .pypdfocr_pdffiler import PyPdfFiler
from .pypdfocr_filer_dirs import PyFilerDirs
from .pypdfocr_filer_evernote import ENABLED as evernote_enabled
from .pypdfocr_filer_evernote import PyFilerEvernote
from .pypdfocr_preprocess import PyPreprocess

def error(text):
print("ERROR: %s" % text)
Expand All @@ -49,12 +62,14 @@ def retry(count=5, exc_type = Exception):
def decorator(func):
@wraps(func)
def result(*args, **kwargs):
err = None
for _ in range(count):
try:
return func(*args, **kwargs)
except exc_type:
pass
raise
except exc_type as e:
err = e
else:
raise err
return result
return decorator

Expand Down Expand Up @@ -161,11 +176,11 @@ def get_options(self, argv):
filing_group = p.add_argument_group(title="Filing optinos")
filing_group.add_argument('-f', '--file', action='store_true',
default=False, dest='enable_filing', help='Enable filing of converted PDFs')
#filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
# filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x),
dest='configfile', help='Configuration file for defaults and PDF filing')
filing_group.add_argument('-e', '--evernote', action='store_true',
default=False, dest='enable_evernote', help='Enable filing to Evernote')
default=False, dest='enable_evernote', help='Enable filing to Evernote.')
filing_group.add_argument('-n', action='store_true',
default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')

Expand Down Expand Up @@ -204,7 +219,11 @@ def get_options(self, argv):
logging.debug("Read in configuration file")
logging.debug(self.config)

if args.enable_evernote:
# Evernote filing does not work in py3
if args.enable_evernote and not evernote_enabled:
print("Warning: Evernote filing disabled, could not find evernote API. Evernote not available in py3.")
self.enable_evernote = False
elif args.enable_evernote:
self.enable_evernote = True
else:
self.enable_evernote = False
Expand Down Expand Up @@ -367,11 +386,11 @@ def run_conversion(self, pdf_filename):
time.sleep(1)
if not self.debug:
# Need to clean up the original image files before preprocessing
if locals().has_key("fns"): # Have to check if this was set before exception raised
if "fns" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % fns)
self._clean_up_files(fns)

if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised
if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % preprocess_imagefilenames)
self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
for ext in [".hocr", ".html", ".txt"]:
Expand Down Expand Up @@ -467,7 +486,7 @@ def go(self, argv):
except KeyboardInterrupt:
break
except Exception as e:
print traceback.print_exc(e)
print(traceback.print_exc(e))
py_watcher.stop()

else:
Expand Down
2 changes: 1 addition & 1 deletion pypdfocr/pypdfocr_filer_dirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import shutil

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

"""
Implementation of a filer class
Expand Down
20 changes: 12 additions & 8 deletions pypdfocr/pypdfocr_filer_evernote.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,21 @@
import time
import sys

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

import functools

from evernote.api.client import EvernoteClient
import evernote.edam.type.ttypes as Types
import evernote.edam.userstore.constants as UserStoreConstants
from evernote.edam.error.ttypes import EDAMUserException
from evernote.edam.error.ttypes import EDAMSystemException
from evernote.edam.error.ttypes import EDAMNotFoundException
from evernote.edam.error.ttypes import EDAMErrorCode
try:
from evernote.api.client import EvernoteClient
import evernote.edam.type.ttypes as Types
import evernote.edam.userstore.constants as UserStoreConstants
from evernote.edam.error.ttypes import EDAMUserException
from evernote.edam.error.ttypes import EDAMSystemException
from evernote.edam.error.ttypes import EDAMNotFoundException
from evernote.edam.error.ttypes import EDAMErrorCode
ENABLED = True
except ImportError:
ENABLED = False


"""
Expand Down
26 changes: 13 additions & 13 deletions pypdfocr/pypdfocr_gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,21 +92,21 @@ def _find_windows_gs(self):
listing = os.listdir('.')

# Find all possible gs* sub-directories
listing = [x for x in listing if x.startswith('gs')]
listing = [x for x in listing if x.startswith('gs')]

# TODO: Make this a natural sort
listing.sort(reverse=True)
for bindir in listing:
binpath = os.path.join(bindir,'bin')
if not os.path.exists(binpath): continue
os.chdir(binpath)
for bindir in listing:
binpath = os.path.join(bindir,'bin')
if not os.path.exists(binpath): continue
os.chdir(binpath)
# Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
gswin = glob.glob('gswin*c.exe')
if len(gswin) == 0:
continue
gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
os.chdir(cwd)
return gs
gswin = glob.glob('gswin*c.exe')
if len(gswin) == 0:
continue
gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
os.chdir(cwd)
return gs

if not gs:
error(self.msgs['GS_MISSING_BINARY'])
Expand Down Expand Up @@ -171,10 +171,10 @@ def _run_gs(self, options, output_filename, pdf_filename):
try:
cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
logging.info(cmd)
out = subprocess.check_output(cmd, shell=True)
out = subprocess.check_output(cmd, shell=True, universal_newlines=True)

except subprocess.CalledProcessError as e:
print e.output
print(e.output)
if "undefined in .getdeviceparams" in e.output:
error(self.msgs['GS_OUTDATED'])
else:
Expand Down
16 changes: 11 additions & 5 deletions pypdfocr/pypdfocr_multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import sys, os, multiprocessing.forking
import logging
import os
import sys

""" Special work-around to support multiprocessing and pyinstaller --onefile on windows systms

https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
"""
try:
# Python 3.4+
if sys.platform.startswith('win'):
import multiprocessing.popen_spawn_win32 as forking
else:
import multiprocessing.popen_fork as forking
except ImportError:
import multiprocessing.forking as forking

import multiprocessing.forking as forking
import os
import sys

class _Popen(multiprocessing.forking.Popen):
class _Popen(forking.Popen):
def __init__(self, *args, **kw):
if hasattr(sys, 'frozen'):
# We have to set original _MEIPASS2 value from sys._MEIPASS
Expand Down
8 changes: 4 additions & 4 deletions pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import tempfile
import glob

import cStringIO
import base64
import zlib
import math
Expand All @@ -52,7 +51,7 @@
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from .pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
Expand Down Expand Up @@ -152,10 +151,11 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
merger = PdfFileMerger()
for text_pdf_filename in text_pdf_filenames:
merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
with open(text_pdf_filename, 'rb') as f:
merger.append(PdfFileReader(f))
merger.write(all_text_filename)
merger.close()
del merger
del merger


writer = PdfFileWriter()
Expand Down
17 changes: 8 additions & 9 deletions pypdfocr/pypdfocr_pdffiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,14 @@
on keywords
"""

from sets import Set
import sys, os
import re
import logging
import shutil

from PyPDF2 import PdfFileReader
from pypdfocr_filer import PyFiler
from pypdfocr_filer_dirs import PyFilerDirs
from .pypdfocr_filer import PyFiler
from .pypdfocr_filer_dirs import PyFilerDirs

class PyPdfFiler(object):
def __init__(self, filer):
Expand All @@ -36,15 +35,15 @@ def __init__(self, filer):

# Whether to fall back on filename for matching keywords against
# if there is no match in the text
self.file_using_filename = False
self.file_using_filename = False

def iter_pdf_page_text(self, filename):
self.filename = filename
reader = PdfFileReader(filename)
logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
for pgnum in range(reader.getNumPages()):
text = reader.getPage(pgnum).extractText()
text = text.encode('ascii', 'ignore')
# text = text.encode('ascii', 'ignore')
text = text.replace('\n', ' ')
yield text

Expand All @@ -56,10 +55,10 @@ def _get_matching_folder(self, pdfText):
if s in searchText:
logging.info("Matched keyword '%s'" % s)
return folder
# No match found, so return
# No match found, so return
return None

def file_original (self, original_filename):
def file_original(self, original_filename):
return self.filer.file_original(original_filename)

def move_to_matching_folder(self, filename):
Expand All @@ -72,9 +71,9 @@ def move_to_matching_folder(self, filename):

tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
return tgt_file

if __name__ == '__main__':
p = PyPdfFiler(PyFilerDirs())
for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
print (page_text)
print(page_text)

8 changes: 4 additions & 4 deletions pypdfocr/pypdfocr_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import signal

from multiprocessing import Pool
from pypdfocr_interrupts import init_worker
from .pypdfocr_interrupts import init_worker

# Ugly hack to pass in object method to the multiprocessing library
# From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
Expand Down Expand Up @@ -58,7 +58,7 @@ def cmd(self, cmd_list):
logging.debug(out)
return out
except subprocess.CalledProcessError as e:
print e.output
print(e.output)
self._warn("Could not run command %s" % cmd_list)


Expand Down Expand Up @@ -102,14 +102,14 @@ def preprocess(self, in_filenames):
logging.info("Starting preprocessing parallel execution")
preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
pool.close()
except KeyboardInterrupt or Exception:
except (KeyboardInterrupt, Exception):
print("Caught keyboard interrupt... terminating")
pool.terminate()
#sys,exit(-1)
raise
finally:
pool.join()
logging.info ("Completed preprocessing")
logging.info("Completed preprocessing")

return preprocessed_filenames

Expand Down
Loading