diff --git a/AUTHORS.rst b/AUTHORS.rst index c64eeff..8422d16 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,4 @@ Development Lead Contributors ------------ -None yet. Why not be the first? \ No newline at end of file +* Dale Furrow \ No newline at end of file diff --git a/CHANGES.rst b/CHANGES.rst index ecceb3b..eb4c776 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,7 @@ Version Date Changes ------- -------- ------ +v0.4.0 9/22/16 Add deskew, text-recognition, gui-dialog for save, multithreading v0.3.0 8/25/14 Allow arbitrary page sizes and auto-crops v0.1.0 1/1/14 First release ======= ======== ====== diff --git a/README.rst b/README.rst index 1981e25..87d31e4 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,9 @@ Features * `Integrates with ScanBd `_ to respond to hardware button presses * Automatically removes blank pages. * Scans in color, and automatically down-converts into 1-bit B/W image for text/greyscale images -* Auto-crops to the proper page size. +* (optionally) Auto-crops to the proper page size. +* (optionally) applies unpaper formatting to finished images +* (optionally) applies pdf-sandwich text-recognition to finished pdf Usage: ------ @@ -37,6 +39,10 @@ The simplest way to use this is: scanpdf scan pdf + Or alternatively + + scanpdf scan pdf (To bring up a file-save dialog to direct the finished pdf file.) + This will first perform the scan, and then the conversion to PDF. 
If you want to split up the scan and the PDF conversion into two separate invocations (for reasons clarified below), then you can do: @@ -64,15 +70,19 @@ additional post-processing using unpaper_: :: --dpi= DPI to scan in [default: 300] + --device= Scanning device (sub '%' for spaces) + --crop Run ImageMagick cropping routine + --tmpdir= Temporary directory + --keep-tmpdir Whether to keep the tmp dir after scanning or not [default: False] --face-up= Face-up scanning [default: True] --keep-blanks Don't check for and remove blank pages - --blank-threshold= Percentage of white to be marked as blank [default: 0.97] - --post-process Run unpaper to deskew/clean up - + --blank-threshold= Percentage of white to be marked as blank [default: 0.97] + --post-process Process finished images with unpaper + --text-recognize Run pdfsandwich for text recognition -Right now, I'm assuming this is getting called via ScanBD, so I don't have the option to manually specify the -scanner. If you really want to use this standalone, for now, please just set the ``SCANBD_DEVICE`` environment -variable to your scanner device name before running this script. +Right now, I'm assuming this is getting called via ScanBD, so I don't have the option to manually specify the +scanner. If you really want to use this standalone, for now, either set the --device option or just set the + ``SCANBD_DEVICE`` environment variable to your scanner device name before running this script. Installation @@ -87,6 +97,8 @@ Requires ImageMagick and SANE to be installed, for the command line tools: * ``identify`` * ``ps2pdf`` * ``scanadf`` +* ``unpaper`` +* ``pdfsandwich`` Also requires epstopdf. 
diff --git a/TODO.rst b/TODO.rst index 4ad1d1c..3450a1f 100644 --- a/TODO.rst +++ b/TODO.rst @@ -2,5 +2,5 @@ Todo list ========= - Make it more generic in terms of stand-alone usage -- Add docstrings +- Consider changing default blank-threshold value diff --git a/docs/deskew_readme.txt b/docs/deskew_readme.txt new file mode 100644 index 0000000..a121215 --- /dev/null +++ b/docs/deskew_readme.txt @@ -0,0 +1,63 @@ +Deskew +------------------------ +by Marek Mauder +http://galfar.vevb.net/deskew/ +https://bitbucket.org/galfar/app-deskew + +v1.10 2014-03-04 + +Overview +------------------------ + +Deskew is a command line tool for deskewing scanned text documents. +It uses Hough transform to detect "text lines" in the image. As an output, you get +an image rotated so that the lines are horizontal. + +There are binaries built for these platforms (located in Bin folder): +Win32, Win64, Linux 32bit+64bit, Mac OSX 32bit. Some binaries have sufix +identifying their platform (deskew64.exe, deskew-osx, etc.). + +You can find some test images in TestImages folder and +scripts to run tests (RunTests.bat and runtests.sh) in Bin. +Note that scripts just call 'deskew' command so you may need +to rename binary for your platform to just 'deskew'. + +Usage +------------------------ + +deskew [-o output] [-a angle] [-t a|treshold] [-b color] [-r rect] [-f format] [-s info] input + -o output: Output image file (default: out.png) + -a angle: Maximal skew angle in degrees (default: 10) + -t a|treshold: Auto threshold or value in 0..255 (default: a) + -b color: Background color in hex format RRGGBB (default: trns. 
black) + -r rect: Skew detection only in content rectangle (pixels): + left,top,right,bottom (default: whole page) + -f format: Force output pixel format (values: b1|g8|rgba32) + -s info: Info dump (any combination of): + s - skew detection stats, p - program parameters + input: Input image file + + Supported file formats + Input: BMP, JPG, PNG, JNG, GIF, DDS, TGA, PBM, PGM, PPM, PAM, PFM, PSD, TIF (depends on platform) + Output: BMP, JPG, PNG, JNG, GIF, DDS, TGA, PGM, PPM, PAM, PFM, PSD, TIF (depends on platform) + +Version History +------------------------ + 1.10 2014-03-04: + - TIFF support for Win64 and 32/64bit Linux + - forced output formats + - fix: output file name were always lowercase + - fix: preserves resolution metadata (e.g. 300dpi) of input when writing output + 1.00 2012-06-04: + - background color + - "area of interest" content rect + - 64bit and Mac OSX support + - PSD and TIFF (win32) support + - show skew detection stats and program parameters + 0.95 2010-12-28: + - Added auto thresholding + - Imaging library updated. + 0.90 2010-02-12: + -Initial version + + diff --git a/requirements.txt b/requirements.txt index cd550f1..23725bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ docopt>=0.6.1 +wx>=3.0.2.0 + diff --git a/scanpdf.egg-info/PKG-INFO b/scanpdf.egg-info/PKG-INFO index 1ca6e47..c9ac244 100644 --- a/scanpdf.egg-info/PKG-INFO +++ b/scanpdf.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: scanpdf -Version: 0.3.0 +Version: 0.4.0 Summary: Utility to use SANE/scanadf to scan to PDF Home-page: UNKNOWN Author: Virantha N. Ekanayake @@ -35,6 +35,9 @@ Description: Scan PDF - Easy scans in Linux with a document scanner like the Fuj * `Integrates with ScanBd `_ to respond to hardware button presses * Automatically removes blank pages. 
* Scans in color, and automatically down-converts into 1-bit B/W image for text/greyscale images + * deskews images + * (optionally) applies unpaper formatting to finished images + * (optionally) applies pdf-sandwich text-recognition to finished pdf Usage: ------ @@ -43,6 +46,12 @@ Description: Scan PDF - Easy scans in Linux with a document scanner like the Fuj :: scanpdf scan pdf + + Or alternatively + + scanpdf scan pdf + + To bring up a file-save dialog to direct the finished pdf file. This will first perform the scan, and then the conversion to PDF. If you want to split up the scan and the PDF conversion into two separate invocations (for @@ -70,16 +79,21 @@ Description: Scan PDF - Easy scans in Linux with a document scanner like the Fuj :: - --dpi= DPI to scan in [default: 300] + ---dpi= DPI to scan in [default: 300] + --device= Scanning device (sub '%' for spaces) + --crop Run ImageMagick cropping routine + --tmpdir= Temporary directory + --keep-tmpdir Whether to keep the tmp dir after scanning or not [default: False] --face-up= Face-up scanning [default: True] --keep-blanks Don't check for and remove blank pages - --blank-threshold= Percentage of white to be marked as blank [default: 0.97] - --post-process Run unpaper to deskew/clean up + --blank-threshold= Percentage of white to be marked as blank [default: 0.97] + --post-process Process finished images with unpaper + --text-recognize Run pdfsandwich for text recognition Right now, I'm assuming this is getting called via ScanBD, so I don't have the option to manually specify the - scanner. If you really want to use this standalone, for now, please just set the ``SCANBD_DEVICE`` environment - variable to your scanner device name before running this script. + scanner. If you really want to use this standalone, for now, either set the --device option or just set the + ``SCANBD_DEVICE`` environment variable to your scanner device name before running this script. 
Installation @@ -88,12 +102,14 @@ Description: Scan PDF - Easy scans in Linux with a document scanner like the Fuj $ pip install scanpdf - Requires ImageMagick and SANE to be installed, for the command line tools: + Requires ImageMagick, SANE, unpaper and pdfsandwich to be installed, for the command line tools: * ``convert`` * ``identify`` * ``ps2pdf`` * ``scanadf`` + * ``unpaper`` + * ``pdfsandwich`` Also requires epstopdf. diff --git a/scanpdf.egg-info/requires.txt b/scanpdf.egg-info/requires.txt index 7920c79..cc1d939 100644 --- a/scanpdf.egg-info/requires.txt +++ b/scanpdf.egg-info/requires.txt @@ -1 +1,2 @@ -docopt>=0.6.1 \ No newline at end of file +docopt>=0.6.1 +wx>=3.0.2.0 \ No newline at end of file diff --git a/scanpdf/deskew64 b/scanpdf/deskew64 new file mode 100644 index 0000000..3ff9d44 Binary files /dev/null and b/scanpdf/deskew64 differ diff --git a/scanpdf/scanpdf.py b/scanpdf/scanpdf.py index b211987..254f29b 100644 --- a/scanpdf/scanpdf.py +++ b/scanpdf/scanpdf.py @@ -17,48 +17,266 @@ Usage: scanpdf [options] scan scanpdf [options] pdf - scanpdf [options] scan pdf + scanpdf [options] scan pdf + scanpdf [options] pdf + scanpdf [options] scan pdf Options: -v --verbose Verbose logging -d --debug Debug logging --dpi= DPI to scan in [default: 300] + --device= Scanning device (sub '%' for spaces) + --crop Run ImageMagick cropping routine --tmpdir= Temporary directory --keep-tmpdir Whether to keep the tmp dir after scanning or not [default: False] --face-up= Face-up scanning [default: True] --keep-blanks Don't check for and remove blank pages - --blank-threshold= Percentage of white to be marked as blank [default: 0.97] - --post-process Run unpaper to deskew/clean up + --blank-threshold= Percentage of white to be marked as blank [default: 0.97] + --post-process Process finished images with unpaper + --text-recognize Run pdfsandwich for text recognition """ -import sys, os +import glob import logging -import shutil +import multiprocessing +import os 
import re +import shutil +import subprocess +import sys +import time +import wx +from multiprocessing.dummy import Pool as Threadpool -from version import __version__ import docopt -import subprocess -import time -import glob -from itertools import combinations +from version import __version__ +date_format = '%m/%d/%Y %H:%M:%S' -class ScanPdf(object): - """ - The main clas. Performs the following functions: +class ProcessPage: + """Process page applies methods to a single PPM Image + Attributes: + page (str): Filename of image + scanpdf (ScanPdf): ScanPDF object which instantiated the ProcessPage """ + page = None + scanpdf = None + + def __init__(self, page, scanpdf): + """ + """ + self.page = page + self.scanpdf = scanpdf + os.chdir(self.scanpdf.tmp_dir) + + def process(self): + """ + Apply all processing in accordance with selcted options + :return: None + """ + self.run_deskew() + if self.scanpdf.crop: + self.run_crop() + self.convert_to_bw() + if not self.scanpdf.keep_blanks: + self.remove_blank() + if self.page is not None and self.scanpdf.post_process: + self.run_postprocess() + + def run_deskew(self): + """ + Deskew image using Marke Mauder Deskew + https://bitbucket.org/galfar/app-deskew + :return: None + """ + deskew = os.path.dirname(os.path.realpath(__file__)) + os.path.sep + 'deskew64' + logging.info("Deskewing: " + os.path.basename(self.page)) + ppm_page = '%s.ppm' % self.page + c = [deskew, ' %s ' % self.page, '-o', ppm_page] + result = self.scanpdf.cmd(c) + logging.debug("deskew result: " + result) + os.remove(self.page) + self.page = ppm_page + + def run_crop(self): + """ + Apply standard cropping routine from ImageMagick + :return: + """ + logging.info("Cropping: " + os.path.basename(self.page)) + crop_page = '%s.crop' % self.page + c = ['convert', '-fuzz 20%', '-trim', self.page, crop_page] + self.scanpdf.cmd(c) + os.remove(self.page) + self.page = crop_page + + def run_postprocess(self): + logging.info("Post-processing with unpaper: " + 
os.path.basename(self.page)) + shutil.move(self.page, '%s.ppm' % self.page) + self.page = '%s.ppm' % self.page + processed_page = '%s_unpaper' % self.page + c = ['unpaper', self.page, processed_page] + self.scanpdf.cmd(c) + os.remove(self.page) + self.page = processed_page + + def convert_to_bw(self): + """ + Check if color exists, convert to black-white if not + :return: None + """ + logging.info("Checking if: " + os.path.basename(self.page) + " is bw...") + if not self._is_color(): + self._page_to_bw() - def __init__ (self): + def _page_to_bw(self): + """ + Apply ImageMagick Black-White Conversion + :return: None + """ + bw_page = "%s_bw" % self.page + c = "convert %s +dither -colors 2 -colorspace gray -normalize %s_bw" % (self.page, self.page) + self.scanpdf.cmd(c) + # Remove the old file + os.remove(self.page) + self.page = bw_page + + @staticmethod + def run(args): + """ + Static run method for ProcessPage + :param args: page (string pagename), ScanPdf tuple + :return: name of final processed page + """ + process_page = ProcessPage(args[0], args[1]) + process_page.process() + return process_page.page + + def remove_blank(self): + """ + Check if page is blank: if so, remove + :return: None + """ + logging.info("Checking if: " + os.path.basename(self.page) + " is blank") + if self.is_blank(): + os.remove(self.page) + self.page = None + + def is_blank(self): + """ + Check if page blank by comparing stdev from ImageMagick 'Identify' + to threshold (1 - blank_threshold) + :return: true if image is blank + """ + if not os.path.exists(self.page): + return True + c = 'identify -verbose %s' % self.page + result = self.scanpdf.cmd(c) + m_std_dev = re.compile("""\s*standard deviation:\s*\d+\.\d+\s*\((?P\d+\.\d+)\).*""") + for line in result.splitlines(): + match = m_std_dev.search(line) + if match: + stdev = float(match.group('percent')) + logging.info(os.path.basename(self.page) + " std. dev: " + "{0:.4f}".format(stdev)) + if stdev > 1. 
- self.scanpdf.blank_threshold: + return False + return True + + def _is_color(self): + """ + Run the following command from ImageMagick: + convert holi.pdf -colors 8 -depth 8 -format %c histogram:info:- + This outputs something like the following: + 10831: ( 24, 26, 26,255) #181A1A srgba(24,26,26,1) + 4836: ( 55, 87, 79,255) #37574F srgba(55,87,79,1) + 6564: ( 77,138,121,255) #4D8A79 srgba(77,138,121,1) + 4997: ( 86, 96, 93,255) #56605D srgba(86,96,93,1) + 7005: ( 92,153,139,255) #5C998B srgba(92,153,139,1) + 2479: (143,118,123,255) #8F767B srgba(143,118,123,1) + 8870: (169,176,170,255) #A9B0AA srgba(169,176,170,1) + 442906: (254,254,254,255) #FEFEFE srgba(254,254,254,1) + 1053: ( 0, 0, 0,255) #000000 black + 484081: (255,255,255,255) #FFFFFF white + :return: true if image color exists + """ + c = "convert %s -colors 8 -depth 8 -format %%c histogram:info:-" % self.page + out = self.scanpdf.cmd(c) + m_line = re.compile(r"""\s*(?P\d+):\s*\(\s*(?P\d+),\s*(?P\d+),\s*(?P\d+).+""") + colors = [] + for line in out.splitlines(): + match_line = m_line.search(line) + if match_line: + logging.debug("Found RGB values") + color = [int(x) for x in (match_line.group('count'), + match_line.group('R'), + match_line.group('G'), + match_line.group('B'), + ) + ] + colors.append(color) + # sort + colors.sort(reverse=True, key=lambda y: y[0]) + logging.debug(colors) + is_color = False + logging.debug(colors) + for color in colors: + # Calculate the mean differences between the RGB components + # Shades of grey will be very close to zero in this metric... 
+ diff = float(sum([abs(color[2] - color[1]), + abs(color[3] - color[1]), + abs(color[3] - color[2]), + ])) / 3 + if diff > 20: + is_color = True + logging.debug("Found color") + return is_color + + +class ScanPdf(object): + """ + Controls Scan and PDF functions + Attributes: + pages (list): list of page filenames + cwd (str): Current Working Directory + tmp_dir(str): name of temporary directory for original and processed images + args (dict): command line arguments received from docopt + device (str): scanning device name recongized by scanadf + pdf_filename (str): path/name for pdf output + dpi (int): resolution for scanned image (dots per inch) + keep_blanks (bool): Whether to keep blank images + crop (bool): Whether to crop images + post-process (bool): Whether to apply unpaper post-processing + text_recognize (bool): Whether to apply pdf-sandwich post processing + """ + pages = None + cwd = None + tmp_dir = None + args = None + device = None + pdf_filename = None + dpi = None + keep_blanks = None + blank_threshold = None + crop = None + post_process = None + text_recognize = None + + def __init__(self): """ """ self.config = None + self.cwd = os.getcwd() def cmd(self, cmd_list): + """ + Run command in operating system + :param cmd_list: list of commands + :return: os outpout of command + """ if isinstance(cmd_list, list): cmd_list = ' '.join(cmd_list) logging.debug("Running cmd: %s" % cmd_list) @@ -69,235 +287,151 @@ def cmd(self, cmd_list): except subprocess.CalledProcessError as e: print e.output self._error("Could not run command %s" % cmd_list) - - def run_scan(self): - device = os.environ['SCANBD_DEVICE'] + """ + Scan documents from device + :return: None + """ + if self.device is None: + self._error("Scanning device is undefined!") self.cmd('logger -t "scanbd: " "Begin of scan "') c = ['scanadf', - '-d "%s"' % device, - '--source "ADF Duplex"', - '--mode Color', - '--resolution %sdpi' % self.dpi, - '--y-resolution %sdpi' % self.dpi, - '-o %s/page_%%04d' 
% self.tmp_dir, - '-y 876', - '--page-height 376', - ] + '-d "%s"' % self.device, + '--source "ADF Duplex"', + '--mode Color', + '--resolution %sdpi' % self.dpi, + '-o %s/page_%%04d' % self.tmp_dir, + '-y 279.364', + '--page-height 279.364', + ] self.cmd(c) self.cmd('logger -t "scanbd: " "End of scan "') + file_count = len([name for name in os.listdir(self.tmp_dir) if + os.path.isfile(os.path.join(self.tmp_dir, name))]) + logging.info('Receved {0:d} files in {1:}...'.format(file_count, self.tmp_dir)) - def _error(self, msg): + @staticmethod + def _error(msg): + """ + print error and exit + :param msg: error message + :return: None + """ print("ERROR: %s" % msg) sys.exit(-1) - def _atoi(self,text): - return int(text) if text.isdigit() else text + @staticmethod + def _atoi(text): + return int(text) if text.isdigit() else text def _natural_keys(self, text): - ''' + """ alist.sort(key=natural_keys) sorts in human order http://nedbatchelder.com/blog/200712/human_sorting.html (See Toothy's implementation in the comments) - ''' - return [ self._atoi(c) for c in re.split('(\d+)', text) ] + :param text: + :return: sorted + """ + return [self._atoi(c) for c in re.split('(\d+)', text)] def get_pages(self): cwd = os.getcwd() os.chdir(self.tmp_dir) - pages = glob.glob('./page_*') - pages.sort(key = self._natural_keys) + pages = glob.glob('page_*') + pages.sort(key=self._natural_keys) os.chdir(cwd) return pages - def reorder_face_up(self, pages): - reorder = [] - assert len(pages) % 2 == 0, "Why is page count not even for duplexing??" + def reorder_face_up(self): + assert len(self.pages) % 2 == 0, "Why is page count not even for duplexing??" 
logging.info("Reordering pages") - #for i in range(0,len(pages),2): - #pages[i], pages[i+1] = pages[i+1], pages[i] - pages.reverse() - return pages - - def is_blank(self, filename): - """ - Returns true if image in filename is blank - - standard deviation: 56.9662 (0.223397) - """ - if not os.path.exists(filename): - return True - - - c = 'identify -verbose %s' % filename - result = self.cmd(c) - mStdDev = re.compile("""\s*standard deviation:\s*\d+\.\d+\s*\((?P\d+\.\d+)\).*""") - for line in result.splitlines(): - match = mStdDev.search(line) - if match: - stdev = float(match.group('percent')) - if stdev > 0.1: - return False - return True + # for i in range(0,len(pages),2): + # pages[i], pages[i+1] = pages[i+1], pages[i] + self.pages.reverse() # OLD CODE - doesn't work for color images - c = 'convert %s -shave 1%%x1%% -format "%%[fx:mean]" info:' % filename - result = self.cmd(c) - if float(result.strip()) > self.blank_threshold: - return True - else: - return False + # c = 'convert %s -shave 1%%x1%% -format "%%[fx:mean]" info:' % filename + # result = self.cmd(c) + # if float(result.strip()) > self.blank_threshold: + # return True + # else: + # return False + + def run_text_recognize(self, pdf_file): + """ + run text recognition + :param pdf_file: pdf filename (in temporary directory) + :return: + """ + c = ['pdfsandwich', '-coo', '\"-deskew 40%\"', pdf_file] + self.cmd(c) + filename, file_extension = os.path.splitext(pdf_file) + ocr_file = filename + "_ocr" + file_extension + os.remove(pdf_file) + shutil.move(ocr_file, pdf_file) - def run_postprocess(self, page_files): - cwd = os.getcwd() - os.chdir(self.tmp_dir) - - processed_pages = [] - for page in page_files: - processed_page = '%s_unpaper' % page - c = ['unpaper', page, processed_page] - self.cmd(c) - os.remove(page) - processed_pages.append(processed_page) - os.chdir(cwd) - return processed_pages + @staticmethod + def save_file(source_file): + """ + Run dialog to save file to user-defined name/folder + 
:param source_file: pdf file in temporary directory + :return: None + """ + path = None + _ = wx.App(redirect=True) + wildcard = "PDF Files (*.pdf)|*.pdf|" \ + "All files (*.*)|*.*" - def run_crop(self, page_files): - cwd = os.getcwd() - os.chdir(self.tmp_dir) - crop_pages = [] - for i, page in enumerate(page_files): - logging.debug("Cropping page %d" % i) - crop_page = '%s.crop' % page - crop_pages.append(crop_page) - c = ['convert', - '-fuzz 20%', - '-trim', - ' %s ' % page, - crop_page, - ] - self.cmd(c) - os.remove(page) + dialog = wx.FileDialog(None, "Choose a file", os.path.expanduser("~"), "", wildcard, wx.SAVE) + if dialog.ShowModal() == wx.ID_OK: + path = dialog.GetPath() - os.chdir(cwd) - return crop_pages + dialog.Destroy() + logging.info("Saving from Dialog: " + path) + shutil.move(source_file, path) - def run_convert(self, page_files): - cwd = os.getcwd() + def run_convert(self, text_recognize): + """ + convert images to pdf file, run text recongnition (if desired), + save to directory (pre-defined or user-defined) + :param text_recognize: boolean on whether to apply text recognition + :return: None + """ os.chdir(self.tmp_dir) - - pdf_basename = os.path.basename(self.pdf_filename) + if self.pdf_filename is not None: + pdf_basename = os.path.basename(self.pdf_filename) + else: + pdf_basename = "temp.pdf" ps_filename = pdf_basename ps_filename = ps_filename.replace(".pdf", ".ps") c = ['convert', - '-density %s' % self.dpi, - '-rotate 180', - ' '.join(page_files), - ps_filename - ] + '-density %s' % self.dpi, + ' '.join(self.pages), + ps_filename + ] self.cmd(c) c = ['ps2pdf', - '-DPDFSETTINGS=/prepress', - ps_filename, - pdf_basename, - ] - c = ['epstopdf', - ps_filename, - ] - + '-DPDFSETTINGS=/prepress', + ps_filename, + pdf_basename, + ] self.cmd(c) - shutil.move(pdf_basename, self.pdf_filename) - for filename in page_files+[ps_filename]: + if text_recognize: + logging.info("running pdf sandwich for text recognition...") + 
self.run_text_recognize(pdf_basename) + if self.pdf_filename is not None: + shutil.move(pdf_basename, self.pdf_filename) + else: + source_file = os.path.join(self.tmp_dir, pdf_basename) + self.save_file(source_file) + for filename in self.pages + [ps_filename]: os.remove(filename) - + # IF we did the scan, then remove the tmp dir too if self.args['scan'] and not self.args['--keep-tmpdir']: os.rmdir(self.tmp_dir) - os.chdir(cwd) - - - def convert_to_bw(self, pages): - new_pages = [] - for i, page in enumerate(pages): - filename = os.path.join(self.tmp_dir, page) - logging.info("Checking if %s is bw..." % filename) - if self._is_color(filename): - new_pages.append(page) - else: # COnvert to BW - bw_page = self._page_to_bw(filename) - new_pages.append(bw_page) - return new_pages - - - def _page_to_bw(self, page): - out_page = "%s_bw" % page - cwd = os.getcwd() - os.chdir(self.tmp_dir) - - cmd = "convert %s +dither -colors 2 -colorspace gray -normalize %s_bw" % (page, page) - out = self.cmd(cmd) - # Remove the old file - os.remove(page) - os.chdir(cwd) - return out_page - - def _is_color(self, filename): - """ - Run the following command from ImageMagick: - - :: - - convert holi.pdf -colors 8 -depth 8 -format %c histogram:info:- - - This outputs something like the following: - :: - - 10831: ( 24, 26, 26,255) #181A1A srgba(24,26,26,1) - 4836: ( 55, 87, 79,255) #37574F srgba(55,87,79,1) - 6564: ( 77,138,121,255) #4D8A79 srgba(77,138,121,1) - 4997: ( 86, 96, 93,255) #56605D srgba(86,96,93,1) - 7005: ( 92,153,139,255) #5C998B srgba(92,153,139,1) - 2479: (143,118,123,255) #8F767B srgba(143,118,123,1) - 8870: (169,176,170,255) #A9B0AA srgba(169,176,170,1) - 442906: (254,254,254,255) #FEFEFE srgba(254,254,254,1) - 1053: ( 0, 0, 0,255) #000000 black - 484081: (255,255,255,255) #FFFFFF white - - """ - cmd = "convert %s -colors 8 -depth 8 -format %%c histogram:info:-" % filename - out = self.cmd(cmd) - mLine = re.compile(r"""\s*(?P\d+):\s*\(\s*(?P\d+),\s*(?P\d+),\s*(?P\d+).+""") 
- colors = [] - for line in out.splitlines(): - matchLine = mLine.search(line) - if matchLine: - logging.debug("Found RGB values") - color = [int(x) for x in (matchLine.group('count'), - matchLine.group('R'), - matchLine.group('G'), - matchLine.group('B'), - ) - ] - colors.append(color) - # sort - colors.sort(reverse=True, key = lambda x: x[0]) - logging.debug(colors) - is_color = False - logging.debug(colors) - for color in colors: - # Calculate the mean differences between the RGB components - # Shades of grey will be very close to zero in this metric... - diff = float(sum([abs(color[2]-color[1]), - abs(color[3]-color[1]), - abs(color[3]-color[2]), - ]))/3 - if diff > 20: - is_color = True - logging.debug("Found color") - return is_color - - + os.chdir(self.cwd) def get_options(self, argv): """ @@ -305,23 +439,20 @@ def get_options(self, argv): :param argv: usually just sys.argv[1:] :returns: Nothing - - :ivar debug: Enable logging debug statements - :ivar verbose: Enable verbose logging - :ivar config: Dict of the config file - """ self.args = argv if argv['--verbose']: logging.basicConfig(level=logging.INFO, format='%(message)s') if argv['--debug']: - logging.basicConfig(level=logging.DEBUG, format='%(message)s') + logging.basicConfig(level=logging.DEBUG, format='%(message)s') if self.args['pdf']: - self.pdf_filename = os.path.abspath(self.args['']) - + if self.args['']: + self.pdf_filename = os.path.abspath(self.args['']) + logging.info('saving to file: ' + self.pdf_filename) + else: + logging.info('saving file via dialog after processing...') self.dpi = self.args['--dpi'] - output_dir = time.strftime('%Y%m%d_%H%M%S', time.localtime()) if argv['--tmpdir']: self.tmp_dir = argv['--tmpdir'] @@ -329,8 +460,12 @@ def get_options(self, argv): self.tmp_dir = os.path.join('/tmp', output_dir) self.tmp_dir = os.path.abspath(self.tmp_dir) - # Make the tmp dir only if we're scanning, o/w throw an error + # Make the tmp dir only if we're scanning, o/w throw an error, also 
get device if argv['scan']: + if argv['--device']: + self.device = argv['--device'].replace('%', ' ') # Replace % with spaces to comply with docopt + else: + self.device = os.environ.get('SCANBD_DEVICE') if os.path.exists(self.tmp_dir): self._error("Temporary output directory %s already exists!" % self.tmp_dir) else: @@ -338,15 +473,17 @@ def get_options(self, argv): else: if not os.path.exists(self.tmp_dir): self._error("Scan files directory %s does not exist!" % self.tmp_dir) - + # Blank checks - self.keep_blanks = argv['--keep-blanks'] + self.keep_blanks = argv['--keep-blanks'] self.blank_threshold = float(argv['--blank-threshold']) - assert(self.blank_threshold >= 0 and self.blank_threshold <= 1.0) + assert (0 <= self.blank_threshold <= 1.0) + self.crop = argv['--crop'] + self.text_recognize = argv['--text-recognize'] self.post_process = argv['--post-process'] def go(self, argv): - """ + """ The main entry point into ScanPdf #. Get the options @@ -355,52 +492,45 @@ def go(self, argv): """ # Read the command line options self.get_options(argv) + start = time.time() + logging.info("Starting: " + time.strftime(date_format)) logging.info("Temp dir: %s" % self.tmp_dir) if self.args['scan']: self.run_scan() - + if self.args['pdf']: # Now, convert the files to ps - pages = self.get_pages() - logging.debug( pages ) - if self.args['--face-up']: - pages = self.reorder_face_up(pages) - - logging.debug( pages ) - - # Crop the pages - pages = self.run_crop(pages) - - # Now, check if color or bw - pages = self.convert_to_bw(pages) - logging.debug(pages) - - # Run blanks - if not self.keep_blanks: - no_blank_pages = [] - for i,page in enumerate(pages): - filename = os.path.join(self.tmp_dir, page) - logging.info("Checking if %s is blank..." % filename) - if not self.is_blank(filename): - no_blank_pages.append(page) - else: - logging.info(" page %s is blank, removing..." 
% i) - os.remove(filename) - pages = no_blank_pages - - logging.debug( pages ) - - if self.post_process: - pages = self.run_postprocess(pages) - - self.run_convert(pages) - + self.pages = self.get_pages() + logging.debug(self.pages) + if self.args['--face-up'] == 'True': # Default is a text value of 'True' + self.reorder_face_up() + + start_pages = [(page, self) for page in self.pages] + + threads_to_run = multiprocessing.cpu_count() + logging.info('Processing pages with {0:d} threads...'.format(threads_to_run)) + + pool = Threadpool(threads_to_run) + results = pool.map(ProcessPage.run, start_pages) + pool.close() + pool.join() + + self.pages = [page for page in results if page is not None] + + logging.debug(self.pages) + self.run_convert(self.text_recognize) + + end = time.time() + logging.info("End: " + time.strftime(date_format)) + logging.info("Elapsed Time (seconds): {0:.2f}".format(end-start)) + + def main(): - args = docopt.docopt(__doc__, version='Scan PDF %s' % __version__ ) + args = docopt.docopt(__doc__, version='Scan PDF %s' % __version__) script = ScanPdf() print args script.go(args) + if __name__ == '__main__': main() - diff --git a/scanpdf/version.py b/scanpdf/version.py index 260c070..6a9beea 100644 --- a/scanpdf/version.py +++ b/scanpdf/version.py @@ -1 +1 @@ -__version__ = "0.3.1" +__version__ = "0.4.0" diff --git a/test/checkscanpdf.py b/test/checkscanpdf.py new file mode 100644 index 0000000..d43a558 --- /dev/null +++ b/test/checkscanpdf.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python2.7 + +# Basic Test for scanpdf.py + +import os +from scanpdf import scanpdf + +OUTPUT_DIRECTORY = "~/Temp" +FILENAME = "image_test.pdf" + + +def ensure_dir(f): + d = os.path.dirname(f) + if not os.path.exists(d): + os.makedirs(d) + + +def main(): + ensure_dir(OUTPUT_DIRECTORY) + full_path = os.path.join(OUTPUT_DIRECTORY, FILENAME) + if os.path.exists(full_path): + os.remove(full_path) + os.environ["SCANBD_DEVICE"] = 'net:localhost:fujitsu:ScanSnap S1500:1448' + args = 
{'scan': True, 'pdf': True, '': full_path, + '--verbose': True, '--debug': True, '--device': "net:localhost:fujitsu:ScanSnap S1500:1448", + '--dpi': 300, '--crop': False, '--tmpdir': False, + '--keep-blanks': False, '--blank-threshold': 0.80, '--text-recognize': False, + '--face-up': False, '--keep-tmpdir': False, '--post-process': False} + script = scanpdf.ScanPdf() + print args + script.go(args) + + +if __name__ == '__main__': + main()