From b6aeb0e5c32283a66bb049c9c472339333a7f12e Mon Sep 17 00:00:00 2001 From: Brent Noorda Date: Thu, 12 May 2016 19:19:49 -0700 Subject: [PATCH 1/2] When identifying original pdf file, skip the first output lines that are warnings, e.g., **** Warning: considering '0000000000 XXXXX n' as a free entry. --- pypdfocr/pypdfocr_gs.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..a346a87 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -124,7 +124,7 @@ def _get_dpi(self, pdf_filename): try: out = subprocess.check_output(cmd, shell=True) except subprocess.CalledProcessError as e: - self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi) + self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi) return # Need the second line of output @@ -137,7 +137,7 @@ def _get_dpi(self, pdf_filename): logging.debug(results) results = results.split() if(results[2] != 'image'): - self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues") + self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues") return x_pt, y_pt, greyscale = int(results[3]), int(results[4]), results[5]=='gray' self.greyscale = greyscale @@ -146,7 +146,13 @@ def _get_dpi(self, pdf_filename): cmd = 'identify -format "%%w %%x %%h %%y\n" "%s"' % pdf_filename try: out = subprocess.check_output(cmd, shell=True) - results = out.splitlines()[0] + + # skip any lines starting with "*** Warning" + out_splitlines = out.splitlines() + while "* Warning:" in out_splitlines[0]: + out_splitlines = out_splitlines[1:] + results = out_splitlines[0] + results = results.replace("Undefined", "") width, xdensity, height, ydensity = [float(x) for x in results.split()] xdpi = round(x_pt/width*xdensity) @@ -161,8 +167,9 @@ def _get_dpi(self, pdf_filename): except Exception as e: + logging.debug(cmd) logging.debug(str(e)) - self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi) + self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi) return @@ -170,7 +177,7 @@ def _get_dpi(self, pdf_filename): def _run_gs(self, options, output_filename, pdf_filename): try: cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename) - logging.info(cmd) + logging.info(cmd) out = subprocess.check_output(cmd, shell=True) except subprocess.CalledProcessError as e: From 1e69d5c31068173e9561a411ade93467fda23c70 Mon Sep 17 00:00:00 2001 From: Brent Noorda Date: Thu, 12 May 2016 21:16:57 -0700 Subject: [PATCH 2/2] width is being used in place of height (although this function actually doesn't seem to do anything) --- pypdfocr/pypdfocr_pdf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index bdc1f86..4a98b3a 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -155,7 +155,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename): merger.append(PdfFileReader(file(text_pdf_filename, 'rb'))) merger.write(all_text_filename) merger.close() - del merger + del merger writer = PdfFileWriter() @@ -191,10 +191,10 @@ def _get_merged_single_page(self, original_page, ocr_text_page): if orig_rotation_angle != 0: logging.info("Original Rotation: %s" % orig_rotation_angle) - self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2) + self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getHeight()/2) # None of these commands worked for me: - #orig_pg.rotateCounterClockwise(orig_rotation_angle) - #orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle) + #original_page.rotateCounterClockwise(orig_rotation_angle) + #original_page.mergeRotatedPage(ocr_text_page,orig_rotation_angle) else: original_page.mergePage(ocr_text_page) original_page.compressContentStreams()