From b6aeb0e5c32283a66bb049c9c472339333a7f12e Mon Sep 17 00:00:00 2001
From: Brent Noorda <brent.noorda@gmail.com>
Date: Thu, 12 May 2016 19:19:49 -0700
Subject: [PATCH 1/2] When identifying original pdf file, skip the first output
 lines that are warnings, e.g.,     **** Warning: considering '0000000000
 XXXXX n' as a free entry.

---
 pypdfocr/pypdfocr_gs.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py
index 5599082..a346a87 100644
--- a/pypdfocr/pypdfocr_gs.py
+++ b/pypdfocr/pypdfocr_gs.py
@@ -124,7 +124,7 @@ def _get_dpi(self, pdf_filename):
         try:
             out = subprocess.check_output(cmd, shell=True)
         except subprocess.CalledProcessError as e:
-            self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi) 
+            self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi)
             return
 
         # Need the second line of output
@@ -137,7 +137,7 @@ def _get_dpi(self, pdf_filename):
         logging.debug(results)
         results = results.split()
         if(results[2] != 'image'):
-            self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues") 
+            self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues")
             return
         x_pt, y_pt, greyscale = int(results[3]), int(results[4]), results[5]=='gray'
         self.greyscale = greyscale
@@ -146,7 +146,13 @@ def _get_dpi(self, pdf_filename):
         cmd = 'identify -format "%%w %%x %%h %%y\n" "%s"' % pdf_filename
         try:
             out = subprocess.check_output(cmd, shell=True)
-            results = out.splitlines()[0]
+
+            # skip any leading lines that contain "* Warning:" (e.g. "**** Warning: ...")
+            out_splitlines = out.splitlines()
+            while "* Warning:" in  out_splitlines[0]:
+                out_splitlines = out_splitlines[1:]
+            results = out_splitlines[0]
+
             results = results.replace("Undefined", "")
             width, xdensity, height, ydensity = [float(x) for x in results.split()]
             xdpi = round(x_pt/width*xdensity)
@@ -161,8 +167,9 @@ def _get_dpi(self, pdf_filename):
 
 
         except Exception as e:
+            logging.debug(cmd)
             logging.debug(str(e))
-            self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi) 
+            self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi)
         return
 
 
@@ -170,7 +177,7 @@ def _get_dpi(self, pdf_filename):
     def _run_gs(self, options, output_filename, pdf_filename):
         try:
             cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
-            logging.info(cmd)        
+            logging.info(cmd)
             out = subprocess.check_output(cmd, shell=True)
 
         except subprocess.CalledProcessError as e:

From 1e69d5c31068173e9561a411ade93467fda23c70 Mon Sep 17 00:00:00 2001
From: Brent Noorda <brent.noorda@gmail.com>
Date: Thu, 12 May 2016 21:16:57 -0700
Subject: [PATCH 2/2] width is being used in place of height (although this
 function actually doesn't seem to do anything)

---
 pypdfocr/pypdfocr_pdf.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py
index bdc1f86..4a98b3a 100644
--- a/pypdfocr/pypdfocr_pdf.py
+++ b/pypdfocr/pypdfocr_pdf.py
@@ -155,7 +155,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
             merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
         merger.write(all_text_filename)
         merger.close()
-	del merger
+        del merger
 
 
         writer = PdfFileWriter()
@@ -191,10 +191,10 @@ def _get_merged_single_page(self, original_page, ocr_text_page):
 
         if orig_rotation_angle != 0:
             logging.info("Original Rotation: %s" % orig_rotation_angle)
-            self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2)
+            self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getHeight()/2)
             # None of these commands worked for me:
-            #orig_pg.rotateCounterClockwise(orig_rotation_angle)
-            #orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle)
+            #original_page.rotateCounterClockwise(orig_rotation_angle)
+            #original_page.mergeRotatedPage(ocr_text_page,orig_rotation_angle)
         else:
             original_page.mergePage(ocr_text_page)
         original_page.compressContentStreams()