From 9685c76a7dc1cbca16c422e2df25af0bc5cde7ab Mon Sep 17 00:00:00 2001
From: Taneja <deepankur.taneja.ext@senvion.com>
Date: Fri, 24 May 2019 18:40:27 +0530
Subject: [PATCH 1/2] Add read_fileObj to read a PDF from a file object

---
 camelot/__init__.py |   1 +
 camelot/handlers.py | 103 ++++++++++++++++++++++++++++++++++++--------
 camelot/io.py       | 101 +++++++++++++++++++++++++++++++++++++++++++
 camelot/utils.py    |   1 +
 4 files changed, 188 insertions(+), 18 deletions(-)

diff --git a/camelot/__init__.py b/camelot/__init__.py
index 68815f25..7cebffad 100644
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -6,6 +6,7 @@
 
 from .__version__ import __version__
 from .io import read_pdf
+from .io import read_fileObj
 from .plotting import PlotMethods
 
 
diff --git a/camelot/handlers.py b/camelot/handlers.py
index d773e4a6..7a492047 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -27,28 +27,42 @@ class PDFHandler(object):
         Password for decryption.
 
     """
-    def __init__(self, filepath, pages='1', password=None):
-        if is_url(filepath):
-            filepath = download_url(filepath)
-        self.filepath = filepath
-        if not filepath.lower().endswith('.pdf'):
-            raise NotImplementedError("File format not supported")
-
-        if password is None:
-            self.password = ''
-        else:
-            self.password = password
-            if sys.version_info[0] < 3:
-                self.password = self.password.encode('ascii')
-        self.pages = self._get_pages(self.filepath, pages)
+    def __init__(self, filepath="",fileObj="", pages='1', password=None):
+        if filepath != "":
+            if is_url(filepath):
+                filepath = download_url(filepath)
+            self.filepath = filepath
+            self.fileObj = ""
+            if not filepath.lower().endswith('.pdf'):
+                raise NotImplementedError("File format not supported")
+
+            if password is None:
+                self.password = ''
+            else:
+                self.password = password
+                if sys.version_info[0] < 3:
+                    self.password = self.password.encode('ascii')
+            self.pages = self._get_pages(filepath=self.filepath, pages=pages)
+        if fileObj != "":
+            self.fileObj = fileObj
+            self.filepath = ""
+            if password is None:
+                self.password = ''
+            else:
+                self.password = password
+                if sys.version_info[0] < 3:
+                    self.password = self.password.encode('ascii')
+            self.pages = self._get_pages(fileObj=self.fileObj, pages=pages)
 
-    def _get_pages(self, filepath, pages):
+    def _get_pages(self, filepath="", pages='1',fileObj=""):
         """Converts pages string to list of ints.
 
         Parameters
         ----------
         filepath : str
             Filepath or URL of the PDF file.
+        fileObj : file object, optional (default: '')
+            File object of the PDF file, passed to PdfFileReader as-is.
         pages : str, optional (default: '1')
             Comma-separated page numbers.
             Example: '1,3,4' or '1,4-end' or 'all'.
@@ -63,7 +77,10 @@ def _get_pages(self, filepath, pages):
         if pages == '1':
             page_numbers.append({'start': 1, 'end': 1})
         else:
-            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
+            if filepath:
+                infile = PdfFileReader(open(filepath, 'rb'), strict=False)
+            if fileObj:
+                infile = PdfFileReader(fileObj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
             if pages == 'all':
@@ -82,7 +99,7 @@ def _get_pages(self, filepath, pages):
             P.extend(range(p['start'], p['end'] + 1))
         return sorted(set(P))
 
-    def _save_page(self, filepath, page, temp):
+    def _save_page(self, filepath, page='1', temp=''):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
@@ -95,6 +112,7 @@ def _save_page(self, filepath, page, temp):
             Tmp directory.
 
         """
+
         with open(filepath, 'rb') as fileobj:
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
@@ -128,6 +146,52 @@ def _save_page(self, filepath, page, temp):
                 with open(fpath, 'wb') as f:
                     outfile.write(f)
 
+    def _save_page_new(self, fileObj, page='1', temp=''):
+        """Saves specified page from PDF into a temporary directory.
+
+        Parameters
+        ----------
+        fileObj : file object
+            File object of the PDF file, read with PdfFileReader.
+        page : int
+            Page number.
+        temp : str
+            Tmp directory.
+
+        """
+        fileobj = fileObj
+        infile = PdfFileReader(fileobj, strict=False)
+        if infile.isEncrypted:
+            infile.decrypt(self.password)
+        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
+        froot, fext = os.path.splitext(fpath)
+        p = infile.getPage(page - 1)
+        outfile = PdfFileWriter()
+        outfile.addPage(p)
+        with open(fpath, 'wb') as f:
+            outfile.write(f)
+        layout, dim = get_page_layout(fpath)
+        # fix rotated PDF
+        chars = get_text_objects(layout, ltype="char")
+        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+        vertical_text = get_text_objects(layout, ltype="vertical_text")
+        rotation = get_rotation(chars, horizontal_text, vertical_text)
+        if rotation != '':
+            fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
+            os.rename(fpath, fpath_new)
+            infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
+            if infile.isEncrypted:
+                infile.decrypt(self.password)
+            outfile = PdfFileWriter()
+            p = infile.getPage(0)
+            if rotation == 'anticlockwise':
+                p.rotateClockwise(90)
+            elif rotation == 'clockwise':
+                p.rotateCounterClockwise(90)
+            outfile.addPage(p)
+            with open(fpath, 'wb') as f:
+                outfile.write(f)
+
     def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -153,7 +217,10 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa
         tables = []
         with TemporaryDirectory() as tempdir:
             for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
+                if self.filepath != "":
+                    self._save_page(filepath=self.filepath, page=p, temp=tempdir)
+                if self.fileObj != "":
+                    self._save_page_new(fileObj=self.fileObj, page=p, temp=tempdir)
             pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                      for p in self.pages]
             parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
diff --git a/camelot/io.py b/camelot/io.py
index 5162dd29..85165941 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -105,3 +105,104 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
         tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
                          layout_kwargs=layout_kwargs, **kwargs)
         return tables
+
+
+def read_fileObj(fileObj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs={},
+                 **kwargs):
+    """Read PDF and return extracted tables.
+
+    Note: kwargs annotated with ^ can only be used with flavor='stream'
+    and kwargs annotated with * can only be used with flavor='lattice'.
+
+    Parameters
+    ----------
+    fileObj : file object
+        File object of the PDF file.
+    pages : str, optional (default: '1')
+        Comma-separated page numbers.
+        Example: '1,3,4' or '1,4-end' or 'all'.
+    password : str, optional (default: None)
+        Password for decryption.
+    flavor : str (default: 'lattice')
+        The parsing method to use ('lattice' or 'stream').
+        Lattice is used by default.
+    suppress_stdout : bool, optional (default: False)
+        If True, ignore warnings; also forwarded to PDFHandler.parse.
+    layout_kwargs : dict, optional (default: {})
+        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+    table_areas : list, optional (default: None)
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    columns^ : list, optional (default: None)
+        List of column x-coordinates strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Split text that spans across multiple cells.
+    flag_size : bool, optional (default: False)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    row_tol^ : int, optional (default: 2)
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
+    column_tol^ : int, optional (default: 0)
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
+    process_background* : bool, optional (default: False)
+        Process background lines.
+    line_scale* : int, optional (default: 15)
+        Line size scaling factor. The larger the value the smaller
+        the detected lines. Making it very large will lead to text
+        being detected as lines.
+    copy_text* : list, optional (default: None)
+        {'h', 'v'}
+        Direction in which text in a spanning cell will be copied
+        over.
+    shift_text* : list, optional (default: ['l', 't'])
+        {'l', 'r', 't', 'b'}
+        Direction in which text in a spanning cell will flow.
+    line_tol* : int, optional (default: 2)
+        Tolerance parameter used to merge close vertical and horizontal
+        lines.
+    joint_tol* : int, optional (default: 2)
+        Tolerance parameter used to decide whether the detected lines
+        and points lie close to each other.
+    threshold_blocksize* : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    threshold_constant* : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    iterations* : int, optional (default: 0)
+        Number of times for erosion/dilation is applied.
+
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+    resolution* : int, optional (default: 300)
+        Resolution used for PDF to PNG conversion.
+
+    Returns
+    -------
+    tables : camelot.core.TableList
+
+    """
+    if flavor not in ['lattice', 'stream']:
+        raise NotImplementedError("Unknown flavor specified."
+                                  " Use either 'lattice' or 'stream'")
+
+    with warnings.catch_warnings():
+        if suppress_stdout:
+            warnings.simplefilter("ignore")
+
+        validate_input(kwargs, flavor=flavor)
+        p = PDFHandler(fileObj=fileObj, pages=pages, password=password)
+        kwargs = remove_extra(kwargs, flavor=flavor)
+        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
+                         layout_kwargs=layout_kwargs, **kwargs)
+        return tables
diff --git a/camelot/utils.py b/camelot/utils.py
index 48e39aff..c7fb2dcc 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -141,6 +141,7 @@ def remove_extra(kwargs, flavor='lattice'):
 class TemporaryDirectory(object):
     def __enter__(self):
         self.name = tempfile.mkdtemp()
+        print(self.name)
         return self.name
 
     def __exit__(self, exc_type, exc_value, traceback):

From 725ff2867c004980987637ab2e01a2abac660910 Mon Sep 17 00:00:00 2001
From: Taneja <deepankur.taneja.ext@senvion.com>
Date: Mon, 27 May 2019 11:30:06 +0530
Subject: [PATCH 2/2] Remove debug print from TemporaryDirectory; ignore .idea/

---
 .gitignore       | 4 +++-
 camelot/utils.py | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0a92ca02..3f1c6103 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ coverage.xml
 _build/
 
 # vscode
-.vscode
\ No newline at end of file
+.vscode
+
+.idea/
diff --git a/camelot/utils.py b/camelot/utils.py
index c7fb2dcc..48e39aff 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -141,7 +141,6 @@ def remove_extra(kwargs, flavor='lattice'):
 class TemporaryDirectory(object):
     def __enter__(self):
         self.name = tempfile.mkdtemp()
-        print(self.name)
         return self.name
 
     def __exit__(self, exc_type, exc_value, traceback):