From 9685c76a7dc1cbca16c422e2df25af0bc5cde7ab Mon Sep 17 00:00:00 2001 From: Taneja <deepankur.taneja.ext@senvion.com> Date: Fri, 24 May 2019 18:40:27 +0530 Subject: [PATCH 1/2] Added fileObj to read file object --- camelot/__init__.py | 1 + camelot/handlers.py | 103 ++++++++++++++++++++++++++++++++++++-------- camelot/io.py | 101 +++++++++++++++++++++++++++++++++++++++++++ camelot/utils.py | 1 + 4 files changed, 188 insertions(+), 18 deletions(-) diff --git a/camelot/__init__.py b/camelot/__init__.py index 68815f25..7cebffad 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -6,6 +6,7 @@ from .__version__ import __version__ from .io import read_pdf +from .io import read_fileObj from .plotting import PlotMethods diff --git a/camelot/handlers.py b/camelot/handlers.py index d773e4a6..7a492047 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -27,28 +27,42 @@ class PDFHandler(object): Password for decryption. """ - def __init__(self, filepath, pages='1', password=None): - if is_url(filepath): - filepath = download_url(filepath) - self.filepath = filepath - if not filepath.lower().endswith('.pdf'): - raise NotImplementedError("File format not supported") - - if password is None: - self.password = '' - else: - self.password = password - if sys.version_info[0] < 3: - self.password = self.password.encode('ascii') - self.pages = self._get_pages(self.filepath, pages) + def __init__(self, filepath="",fileObj="", pages='1', password=None): + if filepath != "": + if is_url(filepath): + filepath = download_url(filepath) + self.filepath = filepath + self.fileObj = "" + if not filepath.lower().endswith('.pdf'): + raise NotImplementedError("File format not supported") + + if password is None: + self.password = '' + else: + self.password = password + if sys.version_info[0] < 3: + self.password = self.password.encode('ascii') + self.pages = self._get_pages(filepath=self.filepath, pages=pages) + if fileObj != "": + self.fileObj = fileObj + self.filepath = "" + if password is None: + self.password = '' + else: + self.password = password + if sys.version_info[0] < 3: + self.password = self.password.encode('ascii') + self.pages = self._get_pages(fileObj=self.fileObj, pages=pages) - def _get_pages(self, filepath, pages): + def _get_pages(self, filepath="", pages='1',fileObj=""): """Converts pages string to list of ints. Parameters ---------- filepath : str Filepath or URL of the PDF file. + fileObj : str + File Object of the PDF file. pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. @@ -63,7 +77,10 @@ def _get_pages(self, filepath, pages): if pages == '1': page_numbers.append({'start': 1, 'end': 1}) else: - infile = PdfFileReader(open(filepath, 'rb'), strict=False) + if filepath: + infile = PdfFileReader(open(filepath, 'rb'), strict=False) + if fileObj: + infile = PdfFileReader(fileObj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) if pages == 'all': @@ -82,7 +99,7 @@ def _get_pages(self, filepath, pages): P.extend(range(p['start'], p['end'] + 1)) return sorted(set(P)) - def _save_page(self, filepath, page, temp): + def _save_page(self, filepath, page='1', temp=''): """Saves specified page from PDF into a temporary directory. Parameters @@ -95,6 +112,7 @@ def _save_page(self, filepath, page, temp): Tmp directory. """ + with open(filepath, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: @@ -128,6 +146,52 @@ def _save_page(self, filepath, page, temp): with open(fpath, 'wb') as f: outfile.write(f) + def _save_page_new(self, fileObj, page='1', temp=''): + """Saves specified page from PDF into a temporary directory. + + Parameters + ---------- + fileObj : str + File Object of the PDF file. + page : int + Page number. + temp : str + Tmp directory. + + """ + fileobj = fileObj + infile = PdfFileReader(fileobj, strict=False) + if infile.isEncrypted: + infile.decrypt(self.password) + fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) + froot, fext = os.path.splitext(fpath) + p = infile.getPage(page - 1) + outfile = PdfFileWriter() + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + layout, dim = get_page_layout(fpath) + # fix rotated PDF + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) + if rotation != '': + fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) + os.rename(fpath, fpath_new) + infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) + if infile.isEncrypted: + infile.decrypt(self.password) + outfile = PdfFileWriter() + p = infile.getPage(0) + if rotation == 'anticlockwise': + p.rotateClockwise(90) + elif rotation == 'clockwise': + p.rotateCounterClockwise(90) + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -153,7 +217,10 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa tables = [] with TemporaryDirectory() as tempdir: for p in self.pages: - self._save_page(self.filepath, p, tempdir) + if self.filepath != "": + self._save_page(filepath=self.filepath, page=p, temp=tempdir) + if self.fileObj != "": + self._save_page_new(fileObj=self.fileObj, page=p, temp=tempdir) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) diff --git a/camelot/io.py b/camelot/io.py index 5162dd29..85165941 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -105,3 +105,104 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs, **kwargs) return tables + + +def read_fileObj(fileObj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs={}, + **kwargs): + """Read PDF and return extracted tables. + + Note: kwargs annotated with ^ can only be used with flavor='stream' + and kwargs annotated with * can only be used with flavor='lattice'. + + Parameters + ---------- + fileObj : str + File Object of the PDF file. + pages : str, optional (default: '1') + Comma-separated page numbers. + Example: '1,3,4' or '1,4-end' or 'all'. + password : str, optional (default: None) + Password for decryption. + flavor : str (default: 'lattice') + The parsing method to use ('lattice' or 'stream'). + Lattice is used by default. + suppress_stdout : bool, optional (default: True) + Print all logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. Adds <s></s> around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + row_tol^ : int, optional (default: 2) + Tolerance parameter used to combine text vertically, + to generate rows. + column_tol^ : int, optional (default: 0) + Tolerance parameter used to combine text horizontally, + to generate columns. + process_background* : bool, optional (default: False) + Process background lines. + line_scale* : int, optional (default: 15) + Line size scaling factor. The larger the value the smaller + the detected lines. Making it very large will lead to text + being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Direction in which text in a spanning cell will be copied + over. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Direction in which text in a spanning cell will flow. + line_tol* : int, optional (default: 2) + Tolerance parameter used to merge close vertical and horizontal + lines. + joint_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize* : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. + threshold_constant* : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. + iterations* : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. + resolution* : int, optional (default: 300) + Resolution used for PDF to PNG conversion. + + Returns + ------- + tables : camelot.core.TableList + + """ + if flavor not in ['lattice', 'stream']: + raise NotImplementedError("Unknown flavor specified." + " Use either 'lattice' or 'stream'") + + with warnings.catch_warnings(): + if suppress_stdout: + warnings.simplefilter("ignore") + + validate_input(kwargs, flavor=flavor) + p = PDFHandler(fileObj=fileObj, pages=pages, password=password) + kwargs = remove_extra(kwargs, flavor=flavor) + tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs, **kwargs) + return tables diff --git a/camelot/utils.py b/camelot/utils.py index 48e39aff..c7fb2dcc 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -141,6 +141,7 @@ def remove_extra(kwargs, flavor='lattice'): class TemporaryDirectory(object): def __enter__(self): self.name = tempfile.mkdtemp() + print(self.name) return self.name def __exit__(self, exc_type, exc_value, traceback): From 725ff2867c004980987637ab2e01a2abac660910 Mon Sep 17 00:00:00 2001 From: Taneja <deepankur.taneja.ext@senvion.com> Date: Mon, 27 May 2019 11:30:06 +0530 Subject: [PATCH 2/2] Updated code --- .gitignore | 4 +++- camelot/utils.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0a92ca02..3f1c6103 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ coverage.xml _build/ # vscode -.vscode \ No newline at end of file +.vscode + +.idea/ diff --git a/camelot/utils.py b/camelot/utils.py index c7fb2dcc..48e39aff 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -141,7 +141,6 @@ def remove_extra(kwargs, flavor='lattice'): class TemporaryDirectory(object): def __enter__(self): self.name = tempfile.mkdtemp() - print(self.name) return self.name def __exit__(self, exc_type, exc_value, traceback):