diff --git a/.github/workflows/default.yml b/.github/workflows/default.yml
index 21375ce..e2cd29f 100644
--- a/.github/workflows/default.yml
+++ b/.github/workflows/default.yml
@@ -17,16 +17,16 @@ jobs:

     steps:
     - uses: actions/checkout@v1
-    - name: Set up Python 3.8
+    - name: Set up Python 3.13
       uses: actions/setup-python@v1
       with:
-        python-version: '3.8'
+        python-version: '3.13'
         architecture: ${{ matrix.config.python-arch }}
     - name: Install Excalibur
       run: |
         python -m pip install --upgrade pip
         python -m pip install pyinstaller
-        python -m pip install .
+        python -m pip install ".[all]"
     - name: Build with PyInstaller on Ubuntu and MacOS
       if: runner.os == 'Linux' || runner.os == 'macOS'
       run: |
diff --git a/.github/workflows/static-code-analysis.yml b/.github/workflows/static-code-analysis.yml
deleted file mode 100644
index b334746..0000000
--- a/.github/workflows/static-code-analysis.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# This workflow will install Python dependencies, run tests
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Run Static Code Analysis
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  build:
-
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: [3.8]
-
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-dev.txt
-        pip install .
-    - name: Type-checking with mypy
-      run: mypy .
-    - name: Style-checking with black
-      run: black --check .
-    - name: Style-checking with flake8
-      run: flake8 .
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index c5ee5a7..8448b85 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: ["3.11", "3.12", "3.13"]

     steps:
     - uses: actions/checkout@v2
@@ -26,8 +26,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r requirements-dev.txt
-        pip install .
+        python -m pip install ".[all]"
     - name: Test with pytest
       run: |
-        pytest
+        python -m pytest
diff --git a/docs/_themes/flask_theme_support.py b/docs/_themes/flask_theme_support.py
index bc259bf..279b798 100644
--- a/docs/_themes/flask_theme_support.py
+++ b/docs/_themes/flask_theme_support.py
@@ -23,76 +23,67 @@ class FlaskyStyle(Style):
     styles = {
         # No corresponding class for the following:
         # Text: "", # class: ''
-        Whitespace: "underline #f8f8f8", # class: 'w'
-        Error: "#a40000 border:#ef2929", # class: 'err'
-        Other: "#000000", # class 'x'
-
-        Comment: "italic #8f5902", # class: 'c'
-        Comment.Preproc: "noitalic", # class: 'cp'
-
-        Keyword: "bold #004461", # class: 'k'
-        Keyword.Constant: "bold #004461", # class: 'kc'
-        Keyword.Declaration: "bold #004461", # class: 'kd'
-        Keyword.Namespace: "bold #004461", # class: 'kn'
-        Keyword.Pseudo: "bold #004461", # class: 'kp'
-        Keyword.Reserved: "bold #004461", # class: 'kr'
-        Keyword.Type: "bold #004461", # class: 'kt'
-
-        Operator: "#582800", # class: 'o'
-        Operator.Word: "bold #004461", # class: 'ow' - like keywords
-
-        Punctuation: "bold #000000", # class: 'p'
-
+        Whitespace: "underline #f8f8f8",  # class: 'w'
+        Error: "#a40000 border:#ef2929",  # class: 'err'
+        Other: "#000000",  # class 'x'
+        Comment: "italic #8f5902",  # class: 'c'
+        Comment.Preproc: "noitalic",  # class: 'cp'
+        Keyword: "bold #004461",  # class: 'k'
+        Keyword.Constant: "bold #004461",  # class: 'kc'
+        Keyword.Declaration: "bold #004461",  # class: 'kd'
+        Keyword.Namespace: "bold #004461",  # class: 'kn'
+        Keyword.Pseudo: "bold #004461",  # class: 'kp'
+        Keyword.Reserved: "bold #004461",  # class: 'kr'
+        Keyword.Type: "bold #004461",  # class: 'kt'
+        Operator: "#582800",  # class: 'o'
+        Operator.Word: "bold #004461",  # class: 'ow' - like keywords
+        Punctuation: "bold #000000",  # class: 'p'
         # because special names such as Name.Class, Name.Function, etc.
         # are not recognized as such later in the parsing, we choose them
         # to look the same as ordinary variables.
-        Name: "#000000", # class: 'n'
-        Name.Attribute: "#c4a000", # class: 'na' - to be revised
-        Name.Builtin: "#004461", # class: 'nb'
-        Name.Builtin.Pseudo: "#3465a4", # class: 'bp'
-        Name.Class: "#000000", # class: 'nc' - to be revised
-        Name.Constant: "#000000", # class: 'no' - to be revised
-        Name.Decorator: "#888", # class: 'nd' - to be revised
-        Name.Entity: "#ce5c00", # class: 'ni'
-        Name.Exception: "bold #cc0000", # class: 'ne'
-        Name.Function: "#000000", # class: 'nf'
-        Name.Property: "#000000", # class: 'py'
-        Name.Label: "#f57900", # class: 'nl'
-        Name.Namespace: "#000000", # class: 'nn' - to be revised
-        Name.Other: "#000000", # class: 'nx'
-        Name.Tag: "bold #004461", # class: 'nt' - like a keyword
-        Name.Variable: "#000000", # class: 'nv' - to be revised
-        Name.Variable.Class: "#000000", # class: 'vc' - to be revised
-        Name.Variable.Global: "#000000", # class: 'vg' - to be revised
-        Name.Variable.Instance: "#000000", # class: 'vi' - to be revised
-
-        Number: "#990000", # class: 'm'
-
-        Literal: "#000000", # class: 'l'
-        Literal.Date: "#000000", # class: 'ld'
-
-        String: "#4e9a06", # class: 's'
-        String.Backtick: "#4e9a06", # class: 'sb'
-        String.Char: "#4e9a06", # class: 'sc'
-        String.Doc: "italic #8f5902", # class: 'sd' - like a comment
-        String.Double: "#4e9a06", # class: 's2'
-        String.Escape: "#4e9a06", # class: 'se'
-        String.Heredoc: "#4e9a06", # class: 'sh'
-        String.Interpol: "#4e9a06", # class: 'si'
-        String.Other: "#4e9a06", # class: 'sx'
-        String.Regex: "#4e9a06", # class: 'sr'
-        String.Single: "#4e9a06", # class: 's1'
-        String.Symbol: "#4e9a06", # class: 'ss'
-
-        Generic: "#000000", # class: 'g'
-        Generic.Deleted: "#a40000", # class: 'gd'
-        Generic.Emph: "italic #000000", # class: 'ge'
-        Generic.Error: "#ef2929", # class: 'gr'
-        Generic.Heading: "bold #000080", # class: 'gh'
-        Generic.Inserted: "#00A000", # class: 'gi'
-        Generic.Output: "#888", # class: 'go'
-        Generic.Prompt: "#745334", # class: 'gp'
-        Generic.Strong: "bold #000000", # class: 'gs'
-        Generic.Subheading: "bold #800080", # class: 'gu'
-        Generic.Traceback: "bold #a40000", # class: 'gt'
+        Name: "#000000",  # class: 'n'
+        Name.Attribute: "#c4a000",  # class: 'na' - to be revised
+        Name.Builtin: "#004461",  # class: 'nb'
+        Name.Builtin.Pseudo: "#3465a4",  # class: 'bp'
+        Name.Class: "#000000",  # class: 'nc' - to be revised
+        Name.Constant: "#000000",  # class: 'no' - to be revised
+        Name.Decorator: "#888",  # class: 'nd' - to be revised
+        Name.Entity: "#ce5c00",  # class: 'ni'
+        Name.Exception: "bold #cc0000",  # class: 'ne'
+        Name.Function: "#000000",  # class: 'nf'
+        Name.Property: "#000000",  # class: 'py'
+        Name.Label: "#f57900",  # class: 'nl'
+        Name.Namespace: "#000000",  # class: 'nn' - to be revised
+        Name.Other: "#000000",  # class: 'nx'
+        Name.Tag: "bold #004461",  # class: 'nt' - like a keyword
+        Name.Variable: "#000000",  # class: 'nv' - to be revised
+        Name.Variable.Class: "#000000",  # class: 'vc' - to be revised
+        Name.Variable.Global: "#000000",  # class: 'vg' - to be revised
+        Name.Variable.Instance: "#000000",  # class: 'vi' - to be revised
+        Number: "#990000",  # class: 'm'
+        Literal: "#000000",  # class: 'l'
+        Literal.Date: "#000000",  # class: 'ld'
+        String: "#4e9a06",  # class: 's'
+        String.Backtick: "#4e9a06",  # class: 'sb'
+        String.Char: "#4e9a06",  # class: 'sc'
+        String.Doc: "italic #8f5902",  # class: 'sd' - like a comment
+        String.Double: "#4e9a06",  # class: 's2'
+        String.Escape: "#4e9a06",  # class: 'se'
+        String.Heredoc: "#4e9a06",  # class: 'sh'
+        String.Interpol: "#4e9a06",  # class: 'si'
+        String.Other: "#4e9a06",  # class: 'sx'
+        String.Regex: "#4e9a06",  # class: 'sr'
+        String.Single: "#4e9a06",  # class: 's1'
+        String.Symbol: "#4e9a06",  # class: 'ss'
+        Generic: "#000000",  # class: 'g'
+        Generic.Deleted: "#a40000",  # class: 'gd'
+        Generic.Emph: "italic #000000",  # class: 'ge'
+        Generic.Error: "#ef2929",  # class: 'gr'
+        Generic.Heading: "bold #000080",  # class: 'gh'
+        Generic.Inserted: "#00A000",  # class: 'gi'
+        Generic.Output: "#888",  # class: 'go'
+        Generic.Prompt: "#745334",  # class: 'gp'
+        Generic.Strong: "bold #000000",  # class: 'gs'
+        Generic.Subheading: "bold #800080",  # class: 'gu'
+        Generic.Traceback: "bold #a40000",  # class: 'gt'
     }
diff --git a/docs/conf.py b/docs/conf.py
index 961ab8d..331870c 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,16 +14,16 @@

 import os
 import sys
-sys.path.insert(0, os.path.abspath('..'))
-sys.path.insert(0, os.path.abspath('_themes'))
+sys.path.insert(0, os.path.abspath(".."))
+sys.path.insert(0, os.path.abspath("_themes"))

 import excalibur

 # -- Project information -----------------------------------------------------

-project = 'Excalibur'
-copyright = '2018, Camelot Developers'
-author = 'Camelot Developers'
+project = "Excalibur"
+copyright = "2018, Camelot Developers"
+author = "Camelot Developers"

 # The short X.Y version
 version = excalibur.__version__
@@ -40,18 +40,17 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = [
-]
+extensions = []

 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]

 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
-source_suffix = '.rst'
+source_suffix = ".rst"

 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -63,39 +62,39 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'flask_theme_support.FlaskyStyle'
+pygments_style = "flask_theme_support.FlaskyStyle"


 # -- Options for HTML output -------------------------------------------------

 # The theme to use for HTML and HTML Help pages. See the documentation for
 # a list of builtin themes.
-html_theme = 'alabaster'
+html_theme = "alabaster"

 # Theme options are theme-specific and customize the look and feel of a theme
 # further. For a list of options available for each theme, see the
 # documentation.
 html_theme_options = {
-    'show_powered_by': False,
-    'github_user': 'camelot-dev',
-    'github_repo': 'excalibur',
-    'github_banner': True,
-    'show_related': False,
-    'note_bg': '#FFF59C'
+    "show_powered_by": False,
+    "github_user": "camelot-dev",
+    "github_repo": "excalibur",
+    "github_banner": True,
+    "show_related": False,
+    "note_bg": "#FFF59C",
 }

 # The name of an image file (relative to this directory) to use as a favicon of
 # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-html_favicon = '_static/favicon.ico'
+html_favicon = "_static/favicon.ico"

 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]

 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -105,17 +104,28 @@
 # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
 # 'searchbox.html']``.
 html_sidebars = {
-    'index': ['sidebarintro.html', 'relations.html', 'sourcelink.html',
-              'searchbox.html', 'hacks.html'],
-    '**': ['sidebarlogo.html', 'localtoc.html', 'relations.html',
-           'sourcelink.html', 'searchbox.html', 'hacks.html']
+    "index": [
+        "sidebarintro.html",
+        "relations.html",
+        "sourcelink.html",
+        "searchbox.html",
+        "hacks.html",
+    ],
+    "**": [
+        "sidebarlogo.html",
+        "localtoc.html",
+        "relations.html",
+        "sourcelink.html",
+        "searchbox.html",
+        "hacks.html",
+    ],
 }


 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = 'Excaliburdoc'
+htmlhelp_basename = "Excaliburdoc"


 # -- Options for LaTeX output ------------------------------------------------
@@ -124,15 +134,12 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -142,8 +149,13 @@
 # (source start file, target name, title,
 # author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'Excalibur.tex', 'Excalibur Documentation',
-     'Camelot Developers', 'manual'),
+    (
+        master_doc,
+        "Excalibur.tex",
+        "Excalibur Documentation",
+        "Camelot Developers",
+        "manual",
+    ),
 ]


@@ -151,10 +163,7 @@

 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'Excalibur', 'Excalibur Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "Excalibur", "Excalibur Documentation", [author], 1)]


 # -- Options for Texinfo output ----------------------------------------------
@@ -163,9 +172,15 @@
 # (source start file, target name, title, author,
 # dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'Excalibur', 'Excalibur Documentation',
-     author, 'Excalibur', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "Excalibur",
+        "Excalibur Documentation",
+        author,
+        "Excalibur",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]


@@ -184,4 +199,4 @@
 # epub_uid = ''

 # A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
diff --git a/excalibur/configuration.py b/excalibur/configuration.py
index 1a6251d..a4f23b2 100644
--- a/excalibur/configuration.py
+++ b/excalibur/configuration.py
@@ -1,7 +1,12 @@
 import os

 import six
-from backports.configparser import ConfigParser
+
+# Prefer the standard-library configparser; fall back to the backport if it is missing.
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from backports.configparser import ConfigParser


 def _read_default_config_file(file_name):
@@ -69,8 +74,9 @@ def get(self, section, key, **kwargs):

         else:
             raise ValueError(
-                "section/key [{section}/{key}] not found in"
-                " config".format(**locals())
+                "section/key [{section}/{key}] not found in" " config".format(
+                    **locals()
+                )
             )

     def read(self, filename):
diff --git a/excalibur/executors/celery_executor.py b/excalibur/executors/celery_executor.py
index d111ee2..040512c 100755
--- a/excalibur/executors/celery_executor.py
+++ b/excalibur/executors/celery_executor.py
@@ -1,6 +1,6 @@
+import sys
 import traceback
 import subprocess
-import sys

 from celery import Celery

@@ -23,7 +23,9 @@
 @app.task
 def execute_command(command):
     try:
-        subprocess.check_call(command, stderr=subprocess.STDOUT, close_fds=(sys.platform != 'win32'))
-    except Exception as e:
-        traceback.print_exc(e)
+        subprocess.check_call(
+            command, stderr=subprocess.STDOUT, close_fds=(sys.platform != "win32")
+        )
+    except Exception:
+        traceback.print_exc()

diff --git a/excalibur/executors/sequential_executor.py b/excalibur/executors/sequential_executor.py
index 4e758c2..3308323 100644
--- a/excalibur/executors/sequential_executor.py
+++ b/excalibur/executors/sequential_executor.py
@@ -1,14 +1,16 @@
+import sys
 import traceback
 import subprocess
 from concurrent.futures import ProcessPoolExecutor
-import sys

 from .base_executor import BaseExecutor


 def execute_command(command):
     try:
-        subprocess.check_call(command, stderr=subprocess.STDOUT, close_fds=(sys.platform != 'win32'))
+        subprocess.check_call(
+            command, stderr=subprocess.STDOUT, close_fds=(sys.platform != "win32")
+        )
     except FileNotFoundError:
         # TODO: PyInstaller does not package console_scripts
         # https://github.com/pyinstaller/pyinstaller/issues/305
diff --git a/excalibur/tasks.py b/excalibur/tasks.py
index 4bf92be..1dd3d9b 100644
--- a/excalibur/tasks.py
+++ b/excalibur/tasks.py
@@ -1,18 +1,21 @@
-import os
+import datetime as dt
 import glob
 import json
 import logging
-import datetime as dt
+import os
+import subprocess

+import camelot
+import pandas as pd
+from camelot.backends.ghostscript_backend import GhostscriptBackend
 from camelot.core import TableList
-from camelot.parsers import Stream, Lattice
-from camelot.ext.ghostscript import Ghostscript
+from camelot.parsers import Lattice, Stream

 from . import configuration as conf
-from .models import Job, File, Rule
+from .models import File, Job, Rule
 from .settings import Session
 from .utils.file import mkdirs
-from .utils.task import get_pages, save_page, get_file_dim, get_image_dim
+from .utils.task import get_file_dim, get_image_dim, get_pages, save_page


 def split(file_id):
@@ -40,12 +43,23 @@ def split(file_id):
             imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

             # convert single-page PDF to PNG
-            gs_call = f"-q -sDEVICE=png16m -o {imagepath} -r300 {filepath}"
-            gs_call = gs_call.encode().split()
-            null = open(os.devnull, "wb")
-            with Ghostscript(*gs_call, stdout=null):
-                pass
-            null.close()
+            try:
+                backend = GhostscriptBackend()
+                backend.convert(filepath, imagepath, 300)
+            except OSError:
+                gs_command = [
+                    "gs",
+                    "-q",
+                    "-sDEVICE=png16m",
+                    f"-o{imagepath}",
+                    "-r300",
+                    filepath,
+                ]
+                try:
+                    subprocess.run(gs_command, check=True, capture_output=True)
+                except subprocess.CalledProcessError as e:
+                    logging.error(f"Ghostscript conversion failed: {e.stderr.decode()}")
+                    raise

             filenames[page] = filename
             filepaths[page] = filepath
@@ -56,16 +70,14 @@ def split(file_id):

             lattice_areas, stream_areas = (None for i in range(2))
             # lattice
-            parser = Lattice()
-            tables = parser.extract_tables(filepath)
+            tables = camelot.read_pdf(filepath, flavor="lattice")
             if len(tables):
                 lattice_areas = []
                 for table in tables:
                     x1, y1, x2, y2 = table._bbox
                     lattice_areas.append((x1, y2, x2, y1))
             # stream
-            parser = Stream()
-            tables = parser.extract_tables(filepath)
+            tables = camelot.read_pdf(filepath, flavor="stream")
             if len(tables):
                 stream_areas = []
                 for table in tables:
@@ -107,10 +119,11 @@ def extract(job_id):
         for p in pages:
             kwargs = pages[p]
             kwargs.update(rule_options)
-            parser = (
-                Lattice(**kwargs) if flavor.lower() == "lattice" else Stream(**kwargs)
-            )
-            t = parser.extract_tables(filepaths[p])
+            kwargs["flavor"] = flavor.lower()
+            if flavor.lower() == "lattice":
+                kwargs.pop("columns", None)
+
+            t = camelot.read_pdf(filepaths[p], **kwargs, backend="poppler")
             for _t in t:
                 _t.page = int(p)
             tables.extend(t)
@@ -123,7 +136,14 @@ def extract(job_id):
             mkdirs(f_datapath)
             ext = f if f != "excel" else "xlsx"
             f_datapath = os.path.join(f_datapath, f"{froot}.{ext}")
-            tables.export(f_datapath, f=f, compress=True)
+
+            if f == "excel":
+                with pd.ExcelWriter(f_datapath) as writer:
+                    for i, table in enumerate(tables):
+                        sheet_name = f"Table_{i + 1}"
+                        table.df.to_excel(writer, sheet_name=sheet_name, index=False)
+            else:
+                tables.export(f_datapath, f=f, compress=True)

         # for render
         jsonpath = os.path.join(datapath, "json")
diff --git a/excalibur/utils/task.py b/excalibur/utils/task.py
index fda7a3c..30c00b1 100644
--- a/excalibur/utils/task.py
+++ b/excalibur/utils/task.py
@@ -1,7 +1,7 @@
 import os

 import cv2
-from PyPDF2 import PdfFileReader, PdfFileWriter
+from PyPDF2 import PdfReader, PdfWriter
 from camelot.utils import get_rotation, get_page_layout, get_text_objects


@@ -26,21 +26,21 @@ def get_pages(filename, pages, password=""):
     """
     page_numbers = []
     inputstream = open(filename, "rb")
-    infile = PdfFileReader(inputstream, strict=False)
-    N = infile.getNumPages()
+    infile = PdfReader(inputstream, strict=False)
+    N = len(infile.pages)
     if pages == "1":
         page_numbers.append({"start": 1, "end": 1})
     else:
-        if infile.isEncrypted:
+        if infile.is_encrypted:
             infile.decrypt(password)
         if pages == "all":
-            page_numbers.append({"start": 1, "end": infile.getNumPages()})
+            page_numbers.append({"start": 1, "end": len(infile.pages)})
         else:
             for r in pages.split(","):
                 if "-" in r:
                     a, b = r.split("-")
                     if b == "end":
-                        b = infile.getNumPages()
+                        b = len(infile.pages)
                     page_numbers.append({"start": int(a), "end": int(b)})
                 else:
                     page_numbers.append({"start": int(r), "end": int(r)})
@@ -52,10 +52,10 @@ def get_pages(filename, pages, password=""):


 def save_page(filepath, page_number):
-    infile = PdfFileReader(open(filepath, "rb"), strict=False)
-    page = infile.getPage(page_number - 1)
-    outfile = PdfFileWriter()
-    outfile.addPage(page)
+    infile = PdfReader(open(filepath, "rb"), strict=False)
+    page = infile.pages[page_number - 1]
+    outfile = PdfWriter()
+    outfile.add_page(page)
     outpath = os.path.join(os.path.dirname(filepath), f"page-{page_number}.pdf")
     with open(outpath, "wb") as f:
         outfile.write(f)
@@ -69,16 +69,16 @@ def save_page(filepath, page_number):
     if rotation != "":
         outpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
         os.rename(outpath, outpath_new)
-        infile = PdfFileReader(open(outpath_new, "rb"), strict=False)
-        if infile.isEncrypted:
+        infile = PdfReader(open(outpath_new, "rb"), strict=False)
+        if infile.is_encrypted:
             infile.decrypt("")
-        outfile = PdfFileWriter()
-        p = infile.getPage(0)
+        outfile = PdfWriter()
+        p = infile.pages[0]
         if rotation == "anticlockwise":
-            p.rotateClockwise(90)
+            p.rotate(90)
         elif rotation == "clockwise":
-            p.rotateCounterClockwise(90)
-        outfile.addPage(p)
+            p.rotate(-90)
+        outfile.add_page(p)
         with open(outpath, "wb") as f:
             outfile.write(f)

diff --git a/excalibur/www/static/js/job.js b/excalibur/www/static/js/job.js
index 2ce29d2..5340d3d 100644
--- a/excalibur/www/static/js/job.js
+++ b/excalibur/www/static/js/job.js
@@ -6,4 +6,4 @@ $(document).ready(function () {
     $('#download-form').append($(input));
     $('#download-form').submit();
   });
-});
\ No newline at end of file
+});
diff --git a/excalibur/www/templates/workspace.html b/excalibur/www/templates/workspace.html
index 91eb6a9..f87c471 100644
--- a/excalibur/www/templates/workspace.html
+++ b/excalibur/www/templates/workspace.html
@@ -9,7 +9,7 @@
 {% endblock %}

 {% block workspace %}
-  {% if imagepaths is not none %}
+  {% if imagepaths is not none and imagepaths|length > 0 %}