Skip to content

Commit

Permalink
Update python versions and camelot usage
Browse files: browse the repository at this point in the history
  • Loading branch information
vinayak-mehta committed Dec 25, 2024
1 parent 94aa617 commit 5aae86b
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ jobs:

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.8
- name: Set up Python 3.14
uses: actions/setup-python@v1
with:
python-version: '3.8'
python-version: '3.14'
architecture: ${{ matrix.config.python-arch }}
- name: Install Excalibur
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static-code-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
python-version: [3.14]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.12, 3.13, 3.14]

steps:
- uses: actions/checkout@v2
Expand Down
52 changes: 32 additions & 20 deletions excalibur/tasks.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import os
import datetime as dt
import glob
import json
import logging
import datetime as dt
import os
import subprocess

import camelot
import pandas as pd
from camelot.backends.ghostscript_backend import GhostscriptBackend
from camelot.core import TableList
from camelot.parsers import Stream, Lattice
from camelot.ext.ghostscript import Ghostscript
from camelot.parsers import Lattice, Stream

from . import configuration as conf
from .models import Job, File, Rule
from .models import File, Job, Rule
from .settings import Session
from .utils.file import mkdirs
from .utils.task import get_pages, save_page, get_file_dim, get_image_dim
from .utils.task import get_file_dim, get_image_dim, get_pages, save_page


def split(file_id):
Expand Down Expand Up @@ -41,12 +43,23 @@ def split(file_id):
imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

# convert single-page PDF to PNG
gs_call = f"-q -sDEVICE=png16m -o {imagepath} -r300 {filepath}"
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
with Ghostscript(*gs_call, stdout=null):
pass
null.close()
try:
backend = GhostscriptBackend()
backend.convert(filepath, imagepath, 300)
except OSError:
gs_command = [
"gs",
"-q",
"-sDEVICE=png16m",
f"-o{imagepath}",
"-r300",
filepath,
]
try:
subprocess.run(gs_command, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
logging.error(f"Ghostscript conversion failed: {e.stderr.decode()}")
raise

filenames[page] = filename
filepaths[page] = filepath
Expand All @@ -57,16 +70,14 @@ def split(file_id):

lattice_areas, stream_areas = (None for i in range(2))
# lattice
parser = Lattice()
tables = parser.extract_tables(filepath)
tables = camelot.read_pdf(filepath, flavor="lattice")
if len(tables):
lattice_areas = []
for table in tables:
x1, y1, x2, y2 = table._bbox
lattice_areas.append((x1, y2, x2, y1))
# stream
parser = Stream()
tables = parser.extract_tables(filepath)
tables = camelot.read_pdf(filepath, flavor="stream")
if len(tables):
stream_areas = []
for table in tables:
Expand Down Expand Up @@ -108,10 +119,11 @@ def extract(job_id):
for p in pages:
kwargs = pages[p]
kwargs.update(rule_options)
parser = (
Lattice(**kwargs) if flavor.lower() == "lattice" else Stream(**kwargs)
)
t = parser.extract_tables(filepaths[p])
kwargs["flavor"] = flavor.lower()
if flavor.lower() == "lattice":
kwargs.pop("columns", None)

t = camelot.read_pdf(filepaths[p], **kwargs, backend="poppler")
for _t in t:
_t.page = int(p)
tables.extend(t)
Expand Down
2 changes: 1 addition & 1 deletion excalibur/www/templates/workspace.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{% endblock %}

{% block workspace %}
{% if imagepaths is not none or or imagepaths|length == 0 %}
{% if imagepaths is not none or imagepaths|length == 0 %}
<div class="container">
<div class="row pb-4">
<div class="col-md-12">
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
readme = f.read()

requires = [
"camelot-py[cv]>=0.7.1",
"camelot-py[base]>=0.11.0",
"celery>=4.1.1",
"Click>=7.0",
"configparser>=7.1.0",
Expand Down

0 comments on commit 5aae86b

Please sign in to comment.