Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git
36 changes: 26 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,33 @@ LABEL \
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
org.label-schema.vcs-ref=$VCS_REF \
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
org.label-schema.build-date=$BUILD_DATE
org.label-schema.build-date=$BUILD_DATE \
org.opencontainers.image.vendor="qurator" \
org.opencontainers.image.title="dinglehopper" \
org.opencontainers.image.description="An OCR evaluation tool" \
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
org.opencontainers.image.revision=$VCS_REF \
org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.base.name=ocrd/core

ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources

WORKDIR /build/dinglehopper
COPY pyproject.toml .
COPY src/dinglehopper/ocrd-tool.json .
COPY src ./src
COPY requirements.txt .
COPY README.md .
COPY Makefile .
RUN make install
RUN rm -rf /build/dinglehopper
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
RUN make install && rm -rf /build/dinglehopper

WORKDIR /data
VOLUME ["/data"]
VOLUME /data
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
PYTHON = python3
PIP = pip3
PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv

DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
DOCKER_TAG = ocrd/dinglehopper

help:
Expand All @@ -16,11 +17,17 @@ help:
install:
$(PIP) install .

install-dev:
$(PIP) install -e .

test:
pytest $(PYTEST_ARGS)

docker:
docker build \
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

.PHONY: help install docker
.PHONY: help install install-dev test docker
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ uniseg >= 0.9.1
numpy
colorama
MarkupSafe
ocrd >= 2.65.0
ocrd >= 3.3.0
attrs
multimethod >= 1.3
tqdm
Expand Down
10 changes: 3 additions & 7 deletions src/dinglehopper/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
{
"version": "0.9.7",
"git_url": "https://github.com/qurator-spk/dinglehopper",
"dockerhub": "ocrd/dinglehopper",
"tools": {
"ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper",
"input_file_grp_cardinality": 2,
"output_file_grp_cardinality": 1,
"description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"categories": [
"Quality assurance"
],
Expand Down
109 changes: 51 additions & 58 deletions src/dinglehopper/ocrd_cli.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,76 @@
import json
from functools import cached_property
import os
from typing import Optional

import click
import importlib_resources
from ocrd_models import OcrdFileType
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
from ocrd_utils import make_file_id

from .cli import process as cli_process

# Tool description parsed once at import time from the "ocrd-tool.json"
# resource that is looked up via this module's import name.
# Decoding is strict UTF-8: invalid bytes raise instead of being replaced.
OCRD_TOOL = json.loads(
importlib_resources.files(__name__)
.joinpath("ocrd-tool.json")
.read_text(encoding="utf-8", errors="strict")
)


@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
    # Command-line entry point: forwards every option collected by the
    # ``ocrd_cli_options`` decorator, unchanged, to the OCR-D wrapper, which
    # is handed the processor class defined in this module.
    # (Kept as a comment rather than a docstring so that the text click
    # derives from the function's docstring is not altered.)
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)


class OcrdDinglehopperEvaluate(Processor):
    """OCR-D processor that evaluates OCR text against ground truth.

    For every page it receives one ground-truth file and one OCR file, passes
    their local paths to ``cli_process`` and registers the resulting HTML and
    JSON reports in the workspace under the output file group.
    """

    @cached_property
    def executable(self):
        """Return the executable name of this processor."""
        return 'ocrd-dinglehopper'

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """Evaluate one page and add its reports to the workspace.

        Args:
            *input_files: Exactly two entries, the ground-truth file followed
                by the OCR file. Either entry may be ``None`` when the page
                is missing in that file group.

        Raises:
            ValueError: If the number of input files is not two (raised by
                the tuple unpacking; this is deliberate).
            MissingInputFile: If an input has no local copy and
                ``config.OCRD_MISSING_INPUT`` is ``'ABORT'``.
            FileExistsError: If a report with the same file ID is already in
                the METS and ``config.OCRD_EXISTING_OUTPUT`` is not
                ``'OVERWRITE'``.
        """
        # Imported locally because the module-level imports do not provide
        # these two names; without them the checks below raise NameError.
        from ocrd.processor.base import MissingInputFile
        from ocrd_utils import config

        assert self.parameter
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]

        # wrong number of inputs: let fail
        gt_file, ocr_file = input_files
        # missing on either side: skip (zip_input_files already warned)
        if not gt_file or not ocr_file:
            return
        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
        # abort or silently skip the page, depending on configuration
        for input_file in (gt_file, ocr_file):
            if input_file.local_filename:
                continue
            if config.OCRD_MISSING_INPUT == 'ABORT':
                raise MissingInputFile(
                    input_file.fileGrp, input_file.pageId, input_file.mimetype
                )
            return

        page_id = gt_file.pageId

        # The report file ID is derived from the OCR file and the output group.
        file_id = make_file_id(ocr_file, self.output_file_grp)
        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            file_id,
            self.output_file_grp,
            metrics=metrics,
            textequiv_level=textequiv_level,
        )

        # Add reports to the workspace
        for report_suffix, mimetype in [
            [".html", "text/html"],
            [".json", "application/json"],
        ]:
            output_file_id = file_id + report_suffix
            # Refuse to clobber an existing METS entry unless overwriting
            # has been explicitly enabled.
            output_file = next(
                self.workspace.mets.find_files(ID=output_file_id), None
            )
            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
                raise FileExistsError(
                    f"A file with ID=={output_file_id} already exists "
                    f"{output_file} and neither force nor ignore are set"
                )
            # NOTE(review): cli_process is handed self.output_file_grp as a
            # separate argument, presumably the folder the reports are
            # written to; confirm whether local_filename should include that
            # directory as well.
            self.workspace.add_file(
                file_id=output_file_id,
                file_grp=self.output_file_grp,
                page_id=page_id,
                mimetype=mimetype,
                local_filename=file_id + report_suffix,
            )


# Run the command-line entry point when this module is executed as a script.
if __name__ == "__main__":
    ocrd_dinglehopper()