-
Notifications
You must be signed in to change notification settings - Fork 0
Ontology rendering #113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ontology rendering #113
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| from enum import Enum | ||
| from typing import Callable | ||
|
|
||
| import pymupdf | ||
| from pydantic import BaseModel, model_validator | ||
|
|
||
| from .ontology import Cited, PoliceReport, PoliceReportParseResult, SourceChunk | ||
|
|
||
|
|
||
class OntoPainterMark(Enum):
    """Kinds of mark that can be drawn for a field."""

    # Rectangle outline around each region of a source chunk.
    RECT = "RECT"
|
|
||
|
|
||
# Signature of a custom extractor: given the parsed report, return the cited
# values that should be painted. Used as the type of
# `OntoPainterFieldConfig.accessor`.
FieldAccessor = Callable[[PoliceReport], list[Cited]]
|
|
||
|
|
||
class OntoPainterFieldConfig(BaseModel):
    """Configuration for painting one report field.

    The values to paint come from exactly one source: the report attribute
    named by `field`, or the result of calling `accessor` on the report.
    """

    # Name of the report attribute to read (mutually exclusive with `accessor`).
    field: str | None = None  # TODO - validate against PoliceReport fields
    # Optional text prefix for the label drawn next to each mark.
    label: str | None = None
    # Which kind of mark to draw.
    mark: OntoPainterMark
    # Colours as 3-tuples of floats.
    fill: tuple[float, float, float] | None = None
    stroke: tuple[float, float, float] | None = None
    stroke_width: float = 0
    # Custom extractor (mutually exclusive with `field`).
    accessor: FieldAccessor | None = None

    @model_validator(mode="after")
    def validate_field_accessor(self) -> "OntoPainterFieldConfig":
        """Require exactly one of `field` and `accessor`.

        Raises:
            ValueError: If both are set or both are missing.
        """
        has_field = self.field is not None
        has_accessor = self.accessor is not None
        if has_field == has_accessor:
            raise ValueError("Either field or accessor must be set, but not both.")
        return self

    def get_value(self, report: PoliceReport) -> list[Cited]:
        """Return the cited values this config selects from `report`.

        Reads the attribute named by `field` when it is set; otherwise calls
        `accessor`. The raw result is normalized to a list: `None` becomes an
        empty list, a single `Cited` becomes a one-item list, and `None`
        entries are dropped from a list.

        Raises:
            ValueError: If neither source is usable, or the raw value is not
                `None`, a `Cited`, or a list.
        """
        if self.field is not None:
            raw = getattr(report, self.field)
        elif self.accessor:
            raw = self.accessor(report)
        else:
            raise ValueError("Accessor is required if field is not set.")

        if raw is None:
            return []
        if isinstance(raw, Cited):
            return [raw]
        if isinstance(raw, list):
            return [item for item in raw if item is not None]
        raise ValueError(f"Unexpected type: {type(raw)}")
|
|
||
|
|
||
| class OntoPainter(BaseModel): | ||
| fields: list[OntoPainterFieldConfig] | ||
|
|
||
| def paint( | ||
| self, | ||
| pdf: str | pymupdf.Document, | ||
| parse_result: PoliceReportParseResult, | ||
| pages: str | None = None, | ||
| ) -> pymupdf.Document: | ||
| """Paint a document annotated with the parse result.""" | ||
| # 1. Load the requested pages from the input path / doc | ||
| doc = self._load_pdf(pdf, pages) | ||
|
|
||
| # 2. Loop over field configs and paint each field. | ||
| for field_config in self.fields: | ||
| field_values = field_config.get_value(parse_result.report) | ||
| for i, field_value in enumerate(field_values): | ||
| for j, chunk_id in enumerate(field_value.ids): | ||
| chunk = parse_result.chunks[chunk_id] | ||
| self._paint_field( | ||
| doc, | ||
| field_config, | ||
| chunk, | ||
| label=f"{field_config.label} {i + 1}-{j + 1}", | ||
| ) | ||
|
|
||
| return doc | ||
|
|
||
| def _paint_field( | ||
| self, | ||
| doc: pymupdf.Document, | ||
| field_config: OntoPainterFieldConfig, | ||
| chunk: SourceChunk, | ||
| label: str | None = None, | ||
| ) -> None: | ||
| """Paint a field on a document.""" | ||
| match field_config.mark: | ||
| case OntoPainterMark.RECT: | ||
| self._paint_rect(doc, field_config, chunk) | ||
| case _: | ||
| raise ValueError(f"Unsupported mark: {field_config.mark}") | ||
| if label: | ||
| page = doc.load_page(chunk.regions[0].page) | ||
| page_width, page_height = page.mediabox[2:] | ||
| scaled_points = [ | ||
| (p[0] * page_width, p[1] * page_height) for p in chunk.regions[0].points | ||
| ] | ||
| x, y = scaled_points[0] | ||
| # Offset to avoid overlapping with bounding rectangle | ||
| y -= 2 | ||
| page.insert_text( | ||
| (x, y), label, fontsize=5, fill=field_config.stroke, color=(1, 1, 1) | ||
| ) | ||
|
|
||
| def _paint_rect( | ||
| self, | ||
| doc: pymupdf.Document, | ||
| field_config: OntoPainterFieldConfig, | ||
| chunk: SourceChunk, | ||
| ) -> None: | ||
| """Paint a rectangle on a document.""" | ||
| for region in chunk.regions: | ||
| # The coordinates come normalized in (0, 1) space. Project into page coords. | ||
| page = doc.load_page(chunk.regions[0].page) | ||
| page_width, page_height = page.mediabox[2:] | ||
| shape = page.new_shape() | ||
| scaled_points = [ | ||
| (p[0] * page_width, p[1] * page_height) for p in region.points | ||
| ] | ||
| shape.draw_rect(pymupdf.Quad(*scaled_points).rect) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Suggested FixClamp Prompt for AI Agent |
||
| shape.finish(color=field_config.stroke, width=field_config.stroke_width) | ||
| shape.commit() | ||
|
|
||
| def _load_pdf( | ||
| self, doc: str | pymupdf.Document, pages: str | None = None | ||
| ) -> pymupdf.Document: | ||
| """Load a PDF document.""" | ||
| if isinstance(doc, str): | ||
| with open(doc, "rb") as f: | ||
| pdf_doc = pymupdf.open(f) | ||
| else: | ||
| pdf_doc = doc | ||
|
|
||
| filter_pages = _parse_pages_range(pages) | ||
| if filter_pages: | ||
| pdf_doc.select(filter_pages) | ||
|
|
||
| return pdf_doc | ||
|
|
||
|
|
||
| def _parse_pages_range(pages: str | None = None) -> list[int] | None: | ||
| """Parse page range specification as a list of page numbers. | ||
|
|
||
| If no spec is given, return None. | ||
|
|
||
| Spec looks like: | ||
| 1 Single page | ||
| 1-3 Range of pages | ||
| 1,2,3 List of pages | ||
| 1-3,5 Range and list of pages | ||
|
|
||
| Args: | ||
| pages: The page range specification, 1-indexed. | ||
|
|
||
| Returns: | ||
| A list of page numbers (0-indexed). | ||
| """ | ||
| if pages is None: | ||
| return None | ||
| page_list = list[int]() | ||
| for segment in pages.split(","): | ||
| if "-" in segment: | ||
| start, end = segment.split("-") | ||
| page_list.extend(range(int(start.strip()), int(end.strip()) + 1)) | ||
| else: | ||
| page_list.append(int(segment.strip())) | ||
| # Clean up duplicates and sort. | ||
| return sorted([x - 1 for x in set(page_list)]) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
class Palette:
    """Named RGB colours, each a 3-tuple with components between 0 and 1.

    Every hue comes in three shades, numbered 1 to 3. Each entry is a
    distinct colour, so two fields given different palette names can always
    be told apart.
    """

    Red1 = (0.9, 0.2, 0.1)
    Red2 = (0.8, 0.1, 0.0)
    Red3 = (0.7, 0.0, 0.0)
    Orange1 = (0.9, 0.5, 0.0)
    Orange2 = (0.8, 0.4, 0.0)
    Orange3 = (0.7, 0.3, 0.0)
    Yellow1 = (0.9, 0.9, 0.0)
    Yellow2 = (0.8, 0.8, 0.0)
    Yellow3 = (0.7, 0.7, 0.0)
    Green1 = (0.2, 0.9, 0.1)
    Green2 = (0.1, 0.8, 0.0)
    Green3 = (0.0, 0.7, 0.0)
    Blue1 = (0, 0.5, 1)
    Blue2 = (0.0, 0.4, 0.8)
    Blue3 = (0.0, 0.3, 0.7)
    Purple1 = (0.9, 0.0, 0.9)
    Purple2 = (0.8, 0.0, 0.8)
    Purple3 = (0.7, 0.0, 0.7)
    # Pink previously repeated the Purple values exactly; these are lighter,
    # redder tones so the two hues are distinguishable.
    Pink1 = (1.0, 0.4, 0.7)
    Pink2 = (0.9, 0.3, 0.6)
    Pink3 = (0.8, 0.2, 0.5)
    Brown1 = (0.5, 0.25, 0.0)
    Brown2 = (0.4, 0.2, 0.0)
    Brown3 = (0.3, 0.15, 0.0)
    Cyan1 = (0.0, 0.9, 0.9)
    Cyan2 = (0.0, 0.8, 0.8)
    Cyan3 = (0.0, 0.7, 0.7)
    # Lime previously repeated the Yellow values exactly; these are shifted
    # toward green.
    Lime1 = (0.6, 0.9, 0.0)
    Lime2 = (0.5, 0.8, 0.0)
    Lime3 = (0.4, 0.7, 0.0)
    Maroon1 = (0.5, 0.0, 0.0)
    Maroon2 = (0.4, 0.0, 0.0)
    Maroon3 = (0.3, 0.0, 0.0)
    Gray1 = (0.5, 0.5, 0.5)
    Gray2 = (0.4, 0.4, 0.4)
    Gray3 = (0.3, 0.3, 0.3)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from abc import ABC, abstractmethod | ||
| from typing import Generic, TypeVar | ||
|
|
||
| from ..common.context import Context | ||
| from ..common.file import MemoryFile | ||
| from ..common.preprocess import PreprocessMixin | ||
|
|
||
| T = TypeVar("T") | ||
|
|
||
|
|
||
class BasePainter(ABC, Generic[T], PreprocessMixin[T]):
    """Abstract pipeline step that annotates the original input file.

    Subclasses implement `paint`; calling the instance preprocesses the
    piped value and passes it to `paint` along with the original file.
    """

    def __call__(self, file: MemoryFile, context: Context) -> MemoryFile:
        """Paint a file, returning an annotated version.

        `file` is the primary pipe value (e.g. a serialized ontology result).
        The original input file is read from `context.input_file`.
        """
        # Turn the piped value into the data `paint` expects.
        parsed = self.preprocess(file)
        original = context.input_file
        return self.paint(original, parsed)

    @abstractmethod
    def paint(self, original: MemoryFile, data: T) -> MemoryFile:
        """Paint the input file using current analysis."""
        ...
Uh oh!
There was an error while loading. Please reload this page.