diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 2bcc34d76..c77919559 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -669,6 +669,8 @@ def _find_page_size( if not isinstance(item, DocItem): continue for provenance in item.prov: + if not isinstance(provenance, ProvenanceItem): + continue bbox = provenance.bbox left = min(left, bbox.l) if left != -1 else bbox.l right = max(right, bbox.r) if right != -1 else bbox.r diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py index 2a7d02ce7..e61cd0f34 100644 --- a/docling/backend/webvtt_backend.py +++ b/docling/backend/webvtt_backend.py @@ -1,8 +1,9 @@ +import copy import logging -import re +from dataclasses import dataclass, field from io import BytesIO from pathlib import Path -from typing import Annotated, ClassVar, Literal, Optional, Union, cast +from typing import Literal, Optional, Union from docling_core.types.doc import ( ContentLayer, @@ -10,12 +11,20 @@ DoclingDocument, DocumentOrigin, Formatting, - GroupLabel, - NodeItem, + ProvenanceTrack, ) -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator -from pydantic.types import StringConstraints -from typing_extensions import Self, override +from docling_core.types.doc.webvtt import ( + WebVTTCueBoldSpan, + WebVTTCueComponent, + WebVTTCueComponentWithTerminator, + WebVTTCueItalicSpan, + WebVTTCueLanguageSpan, + WebVTTCueTextSpan, + WebVTTCueUnderlineSpan, + WebVTTCueVoiceSpan, + WebVTTFile, +) +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -24,409 +33,29 @@ _log = logging.getLogger(__name__) -class _WebVTTTimestamp(BaseModel): - """Model representing a WebVTT timestamp. - - A WebVTT timestamp is always interpreted relative to the current playback position - of the media data that the WebVTT file is to be synchronized with. - """ - - model_config = ConfigDict(regex_engine="python-re") - - raw: Annotated[ - str, - Field( - description="A representation of the WebVTT Timestamp as a single string" - ), - ] - - _pattern: ClassVar[re.Pattern] = re.compile( - r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$" - ) - _hours: int - _minutes: int - _seconds: int - _millis: int - - @model_validator(mode="after") - def validate_raw(self) -> Self: - m = self._pattern.match(self.raw) - if not m: - raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}") - self._hours = int(m.group(1)) if m.group(1) else 0 - self._minutes = int(m.group(2)) - self._seconds = int(m.group(3)) - self._millis = int(m.group(4)) - - if self._minutes < 0 or self._minutes > 59: - raise ValueError("Minutes must be between 0 and 59") - if self._seconds < 0 or self._seconds > 59: - raise ValueError("Seconds must be between 0 and 59") - - return self - - @property - def seconds(self) -> float: - """A representation of the WebVTT Timestamp in seconds""" - return ( - self._hours * 3600 - + self._minutes * 60 - + self._seconds - + self._millis / 1000.0 - ) - - @override - def __str__(self) -> str: - return self.raw - - -_WebVTTCueIdentifier = Annotated[ - str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$") -] - - -class _WebVTTCueTimings(BaseModel): - """Model representating WebVTT cue timings.""" - - start: Annotated[ - _WebVTTTimestamp, Field(description="Start time offset of the cue") - ] - end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")] - - @model_validator(mode="after") - def check_order(self) -> Self: - if self.start and self.end: - if self.end.seconds <= self.start.seconds: - raise ValueError("End timestamp must be greater than start timestamp") - return self - - @override - def __str__(self): - return f"{self.start} --> {self.end}" - - -class _WebVTTCueTextSpan(BaseModel): - """Model representing a WebVTT cue text span.""" - +@dataclass +class AnnotatedText: text: str - span_type: Literal["text"] = "text" - - @field_validator("text", mode="after") - @classmethod - def validate_text(cls, value: str) -> str: - if any(ch in value for ch in {"\n", "\r", "&", "<"}): - raise ValueError("Cue text span contains invalid characters") - if len(value) == 0: - raise ValueError("Cue text span cannot be empty") - return value - - @override - def __str__(self): - return self.text - - -class _WebVTTCueVoiceSpan(BaseModel): - """Model representing a WebVTT cue voice span.""" - - annotation: Annotated[ - str, - Field( - description=( - "Cue span start tag annotation text representing the name of thevoice" - ) - ), - ] - classes: Annotated[ - list[str], - Field(description="List of classes representing the cue span's significance"), - ] = [] - components: Annotated[ - list["_WebVTTCueComponent"], - Field(description="The components representing the cue internal text"), - ] = [] - span_type: Literal["v"] = "v" - - @field_validator("annotation", mode="after") - @classmethod - def validate_annotation(cls, value: str) -> str: - if any(ch in value for ch in {"\n", "\r", "&", ">"}): - raise ValueError( - "Cue span start tag annotation contains invalid characters" - ) - if not value: - raise ValueError("Cue text span cannot be empty") - return value - - @field_validator("classes", mode="after") - @classmethod - def validate_classes(cls, value: list[str]) -> list[str]: - for item in value: - if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}): - raise ValueError( - "A cue span start tag class contains invalid characters" - ) - if not item: - raise ValueError("Cue span start tag classes cannot be empty") - return value - - @override - def __str__(self): - tag = f"v.{'.'.join(self.classes)}" if self.classes else "v" - inner = "".join(str(span) for span in self.components) - return f"<{tag} {self.annotation}>{inner}" - - -class _WebVTTCueClassSpan(BaseModel): - span_type: Literal["c"] = "c" - components: list["_WebVTTCueComponent"] - - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" - - -class _WebVTTCueItalicSpan(BaseModel): - span_type: Literal["i"] = "i" - components: list["_WebVTTCueComponent"] - - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" - - -class _WebVTTCueBoldSpan(BaseModel): - span_type: Literal["b"] = "b" - components: list["_WebVTTCueComponent"] - - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" - - -class _WebVTTCueUnderlineSpan(BaseModel): - span_type: Literal["u"] = "u" - components: list["_WebVTTCueComponent"] - - @override - def __str__(self): - inner = "".join(str(span) for span in self.components) - return f"{inner}" - - -_WebVTTCueComponent = Annotated[ - Union[ - _WebVTTCueTextSpan, - _WebVTTCueClassSpan, - _WebVTTCueItalicSpan, - _WebVTTCueBoldSpan, - _WebVTTCueUnderlineSpan, - _WebVTTCueVoiceSpan, - ], - Field(discriminator="span_type", description="The WebVTT cue component"), -] - - -class _WebVTTCueBlock(BaseModel): - """Model representing a WebVTT cue block. - - The optional WebVTT cue settings list is not supported. - The cue payload is limited to the following spans: text, class, italic, bold, - underline, and voice. - """ - - model_config = ConfigDict(regex_engine="python-re") - - identifier: Optional[_WebVTTCueIdentifier] = Field( - None, description="The WebVTT cue identifier" + voice: Optional[str] = None + formatting: Optional[Formatting] = None + classes: dict[Literal["b", "u", "i", "lang", "v"], list[str]] = field( + default_factory=dict ) - timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")] - payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")] - - _pattern_block: ClassVar[re.Pattern] = re.compile( - r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>" - ) - _pattern_voice_tag: ClassVar[re.Pattern] = re.compile( - r"^\.[^\t\n\r &<>]+)?" # zero or more classes - r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation - ) - - @field_validator("payload", mode="after") - @classmethod - def validate_payload(cls, payload): - for voice in payload: - if "-->" in str(voice): - raise ValueError("Cue payload must not contain '-->'") - return payload - - @classmethod - def parse(cls, raw: str) -> "_WebVTTCueBlock": - lines = raw.strip().splitlines() - if not lines: - raise ValueError("Cue block must have at least one line") - identifier: Optional[_WebVTTCueIdentifier] = None - timing_line = lines[0] - if "-->" not in timing_line and len(lines) > 1: - identifier = timing_line - timing_line = lines[1] - cue_lines = lines[2:] - else: - cue_lines = lines[1:] - - if "-->" not in timing_line: - raise ValueError("Cue block must contain WebVTT cue timings") - - start, end = [t.strip() for t in timing_line.split("-->")] - end = re.split(" |\t", end)[0] # ignore the cue settings list - timings: _WebVTTCueTimings = _WebVTTCueTimings( - start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end) + lang: set[str] = field(default_factory=set) + + def copy_meta(self, text): + return AnnotatedText( + text=text, + voice=self.voice, + formatting=self.formatting.model_copy() if self.formatting else None, + classes=copy.deepcopy(self.classes), + lang=self.lang.copy(), ) - cue_text = " ".join(cue_lines).strip() - if cue_text.startswith("" not in cue_text: - # adding close tag for cue voice spans without end tag - cue_text += "" - - stack: list[list[_WebVTTCueComponent]] = [[]] - tag_stack: list[Union[str, tuple]] = [] - - pos = 0 - matches = list(cls._pattern_block.finditer(cue_text)) - i = 0 - while i < len(matches): - match = matches[i] - if match.start() > pos: - stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()])) - tag = match.group(0) - - if tag.startswith(("", "", "", "")): - tag_type = tag[1:2] - tag_stack.append(tag_type) - stack.append([]) - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueItalicSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueBoldSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueUnderlineSpan(components=children)) - tag_stack.pop() - elif tag == "": - children = stack.pop() - stack[-1].append(_WebVTTCueClassSpan(components=children)) - tag_stack.pop() - elif tag.startswith("")) - else: - parts.append(str(span)) - - return "".join(parts) - - -class _WebVTTFile(BaseModel): - """A model representing a WebVTT file.""" - - cue_blocks: list[_WebVTTCueBlock] - - @staticmethod - def verify_signature(content: str) -> bool: - if not content: - return False - elif len(content) == 6: - return content == "WEBVTT" - elif len(content) > 6 and content.startswith("WEBVTT"): - return content[6] in (" ", "\t", "\n") - else: - return False - - @classmethod - def parse(cls, raw: str) -> "_WebVTTFile": - # Normalize newlines to LF - raw = raw.replace("\r\n", "\n").replace("\r", "\n") - - # Check WebVTT signature - if not cls.verify_signature(raw): - raise ValueError("Invalid WebVTT file signature") - - # Strip "WEBVTT" header line - lines = raw.split("\n", 1) - body = lines[1] if len(lines) > 1 else "" - - # Remove NOTE/STYLE/REGION blocks - body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE) - body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE) - - # Split into cue blocks - raw_blocks = re.split(r"\n\s*\n", body.strip()) - cues: list[_WebVTTCueBlock] = [] - for block in raw_blocks: - try: - cues.append(_WebVTTCueBlock.parse(block)) - except ValueError as e: - _log.warning(f"Failed to parse cue block:\n{block}\n{e}") - - return cls(cue_blocks=cues) - - def __iter__(self): - return iter(self.cue_blocks) - def __getitem__(self, idx): - return self.cue_blocks[idx] - def __len__(self): - return len(self.cue_blocks) +@dataclass +class AnnotatedPar: + items: list[AnnotatedText] class WebVTTDocumentBackend(DeclarativeDocumentBackend): @@ -458,7 +87,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): @override def is_valid(self) -> bool: - return _WebVTTFile.verify_signature(self.content) + return WebVTTFile.verify_signature(self.content) @classmethod @override @@ -477,36 +106,18 @@ def supported_formats(cls) -> set[InputFormat]: return {InputFormat.VTT} @staticmethod - def _add_text_from_component( - doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem] + def _add_classes( + item: AnnotatedText, + key: Literal["b", "u", "i", "lang", "v"], + classes: list[str], ) -> None: - """Adds a TextItem to a document by extracting text from a cue span component. - - TODO: address nesting - """ - formatting = Formatting() - text = "" - if isinstance(item, _WebVTTCueItalicSpan): - formatting.italic = True - elif isinstance(item, _WebVTTCueBoldSpan): - formatting.bold = True - elif isinstance(item, _WebVTTCueUnderlineSpan): - formatting.underline = True - if isinstance(item, _WebVTTCueTextSpan): - text = item.text - else: - # TODO: address nesting - text = "".join( - [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)] - ) - if text := text.strip(): - doc.add_text( - label=DocItemLabel.TEXT, - text=text, - parent=parent, - content_layer=ContentLayer.BODY, - formatting=formatting, - ) + if not classes: + return + + bucket = item.classes.setdefault(key, []) + for cls in classes: + if cls not in bucket: + bucket.append(cls) @override def convert(self) -> DoclingDocument: @@ -521,52 +132,115 @@ def convert(self) -> DoclingDocument: ) doc = DoclingDocument(name=self.file.stem or "file", origin=origin) - vtt: _WebVTTFile = _WebVTTFile.parse(self.content) - for block in vtt.cue_blocks: - block_group = doc.add_group( - label=GroupLabel.SECTION, - name="WebVTT cue block", - parent=None, - content_layer=ContentLayer.BODY, - ) - if block.identifier: - doc.add_text( - label=DocItemLabel.TEXT, - text=str(block.identifier), - parent=block_group, - content_layer=ContentLayer.BODY, + vtt: WebVTTFile = WebVTTFile.parse(self.content) + cue_text: list[AnnotatedPar] = [] + parents: list[AnnotatedText] = [] + + def _extract_components( + payload: list[WebVTTCueComponentWithTerminator], + ) -> None: + nonlocal cue_text, parents + if not cue_text: + cue_text.append(AnnotatedPar(items=[])) + par = cue_text[-1] + for comp in payload: + item: AnnotatedText = ( + parents[-1].copy_meta("") if parents else AnnotatedText(text="") ) + component: WebVTTCueComponent = comp.component + if isinstance(component, WebVTTCueTextSpan): + item.text = component.text + par.items.append(item) + else: + # configure metadata based on span type + if isinstance(component, WebVTTCueBoldSpan): + item.formatting = item.formatting or Formatting() + item.formatting.bold = True + self._add_classes(item, "b", component.start_tag.classes) + + elif isinstance(component, WebVTTCueItalicSpan): + item.formatting = item.formatting or Formatting() + item.formatting.italic = True + self._add_classes(item, "i", component.start_tag.classes) + + elif isinstance(component, WebVTTCueUnderlineSpan): + item.formatting = item.formatting or Formatting() + item.formatting.underline = True + self._add_classes(item, "u", component.start_tag.classes) + + elif isinstance(component, WebVTTCueLanguageSpan): + item.lang.add(component.start_tag.annotation) + self._add_classes(item, "lang", component.start_tag.classes) + + elif isinstance(component, WebVTTCueVoiceSpan): + # voice spans cannot be embedded + item.voice = component.start_tag.annotation + self._add_classes(item, "v", component.start_tag.classes) + + parents.append(item) + _extract_components(component.internal_text.components) + parents.pop() + + if comp.terminator is not None: + cue_text.append(AnnotatedPar(items=[])) + par = cue_text[-1] + + def _add_text_item( + text: str, + formatting: Optional[Formatting], + item: AnnotatedText, + parent=None, + ): + languages = list(item.lang) if item.lang else None + classes = ( + [".".join([k, *v]) for k, v in item.classes.items()] + if item.classes + else None + ) + + track = ProvenanceTrack( + start_time=block.timings.start.seconds, + end_time=block.timings.end.seconds, + identifier=identifier, + languages=languages, + classes=classes, + voice=item.voice or None, + ) + doc.add_text( label=DocItemLabel.TEXT, - text=str(block.timings), - parent=block_group, + text=text, content_layer=ContentLayer.BODY, + prov=track, + formatting=formatting, + parent=parent, ) - for cue_span in block.payload: - if isinstance(cue_span, _WebVTTCueVoiceSpan): - voice_group = doc.add_group( - label=GroupLabel.INLINE, - name="WebVTT cue voice span", - parent=block_group, - content_layer=ContentLayer.BODY, - ) - voice = cue_span.annotation - if classes := cue_span.classes: - voice += f" ({', '.join(classes)})" - voice += ": " - doc.add_text( - label=DocItemLabel.TEXT, - text=voice, - parent=voice_group, - content_layer=ContentLayer.BODY, + + for block in vtt.cue_blocks: + cue_text = [] + parents = [] + identifier = str(block.identifier) if block.identifier else None + _extract_components(block.payload) + for par in cue_text: + if not par.items: + continue + if len(par.items) == 1: + item = par.items[0] + _add_text_item( + text=item.text, + formatting=item.formatting, + item=item, ) - for item in cue_span.components: - WebVTTDocumentBackend._add_text_from_component( - doc, item, voice_group - ) else: - WebVTTDocumentBackend._add_text_from_component( - doc, cue_span, block_group + group = doc.add_inline_group( + "WebVTT cue span", content_layer=ContentLayer.BODY ) + for item in par.items: + _add_text_item( + text=item.text, + formatting=item.formatting, + item=item, + parent=group, + ) return doc diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py index db73db8db..70434fd8d 100644 --- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py @@ -12,8 +12,7 @@ from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union, cast -from docling_core.types.doc import DoclingDocument -from docling_core.types.doc.document import DocTagsDocument +from docling_core.types.doc import DoclingDocument, DocTagsDocument, ProvenanceItem from PIL import Image as PILImage if TYPE_CHECKING: @@ -371,13 +370,17 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: page_map = {p.page_no: p for p in conv_res.pages} scale = self.pipeline_options.images_scale for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: + if ( + not isinstance(element, DocItem) + or not element.prov + or not isinstance(prov := element.prov[0], ProvenanceItem) + ): continue if ( isinstance(element, PictureItem) and self.pipeline_options.generate_picture_images ): - page_no = element.prov[0].page_no + page_no = prov.page_no page = page_map.get(page_no) if page is None: _log.warning( @@ -387,10 +390,8 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: assert page.size is not None assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) + crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin( + page_height=page.size.height * scale ) cropped_im = page.image.crop(crop_bbox.as_tuple()) diff --git a/docling/models/base_model.py b/docling/models/base_model.py index c69b5018b..dae4ee92d 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -10,6 +10,7 @@ DoclingDocument, NodeItem, PictureItem, + ProvenanceItem, ) from PIL.Image import Image from typing_extensions import TypeVar @@ -199,6 +200,8 @@ def prepare_element( return None # Crop the image form the page + if not isinstance(element.prov[0], ProvenanceItem): + return None element_prov = element.prov[0] bbox = element_prov.bbox width = bbox.r - bbox.l diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py index 055c74b1f..3643bd9ff 100644 --- a/docling/models/picture_description_base_model.py +++ b/docling/models/picture_description_base_model.py @@ -7,6 +7,7 @@ DoclingDocument, NodeItem, PictureItem, + ProvenanceItem, ) from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc PictureDescriptionData, @@ -64,8 +65,8 @@ def __call__( assert isinstance(el.item, PictureItem) describe_image = True # Don't describe the image if it's smaller than the threshold - if len(el.item.prov) > 0: - prov = el.item.prov[0] # PictureItems have at most a single provenance + if el.item.prov and isinstance(prov := el.item.prov[0], ProvenanceItem): + # PictureItems have at most a single provenance page = doc.pages.get(prov.page_no) if page is not None: page_area = page.size.width * page.size.height diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py index 2bb94e42a..8b2f47092 100644 --- a/docling/pipeline/asr_pipeline.py +++ b/docling/pipeline/asr_pipeline.py @@ -1,47 +1,35 @@ import logging -import os -import re import sys import tempfile from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, List, Optional, Union, cast - -from docling_core.types.doc import DoclingDocument, DocumentOrigin - -# import whisper # type: ignore -# import librosa -# import numpy as np -# import soundfile as sf # type: ignore -from docling_core.types.doc.labels import DocItemLabel -from pydantic import BaseModel, Field, validator +from typing import Optional, Union + +from docling_core.types.doc import ( + ContentLayer, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + ProvenanceTrack, +) +from pydantic import BaseModel, Field from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.noop_backend import NoOpBackend - -# from pydub import AudioSegment # type: ignore -# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline from docling.datamodel.accelerator_options import ( AcceleratorOptions, ) from docling.datamodel.base_models import ( ConversionStatus, - FormatToMimeType, ) -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AsrPipelineOptions, ) from docling.datamodel.pipeline_options_asr_model import ( InlineAsrMlxWhisperOptions, InlineAsrNativeWhisperOptions, - # AsrResponseFormat, - InlineAsrOptions, ) -from docling.datamodel.pipeline_options_vlm_model import ( - InferenceFramework, -) -from docling.datamodel.settings import settings from docling.pipeline.base_pipeline import BasePipeline from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import ProfilingScope, TimeRecorder @@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult: ) for citem in conversation: + prov: ProvenanceTrack = ProvenanceTrack( + start_time=citem.start_time, + end_time=citem.end_time, + voice=citem.speaker, + ) conv_res.document.add_text( - label=DocItemLabel.TEXT, text=citem.to_string() + label=DocItemLabel.TEXT, + text=citem.text, + prov=prov, + content_layer=ContentLayer.BODY, ) return conv_res @@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult: ) for citem in conversation: + prov: ProvenanceTrack = ProvenanceTrack( + start_time=citem.start_time, + end_time=citem.end_time, + voice=citem.speaker, + ) conv_res.document.add_text( - label=DocItemLabel.TEXT, text=citem.to_string() + label=DocItemLabel.TEXT, + text=citem.text, + prov=prov, + content_layer=ContentLayer.BODY, ) conv_res.status = ConversionStatus.SUCCESS diff --git a/docling/pipeline/legacy_standard_pdf_pipeline.py b/docling/pipeline/legacy_standard_pdf_pipeline.py index 55c2703cd..ceca82db9 100644 --- a/docling/pipeline/legacy_standard_pdf_pipeline.py +++ b/docling/pipeline/legacy_standard_pdf_pipeline.py @@ -4,7 +4,13 @@ from typing import Optional, cast import numpy as np -from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem +from docling_core.types.doc import ( + DocItem, + ImageRef, + PictureItem, + ProvenanceItem, + TableItem, +) from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend @@ -181,7 +187,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: ): scale = self.pipeline_options.images_scale for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: + if ( + not isinstance(element, DocItem) + or not element.prov + or not isinstance(prov := element.prov[0], ProvenanceItem) + ): continue if ( isinstance(element, PictureItem) @@ -190,7 +200,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: isinstance(element, TableItem) and self.pipeline_options.generate_table_images ): - page_ix = element.prov[0].page_no - 1 + page_ix = prov.page_no - 1 page = next( (p for p in conv_res.pages if p.page_no == page_ix), cast("Page", None), @@ -199,13 +209,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: assert page.size is not None assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin( - page_height=page.size.height * scale - ) - ) + crop_bbox = prov.bbox.scaled( + scale=scale + ).to_top_left_origin(page_height=page.size.height * scale) cropped_im = page.image.crop(crop_bbox.as_tuple()) element.image = ImageRef.from_pil( diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 585c548c6..54def080e 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -26,7 +26,13 @@ from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast import numpy as np -from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem +from docling_core.types.doc import ( + DocItem, + ImageRef, + PictureItem, + ProvenanceItem, + TableItem, +) from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend @@ -760,7 +766,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: ): scale = self.pipeline_options.images_scale for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: + if ( + not isinstance(element, DocItem) + or not element.prov + or not isinstance(prov := element.prov[0], ProvenanceItem) + ): continue if ( isinstance(element, PictureItem) @@ -769,7 +779,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: isinstance(element, TableItem) and self.pipeline_options.generate_table_images ): - page_ix = element.prov[0].page_no - 1 + page_ix = prov.page_no - 1 page = next( (p for p in conv_res.pages if p.page_no == page_ix), cast("Page", None), @@ -778,13 +788,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: assert page.size is not None assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin( - page_height=page.size.height * scale - ) - ) + crop_bbox = prov.bbox.scaled( + scale=scale + ).to_top_left_origin(page_height=page.size.height * scale) cropped_im = page.image.crop(crop_bbox.as_tuple()) element.image = ImageRef.from_pil( diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index ab919c4d9..73831fc49 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -165,21 +165,23 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: if self.pipeline_options.generate_picture_images: scale = self.pipeline_options.images_scale for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: + if ( + not isinstance(element, DocItem) + or not element.prov + or not isinstance(prov := element.prov[0], ProvenanceItem) + ): continue if ( isinstance(element, PictureItem) and self.pipeline_options.generate_picture_images ): - page_ix = element.prov[0].page_no - 1 + page_ix = prov.page_no - 1 page = conv_res.pages[page_ix] assert page.size is not None assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) + crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin( + page_height=page.size.height * scale ) cropped_im = page.image.crop(crop_bbox.as_tuple()) @@ -216,12 +218,14 @@ def _turn_dt_into_doc(self, conv_res) -> DoclingDocument: if self.force_backend_text: scale = self.pipeline_options.images_scale for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, TextItem) or len(element.prov) == 0: + if ( + not isinstance(element, TextItem) + or not element.prov + or not isinstance(prov := element.prov[0], ProvenanceItem) + ): continue - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) + crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin( + page_height=page.size.height * scale ) txt = self.extract_text_from_backend(page, crop_bbox) element.text = txt diff --git a/pyproject.toml b/pyproject.toml index 8dc239382..8444ac4b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ authors = [ requires-python = '>=3.9,<4.0' dependencies = [ 'pydantic (>=2.0.0,<3.0.0)', - 'docling-core[chunking] (>=2.50.1,<3.0.0)', 'docling-parse (>=4.7.0,<5.0.0)', "docling-ibm-models>=3.9.1,<4", 'filetype (>=1.2.0,<2.0.0)', @@ -74,6 +73,7 @@ dependencies = [ # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"', "accelerate>=1.0.0,<2", "polyfactory>=2.22.2", + "docling-core[chunking]", ] [project.urls] @@ -160,6 +160,9 @@ constraints = [ package = true default-groups = "all" +[tool.uv.sources] +docling-core = { git = "ssh://git@github.com/docling-project/docling-core.git", rev = "c75516516358f25add2682674fc7dc6eef2c5164" } + [tool.setuptools.packages.find] include = ["docling*"] diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt index d7840e994..db52ba1b7 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt @@ -1,66 +1,14 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group WebVTT cue block - item-2 at level 2: text: 00:11.000 --> 00:13.000 - item-3 at level 2: inline: group WebVTT cue voice span - item-4 at level 3: text: Roger Bingham: - item-5 at level 3: text: We are in New York City - item-6 at level 1: section: group WebVTT cue block - item-7 at level 2: text: 00:13.000 --> 00:16.000 - item-8 at level 2: inline: group WebVTT cue voice span - item-9 at level 3: text: Roger Bingham: - item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street - item-11 at level 1: section: group WebVTT cue block - item-12 at level 2: text: 00:16.000 --> 00:18.000 - item-13 at level 2: inline: group WebVTT cue voice span - item-14 at level 3: text: Roger Bingham: - item-15 at level 3: text: from the American Museum of Natural History - item-16 at level 1: section: group WebVTT cue block - item-17 at level 2: text: 00:18.000 --> 00:20.000 - item-18 at level 2: inline: group WebVTT cue voice span - item-19 at level 3: text: Roger Bingham: - item-20 at level 3: text: And with me is Neil deGrasse Tyson - item-21 at level 1: section: group WebVTT cue block - item-22 at level 2: text: 00:20.000 --> 00:22.000 - item-23 at level 2: inline: group WebVTT cue voice span - item-24 at level 3: text: Roger Bingham: - item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium - item-26 at level 1: section: group WebVTT cue block - item-27 at level 2: text: 00:22.000 --> 00:24.000 - item-28 at level 2: inline: group WebVTT cue voice span - item-29 at level 3: text: Roger Bingham: - item-30 at level 3: text: at the AMNH. - item-31 at level 1: section: group WebVTT cue block - item-32 at level 2: text: 00:24.000 --> 00:26.000 - item-33 at level 2: inline: group WebVTT cue voice span - item-34 at level 3: text: Roger Bingham: - item-35 at level 3: text: Thank you for walking down here. - item-36 at level 1: section: group WebVTT cue block - item-37 at level 2: text: 00:27.000 --> 00:30.000 - item-38 at level 2: inline: group WebVTT cue voice span - item-39 at level 3: text: Roger Bingham: - item-40 at level 3: text: And I want to do a follow-up on the last conversation we did. - item-41 at level 1: section: group WebVTT cue block - item-42 at level 2: text: 00:30.000 --> 00:31.500 - item-43 at level 2: inline: group WebVTT cue voice span - item-44 at level 3: text: Roger Bingham: - item-45 at level 3: text: When we e-mailed— - item-46 at level 1: section: group WebVTT cue block - item-47 at level 2: text: 00:30.500 --> 00:32.500 - item-48 at level 2: inline: group WebVTT cue voice span - item-49 at level 3: text: Neil deGrasse Tyson: - item-50 at level 3: text: Didn’t we talk about enough in that conversation? - item-51 at level 1: section: group WebVTT cue block - item-52 at level 2: text: 00:32.000 --> 00:35.500 - item-53 at level 2: inline: group WebVTT cue voice span - item-54 at level 3: text: Roger Bingham: - item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos - item-56 at level 1: section: group WebVTT cue block - item-57 at level 2: text: 00:32.500 --> 00:33.500 - item-58 at level 2: inline: group WebVTT cue voice span - item-59 at level 3: text: Neil deGrasse Tyson: - item-60 at level 3: text: Laughs - item-61 at level 1: section: group WebVTT cue block - item-62 at level 2: text: 00:35.500 --> 00:38.000 - item-63 at level 2: inline: group WebVTT cue voice span - item-64 at level 3: text: Roger Bingham: - item-65 at level 3: text: You know I’m so excited my glasses are falling off here. \ No newline at end of file + item-1 at level 1: text: We are in New York City + item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street + item-3 at level 1: text: from the American Museum of Natural History + item-4 at level 1: text: And with me is Neil deGrasse Tyson + item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium + item-6 at level 1: text: at the AMNH. + item-7 at level 1: text: Thank you for walking down here. + item-8 at level 1: text: And I want to do a follow-up on the last conversation we did. + item-9 at level 1: text: When we e-mailed— + item-10 at level 1: text: Didn’t we talk about enough in that conversation? + item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos + item-12 at level 1: text: Laughs + item-13 at level 1: text: You know I’m so excited my glasses are falling off here. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json index 831182560..5a7c9d29b 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "webvtt_example_01", "origin": { "mimetype": "text/vtt", @@ -18,1052 +18,291 @@ "self_ref": "#/body", "children": [ { - "$ref": "#/groups/0" + "$ref": "#/texts/0" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/1" }, { - "$ref": "#/groups/4" + "$ref": "#/texts/2" }, { - "$ref": "#/groups/6" + "$ref": "#/texts/3" }, { - "$ref": "#/groups/8" + "$ref": "#/texts/4" }, { - "$ref": "#/groups/10" + "$ref": "#/texts/5" }, { - "$ref": "#/groups/12" + "$ref": "#/texts/6" }, { - "$ref": "#/groups/14" + "$ref": "#/texts/7" }, { - "$ref": "#/groups/16" + "$ref": "#/texts/8" }, { - "$ref": "#/groups/18" + "$ref": "#/texts/9" }, { - "$ref": "#/groups/20" + "$ref": "#/texts/10" }, { - "$ref": "#/groups/22" + "$ref": "#/texts/11" }, { - "$ref": "#/groups/24" + "$ref": "#/texts/12" } ], "content_layer": "body", "name": "_root_", "label": "unspecified" }, - "groups": [ - { - "self_ref": "#/groups/0", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/0" - }, - { - "$ref": "#/groups/1" - } - ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/1", - "parent": { - "$ref": "#/groups/0" - }, - "children": [ - { - "$ref": "#/texts/1" - }, - { - "$ref": "#/texts/2" - } - ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" - }, + "groups": [], + "texts": [ { - "self_ref": "#/groups/2", + "self_ref": "#/texts/0", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/3" - }, - { - "$ref": "#/groups/3" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/3", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/4" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/5" + "start_time": 11.0, + "end_time": 13.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "We are in New York City", + "text": "We are in New York City" }, { - "self_ref": "#/groups/4", + "self_ref": "#/texts/1", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/6" - }, - { - "$ref": "#/groups/5" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/5", - "parent": { - "$ref": "#/groups/4" - }, - "children": [ - { - "$ref": "#/texts/7" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/8" + "start_time": 13.0, + "end_time": 16.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "We’re actually at the Lucern Hotel, just down the street", + "text": "We’re actually at the Lucern Hotel, just down the street" }, { - "self_ref": "#/groups/6", + "self_ref": "#/texts/2", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/9" - }, - { - "$ref": "#/groups/7" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/7", - "parent": { - "$ref": "#/groups/6" - }, - "children": [ - { - "$ref": "#/texts/10" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/11" + "start_time": 16.0, + "end_time": 18.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "from the American Museum of Natural History", + "text": "from the American Museum of Natural History" }, { - "self_ref": "#/groups/8", + "self_ref": "#/texts/3", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/12" - }, - { - "$ref": "#/groups/9" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/9", - "parent": { - "$ref": "#/groups/8" - }, - "children": [ - { - "$ref": "#/texts/13" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/14" + "start_time": 18.0, + "end_time": 20.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "And with me is Neil deGrasse Tyson", + "text": "And with me is Neil deGrasse Tyson" }, { - "self_ref": "#/groups/10", + "self_ref": "#/texts/4", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/15" - }, - { - "$ref": "#/groups/11" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/11", - "parent": { - "$ref": "#/groups/10" - }, - "children": [ - { - "$ref": "#/texts/16" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/17" + "start_time": 20.0, + "end_time": 22.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Astrophysicist, Director of the Hayden Planetarium", + "text": "Astrophysicist, Director of the Hayden Planetarium" }, { - "self_ref": "#/groups/12", + "self_ref": "#/texts/5", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/18" - }, - { - "$ref": "#/groups/13" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/13", - "parent": { - "$ref": "#/groups/12" - }, - "children": [ - { - "$ref": "#/texts/19" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/20" + "start_time": 22.0, + "end_time": 24.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "at the AMNH.", + "text": "at the AMNH." }, { - "self_ref": "#/groups/14", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/21" - }, - { - "$ref": "#/groups/15" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/15", - "parent": { - "$ref": "#/groups/14" - }, - "children": [ - { - "$ref": "#/texts/22" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/23" + "start_time": 24.0, + "end_time": 26.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Thank you for walking down here.", + "text": "Thank you for walking down here." }, { - "self_ref": "#/groups/16", + "self_ref": "#/texts/7", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/24" - }, - { - "$ref": "#/groups/17" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/17", - "parent": { - "$ref": "#/groups/16" - }, - "children": [ - { - "$ref": "#/texts/25" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/26" + "start_time": 27.0, + "end_time": 30.0, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "And I want to do a follow-up on the last conversation we did.", + "text": "And I want to do a follow-up on the last conversation we did." }, { - "self_ref": "#/groups/18", + "self_ref": "#/texts/8", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/27" - }, - { - "$ref": "#/groups/19" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/19", - "parent": { - "$ref": "#/groups/18" - }, - "children": [ - { - "$ref": "#/texts/28" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/29" + "start_time": 30.0, + "end_time": 31.5, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "When we e-mailed—", + "text": "When we e-mailed—" }, { - "self_ref": "#/groups/20", + "self_ref": "#/texts/9", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/30" - }, - { - "$ref": "#/groups/21" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/21", - "parent": { - "$ref": "#/groups/20" - }, - "children": [ - { - "$ref": "#/texts/31" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/32" + "start_time": 30.5, + "end_time": 32.5, + "voice": "Neil deGrasse Tyson" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Didn’t we talk about enough in that conversation?", + "text": "Didn’t we talk about enough in that conversation?" }, { - "self_ref": "#/groups/22", + "self_ref": "#/texts/10", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/33" - }, - { - "$ref": "#/groups/23" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/23", - "parent": { - "$ref": "#/groups/22" - }, - "children": [ - { - "$ref": "#/texts/34" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/35" + "start_time": 32.0, + "end_time": 35.5, + "voice": "Roger Bingham" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "No! No no no no; 'cos 'cos obviously 'cos", + "text": "No! No no no no; 'cos 'cos obviously 'cos" }, { - "self_ref": "#/groups/24", + "self_ref": "#/texts/11", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/36" - }, - { - "$ref": "#/groups/25" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/25", - "parent": { - "$ref": "#/groups/24" - }, - "children": [ - { - "$ref": "#/texts/37" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/38" + "start_time": 32.5, + "end_time": 33.5, + "voice": "Neil deGrasse Tyson" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" - } - ], - "texts": [ - { - "self_ref": "#/texts/0", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:11.000 --> 00:13.000", - "text": "00:11.000 --> 00:13.000" - }, - { - "self_ref": "#/texts/1", - "parent": { - "$ref": "#/groups/1" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/2", - "parent": { - "$ref": "#/groups/1" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "We are in New York City", - "text": "We are in New York City", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/3", - "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:13.000 --> 00:16.000", - "text": "00:13.000 --> 00:16.000" - }, - { - "self_ref": "#/texts/4", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "We’re actually at the Lucern Hotel, just down the street", - "text": "We’re actually at the Lucern Hotel, just down the street", + "orig": "Laughs", + "text": "Laughs", "formatting": { "bold": false, - "italic": false, + "italic": true, "underline": false, "strikethrough": false, "script": "baseline" } }, { - "self_ref": "#/texts/6", + "self_ref": "#/texts/12", "parent": { - "$ref": "#/groups/4" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "00:16.000 --> 00:18.000", - "text": "00:16.000 --> 00:18.000" - }, - { - "self_ref": "#/texts/7", - "parent": { - "$ref": "#/groups/5" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/8", - "parent": { - "$ref": "#/groups/5" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "from the American Museum of Natural History", - "text": "from the American Museum of Natural History", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/9", - "parent": { - "$ref": "#/groups/6" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:18.000 --> 00:20.000", - "text": "00:18.000 --> 00:20.000" - }, - { - "self_ref": "#/texts/10", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/11", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "And with me is Neil deGrasse Tyson", - "text": "And with me is Neil deGrasse Tyson", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/12", - "parent": { - "$ref": "#/groups/8" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:20.000 --> 00:22.000", - "text": "00:20.000 --> 00:22.000" - }, - { - "self_ref": "#/texts/13", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/14", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Astrophysicist, Director of the Hayden Planetarium", - "text": "Astrophysicist, Director of the Hayden Planetarium", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/15", - "parent": { - "$ref": "#/groups/10" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:22.000 --> 00:24.000", - "text": "00:22.000 --> 00:24.000" - }, - { - "self_ref": "#/texts/16", - "parent": { - "$ref": "#/groups/11" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/17", - "parent": { - "$ref": "#/groups/11" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "at the AMNH.", - "text": "at the AMNH.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/18", - "parent": { - "$ref": "#/groups/12" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:24.000 --> 00:26.000", - "text": "00:24.000 --> 00:26.000" - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Thank you for walking down here.", - "text": "Thank you for walking down here.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/14" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:27.000 --> 00:30.000", - "text": "00:27.000 --> 00:30.000" - }, - { - "self_ref": "#/texts/22", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/23", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "And I want to do a follow-up on the last conversation we did.", - "text": "And I want to do a follow-up on the last conversation we did.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/24", - "parent": { - "$ref": "#/groups/16" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:30.000 --> 00:31.500", - "text": "00:30.000 --> 00:31.500" - }, - { - "self_ref": "#/texts/25", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/26", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "When we e-mailed—", - "text": "When we e-mailed—", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/groups/18" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:30.500 --> 00:32.500", - "text": "00:30.500 --> 00:32.500" - }, - { - "self_ref": "#/texts/28", - "parent": { - "$ref": "#/groups/19" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Neil deGrasse Tyson: ", - "text": "Neil deGrasse Tyson: " - }, - { - "self_ref": "#/texts/29", - "parent": { - "$ref": "#/groups/19" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Didn’t we talk about enough in that conversation?", - "text": "Didn’t we talk about enough in that conversation?", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/30", - "parent": { - "$ref": "#/groups/20" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:32.000 --> 00:35.500", - "text": "00:32.000 --> 00:35.500" - }, - { - "self_ref": "#/texts/31", - "parent": { - "$ref": "#/groups/21" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/32", - "parent": { - "$ref": "#/groups/21" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "No! No no no no; 'cos 'cos obviously 'cos", - "text": "No! No no no no; 'cos 'cos obviously 'cos", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/22" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:32.500 --> 00:33.500", - "text": "00:32.500 --> 00:33.500" - }, - { - "self_ref": "#/texts/34", - "parent": { - "$ref": "#/groups/23" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Neil deGrasse Tyson: ", - "text": "Neil deGrasse Tyson: " - }, - { - "self_ref": "#/texts/35", - "parent": { - "$ref": "#/groups/23" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Laughs", - "text": "Laughs", - "formatting": { - "bold": false, - "italic": true, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/36", - "parent": { - "$ref": "#/groups/24" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:35.500 --> 00:38.000", - "text": "00:35.500 --> 00:38.000" - }, - { - "self_ref": "#/texts/37", - "parent": { - "$ref": "#/groups/25" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Roger Bingham: ", - "text": "Roger Bingham: " - }, - { - "self_ref": "#/texts/38", - "parent": { - "$ref": "#/groups/25" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], + "prov": [ + { + "start_time": 35.5, + "end_time": 38.0, + "voice": "Roger Bingham" + } + ], "orig": "You know I’m so excited my glasses are falling off here.", - "text": "You know I’m so excited my glasses are falling off here.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "text": "You know I’m so excited my glasses are falling off here." } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md index c57670289..95d9e6575 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md +++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md @@ -1,51 +1,25 @@ -00:11.000 --> 00:13.000 +We are in New York City -Roger Bingham: We are in New York City +We’re actually at the Lucern Hotel, just down the street -00:13.000 --> 00:16.000 +from the American Museum of Natural History -Roger Bingham: We’re actually at the Lucern Hotel, just down the street +And with me is Neil deGrasse Tyson -00:16.000 --> 00:18.000 +Astrophysicist, Director of the Hayden Planetarium -Roger Bingham: from the American Museum of Natural History +at the AMNH. -00:18.000 --> 00:20.000 +Thank you for walking down here. -Roger Bingham: And with me is Neil deGrasse Tyson +And I want to do a follow-up on the last conversation we did. -00:20.000 --> 00:22.000 +When we e-mailed— -Roger Bingham: Astrophysicist, Director of the Hayden Planetarium +Didn’t we talk about enough in that conversation? -00:22.000 --> 00:24.000 +No! No no no no; 'cos 'cos obviously 'cos -Roger Bingham: at the AMNH. +*Laughs* -00:24.000 --> 00:26.000 - -Roger Bingham: Thank you for walking down here. - -00:27.000 --> 00:30.000 - -Roger Bingham: And I want to do a follow-up on the last conversation we did. - -00:30.000 --> 00:31.500 - -Roger Bingham: When we e-mailed— - -00:30.500 --> 00:32.500 - -Neil deGrasse Tyson: Didn’t we talk about enough in that conversation? - -00:32.000 --> 00:35.500 - -Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos - -00:32.500 --> 00:33.500 - -Neil deGrasse Tyson: *Laughs* - -00:35.500 --> 00:38.000 - -Roger Bingham: You know I’m so excited my glasses are falling off here. \ No newline at end of file +You know I’m so excited my glasses are falling off here. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt index 6d90404ff..56f63bc3f 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt @@ -1,22 +1,12 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group WebVTT cue block - item-2 at level 2: text: 00:00.000 --> 00:02.000 - item-3 at level 2: inline: group WebVTT cue voice span - item-4 at level 3: text: Esme (first, loud): - item-5 at level 3: text: It’s a blue apple tree! - item-6 at level 1: section: group WebVTT cue block - item-7 at level 2: text: 00:02.000 --> 00:04.000 - item-8 at level 2: inline: group WebVTT cue voice span - item-9 at level 3: text: Mary: - item-10 at level 3: text: No way! - item-11 at level 1: section: group WebVTT cue block - item-12 at level 2: text: 00:04.000 --> 00:06.000 - item-13 at level 2: inline: group WebVTT cue voice span - item-14 at level 3: text: Esme: - item-15 at level 3: text: Hee! - item-16 at level 2: text: laughter - item-17 at level 1: section: group WebVTT cue block - item-18 at level 2: text: 00:06.000 --> 00:08.000 - item-19 at level 2: inline: group WebVTT cue voice span - item-20 at level 3: text: Mary (loud): - item-21 at level 3: text: That’s awesome! \ No newline at end of file + item-1 at level 1: text: It’s a blue apple tree! + item-2 at level 1: text: No way! + item-3 at level 1: inline: group WebVTT cue span + item-4 at level 2: text: Hee! + item-5 at level 2: text: + item-6 at level 2: text: laughter + item-7 at level 1: text: That’s awesome! + item-8 at level 1: inline: group WebVTT cue span + item-9 at level 2: text: Sur les + item-10 at level 2: text: playground + item-11 at level 2: text: , ici à Montpellier \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json index 72647d93d..67a95ef50 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "webvtt_example_02", "origin": { "mimetype": "text/vtt", - "binary_hash": 5029965721282070624, + "binary_hash": 8584853280299071027, "filename": "webvtt_example_02.vtt" }, "furniture": { @@ -18,16 +18,19 @@ "self_ref": "#/body", "children": [ { - "$ref": "#/groups/0" + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" }, { - "$ref": "#/groups/2" + "$ref": "#/groups/0" }, { - "$ref": "#/groups/4" + "$ref": "#/texts/5" }, { - "$ref": "#/groups/6" + "$ref": "#/groups/1" } ], "content_layer": "body", @@ -41,70 +44,22 @@ "$ref": "#/body" }, "children": [ - { - "$ref": "#/texts/0" - }, - { - "$ref": "#/groups/1" - } - ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/1", - "parent": { - "$ref": "#/groups/0" - }, - "children": [ - { - "$ref": "#/texts/1" - }, { "$ref": "#/texts/2" - } - ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" - }, - { - "self_ref": "#/groups/2", - "parent": { - "$ref": "#/body" - }, - "children": [ + }, { "$ref": "#/texts/3" }, - { - "$ref": "#/groups/3" - } - ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/3", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ { "$ref": "#/texts/4" - }, - { - "$ref": "#/texts/5" } ], "content_layer": "body", - "name": "WebVTT cue voice span", + "name": "WebVTT cue span", "label": "inline" }, { - "self_ref": "#/groups/4", + "self_ref": "#/groups/1", "parent": { "$ref": "#/body" }, @@ -112,23 +67,6 @@ { "$ref": "#/texts/6" }, - { - "$ref": "#/groups/5" - }, - { - "$ref": "#/texts/9" - } - ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/5", - "parent": { - "$ref": "#/groups/4" - }, - "children": [ { "$ref": "#/texts/7" }, @@ -137,41 +75,7 @@ } ], "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" - }, - { - "self_ref": "#/groups/6", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/10" - }, - { - "$ref": "#/groups/7" - } - ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/7", - "parent": { - "$ref": "#/groups/6" - }, - "children": [ - { - "$ref": "#/texts/11" - }, - { - "$ref": "#/texts/12" - } - ], - "content_layer": "body", - "name": "WebVTT cue voice span", + "name": "WebVTT cue span", "label": "inline" } ], @@ -179,143 +83,161 @@ { "self_ref": "#/texts/0", "parent": { - "$ref": "#/groups/0" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "00:00.000 --> 00:02.000", - "text": "00:00.000 --> 00:02.000" + "prov": [ + { + "start_time": 0.0, + "end_time": 2.0, + "voice": "Esme", + "classes": [ + "v.first.loud" + ] + } + ], + "orig": "It’s a blue apple tree!", + "text": "It’s a blue apple tree!" }, { "self_ref": "#/texts/1", "parent": { - "$ref": "#/groups/1" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "Esme (first, loud): ", - "text": "Esme (first, loud): " + "prov": [ + { + "start_time": 2.0, + "end_time": 4.0, + "voice": "Mary" + } + ], + "orig": "No way!", + "text": "No way!" }, { "self_ref": "#/texts/2", "parent": { - "$ref": "#/groups/1" + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "It’s a blue apple tree!", - "text": "It’s a blue apple tree!", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0, + "voice": "Esme" + } + ], + "orig": "Hee!", + "text": "Hee!" }, { "self_ref": "#/texts/3", "parent": { - "$ref": "#/groups/2" + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "00:02.000 --> 00:04.000", - "text": "00:02.000 --> 00:04.000" + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": " ", + "text": " " }, { "self_ref": "#/texts/4", "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Mary: ", - "text": "Mary: " - }, - { - "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/3" + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "No way!", - "text": "No way!", + "prov": [ + { + "start_time": 4.0, + "end_time": 6.0 + } + ], + "orig": "laughter", + "text": "laughter", "formatting": { "bold": false, - "italic": false, + "italic": true, "underline": false, "strikethrough": false, "script": "baseline" } }, { - "self_ref": "#/texts/6", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:04.000 --> 00:06.000", - "text": "00:04.000 --> 00:06.000" - }, - { - "self_ref": "#/texts/7", + "self_ref": "#/texts/5", "parent": { - "$ref": "#/groups/5" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "Esme: ", - "text": "Esme: " + "prov": [ + { + "start_time": 6.0, + "end_time": 8.0, + "voice": "Mary", + "classes": [ + "v.loud" + ] + } + ], + "orig": "That’s awesome!", + "text": "That’s awesome!" }, { - "self_ref": "#/texts/8", + "self_ref": "#/texts/6", "parent": { - "$ref": "#/groups/5" + "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "Hee!", - "text": "Hee!", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": "Sur les ", + "text": "Sur les " }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/7", "parent": { - "$ref": "#/groups/4" + "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "laughter", - "text": "laughter", + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0, + "languages": [ + "en" + ], + "classes": [ + "i.foreignphrase" + ] + } + ], + "orig": "playground", + "text": "playground", "formatting": { "bold": false, "italic": true, @@ -325,47 +247,21 @@ } }, { - "self_ref": "#/texts/10", - "parent": { - "$ref": "#/groups/6" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:06.000 --> 00:08.000", - "text": "00:06.000 --> 00:08.000" - }, - { - "self_ref": "#/texts/11", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Mary (loud): ", - "text": "Mary (loud): " - }, - { - "self_ref": "#/texts/12", + "self_ref": "#/texts/8", "parent": { - "$ref": "#/groups/7" + "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "That’s awesome!", - "text": "That’s awesome!", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "prov": [ + { + "start_time": 8.0, + "end_time": 10.0 + } + ], + "orig": ", ici à Montpellier", + "text": ", ici à Montpellier" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md index db84cf116..5c6485f3a 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md +++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md @@ -1,17 +1,9 @@ -00:00.000 --> 00:02.000 +It’s a blue apple tree! -Esme (first, loud): It’s a blue apple tree! +No way! -00:02.000 --> 00:04.000 +Hee! *laughter* -Mary: No way! +That’s awesome! -00:04.000 --> 00:06.000 - -Esme: Hee! - -*laughter* - -00:06.000 --> 00:08.000 - -Mary (loud): That’s awesome! \ No newline at end of file +Sur les *playground* , ici à Montpellier \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt index ca344e595..a46794123 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt @@ -1,77 +1,18 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group WebVTT cue block - item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 - item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571 - item-4 at level 2: inline: group WebVTT cue voice span - item-5 at level 3: text: Speaker A: - item-6 at level 3: text: OK, I think now we should be recording - item-7 at level 1: section: group WebVTT cue block - item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 - item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403 - item-10 at level 2: inline: group WebVTT cue voice span - item-11 at level 3: text: Speaker A: - item-12 at level 3: text: properly. - item-13 at level 1: section: group WebVTT cue block - item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 - item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563 - item-16 at level 2: text: Good. - item-17 at level 1: section: group WebVTT cue block - item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 - item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803 - item-20 at level 2: inline: group WebVTT cue voice span - item-21 at level 3: text: Speaker A: - item-22 at level 3: text: Yeah. - item-23 at level 1: section: group WebVTT cue block - item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 - item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363 - item-26 at level 2: inline: group WebVTT cue voice span - item-27 at level 3: text: Speaker B: - item-28 at level 3: text: I was also thinking. - item-29 at level 1: section: group WebVTT cue block - item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 - item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072 - item-32 at level 2: inline: group WebVTT cue voice span - item-33 at level 3: text: Speaker B: - item-34 at level 3: text: Would be maybe good to create items, - item-35 at level 1: section: group WebVTT cue block - item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 - item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811 - item-38 at level 2: inline: group WebVTT cue voice span - item-39 at level 3: text: Speaker B: - item-40 at level 3: text: some metadata, some options that can be specific. - item-41 at level 1: section: group WebVTT cue block - item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 - item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014 - item-44 at level 2: inline: group WebVTT cue voice span - item-45 at level 3: text: Speaker A: - item-46 at level 3: text: Yeah, I mean I think you went even more than - item-47 at level 1: section: group WebVTT cue block - item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 - item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643 - item-50 at level 2: inline: group WebVTT cue voice span - item-51 at level 3: text: Speaker B: - item-52 at level 3: text: But we preserved the atoms. - item-53 at level 1: section: group WebVTT cue block - item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 - item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907 - item-56 at level 2: inline: group WebVTT cue voice span - item-57 at level 3: text: Speaker A: - item-58 at level 3: text: than me. I just opened the format. - item-59 at level 1: section: group WebVTT cue block - item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 - item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643 - item-62 at level 2: inline: group WebVTT cue voice span - item-63 at level 3: text: Speaker A: - item-64 at level 3: text: give it a try, yeah. - item-65 at level 1: section: group WebVTT cue block - item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 - item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043 - item-68 at level 2: inline: group WebVTT cue voice span - item-69 at level 3: text: Speaker B: - item-70 at level 3: text: Okay, talk to you later. - item-71 at level 1: section: group WebVTT cue block - item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 - item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283 - item-74 at level 2: inline: group WebVTT cue voice span - item-75 at level 3: text: Speaker A: - item-76 at level 3: text: See you. \ No newline at end of file + item-1 at level 1: text: OK, + item-2 at level 1: text: I think now we should be recording + item-3 at level 1: text: properly. + item-4 at level 1: text: Good. + item-5 at level 1: text: Yeah. + item-6 at level 1: text: I was also thinking. + item-7 at level 1: text: Would be maybe good to create items, + item-8 at level 1: text: some metadata, + item-9 at level 1: text: some options that can be specific. + item-10 at level 1: text: Yeah, + item-11 at level 1: text: I mean I think you went even more than + item-12 at level 1: text: But we preserved the atoms. + item-13 at level 1: text: than me. + item-14 at level 1: text: I just opened the format. + item-15 at level 1: text: give it a try, yeah. + item-16 at level 1: text: Okay, talk to you later. + item-17 at level 1: text: See you. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json index 5df08e2bf..dddce0f28 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "webvtt_example_03", "origin": { "mimetype": "text/vtt", @@ -18,1218 +18,384 @@ "self_ref": "#/body", "children": [ { - "$ref": "#/groups/0" + "$ref": "#/texts/0" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/1" }, { - "$ref": "#/groups/4" + "$ref": "#/texts/2" }, { - "$ref": "#/groups/5" + "$ref": "#/texts/3" }, { - "$ref": "#/groups/7" + "$ref": "#/texts/4" }, { - "$ref": "#/groups/9" + "$ref": "#/texts/5" }, { - "$ref": "#/groups/11" + "$ref": "#/texts/6" }, { - "$ref": "#/groups/13" + "$ref": "#/texts/7" }, { - "$ref": "#/groups/15" + "$ref": "#/texts/8" }, { - "$ref": "#/groups/17" + "$ref": "#/texts/9" }, { - "$ref": "#/groups/19" + "$ref": "#/texts/10" }, { - "$ref": "#/groups/21" + "$ref": "#/texts/11" }, { - "$ref": "#/groups/23" + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" } ], "content_layer": "body", "name": "_root_", "label": "unspecified" }, - "groups": [ + "groups": [], + "texts": [ { - "self_ref": "#/groups/0", + "self_ref": "#/texts/0", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/0" - }, - { - "$ref": "#/texts/1" - }, - { - "$ref": "#/groups/1" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/1", - "parent": { - "$ref": "#/groups/0" - }, - "children": [ - { - "$ref": "#/texts/2" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/3" + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "OK,", + "text": "OK," }, { - "self_ref": "#/groups/2", + "self_ref": "#/texts/1", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/4" - }, - { - "$ref": "#/texts/5" - }, - { - "$ref": "#/groups/3" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/3", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/6" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/7" + "start_time": 4.963, + "end_time": 8.571, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "I think now we should be recording", + "text": "I think now we should be recording" }, { - "self_ref": "#/groups/4", + "self_ref": "#/texts/2", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/8" - }, - { - "$ref": "#/texts/9" - }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/10" + "start_time": 8.571, + "end_time": 9.403, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" + "orig": "properly.", + "text": "properly." }, { - "self_ref": "#/groups/5", + "self_ref": "#/texts/3", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/11" - }, - { - "$ref": "#/texts/12" - }, - { - "$ref": "#/groups/6" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/6", - "parent": { - "$ref": "#/groups/5" - }, - "children": [ - { - "$ref": "#/texts/13" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/14" + "start_time": 10.683, + "end_time": 11.563, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Good.", + "text": "Good." }, { - "self_ref": "#/groups/7", + "self_ref": "#/texts/4", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/15" - }, - { - "$ref": "#/texts/16" - }, - { - "$ref": "#/groups/8" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/8", - "parent": { - "$ref": "#/groups/7" - }, - "children": [ - { - "$ref": "#/texts/17" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/18" + "start_time": 13.363, + "end_time": 13.803, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Yeah.", + "text": "Yeah." }, { - "self_ref": "#/groups/9", + "self_ref": "#/texts/5", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/19" - }, - { - "$ref": "#/texts/20" - }, - { - "$ref": "#/groups/10" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/10", - "parent": { - "$ref": "#/groups/9" - }, - "children": [ - { - "$ref": "#/texts/21" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/22" + "start_time": 49.603, + "end_time": 53.363, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", + "voice": "Speaker B" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "I was also thinking.", + "text": "I was also thinking." }, { - "self_ref": "#/groups/11", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/23" - }, - { - "$ref": "#/texts/24" - }, - { - "$ref": "#/groups/12" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/12", - "parent": { - "$ref": "#/groups/11" - }, - "children": [ - { - "$ref": "#/texts/25" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/26" + "start_time": 54.963, + "end_time": 62.072, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", + "voice": "Speaker B" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Would be maybe good to create items,", + "text": "Would be maybe good to create items," }, { - "self_ref": "#/groups/13", + "self_ref": "#/texts/7", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/27" - }, - { - "$ref": "#/texts/28" - }, - { - "$ref": "#/groups/14" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/14", - "parent": { - "$ref": "#/groups/13" - }, - "children": [ - { - "$ref": "#/texts/29" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/30" + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "voice": "Speaker B" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "some metadata,", + "text": "some metadata," }, { - "self_ref": "#/groups/15", + "self_ref": "#/texts/8", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/31" - }, - { - "$ref": "#/texts/32" - }, - { - "$ref": "#/groups/16" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/16", - "parent": { - "$ref": "#/groups/15" - }, - "children": [ - { - "$ref": "#/texts/33" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/34" + "start_time": 62.072, + "end_time": 66.811, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", + "voice": "Speaker B" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "some options that can be specific.", + "text": "some options that can be specific." }, { - "self_ref": "#/groups/17", + "self_ref": "#/texts/9", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/35" - }, - { - "$ref": "#/texts/36" - }, - { - "$ref": "#/groups/18" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/18", - "parent": { - "$ref": "#/groups/17" - }, - "children": [ - { - "$ref": "#/texts/37" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/38" + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "Yeah,", + "text": "Yeah," }, { - "self_ref": "#/groups/19", + "self_ref": "#/texts/10", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/39" - }, - { - "$ref": "#/texts/40" - }, - { - "$ref": "#/groups/20" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/20", - "parent": { - "$ref": "#/groups/19" - }, - "children": [ - { - "$ref": "#/texts/41" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/42" + "start_time": 70.243, + "end_time": 73.014, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "I mean I think you went even more than", + "text": "I mean I think you went even more than" }, { - "self_ref": "#/groups/21", + "self_ref": "#/texts/11", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/43" - }, - { - "$ref": "#/texts/44" - }, - { - "$ref": "#/groups/22" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/22", - "parent": { - "$ref": "#/groups/21" - }, - "children": [ - { - "$ref": "#/texts/45" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/46" + "start_time": 70.563, + "end_time": 72.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", + "voice": "Speaker B" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" + "orig": "But we preserved the atoms.", + "text": "But we preserved the atoms." }, { - "self_ref": "#/groups/23", + "self_ref": "#/texts/12", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/47" - }, - { - "$ref": "#/texts/48" - }, - { - "$ref": "#/groups/24" - } - ], + "children": [], "content_layer": "body", - "name": "WebVTT cue block", - "label": "section" - }, - { - "self_ref": "#/groups/24", - "parent": { - "$ref": "#/groups/23" - }, - "children": [ - { - "$ref": "#/texts/49" - }, + "label": "text", + "prov": [ { - "$ref": "#/texts/50" + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "voice": "Speaker A" } ], - "content_layer": "body", - "name": "WebVTT cue voice span", - "label": "inline" - } - ], - "texts": [ - { - "self_ref": "#/texts/0", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" - }, - { - "self_ref": "#/texts/1", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:00:04.963 --> 00:00:08.571", - "text": "00:00:04.963 --> 00:00:08.571" - }, - { - "self_ref": "#/texts/2", - "parent": { - "$ref": "#/groups/1" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/3", - "parent": { - "$ref": "#/groups/1" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "OK, I think now we should be recording", - "text": "OK, I think now we should be recording", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "orig": "than me.", + "text": "than me." }, { - "self_ref": "#/texts/4", + "self_ref": "#/texts/13", "parent": { - "$ref": "#/groups/2" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1" + "prov": [ + { + "start_time": 73.014, + "end_time": 75.907, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", + "voice": "Speaker A" + } + ], + "orig": "I just opened the format.", + "text": "I just opened the format." }, { - "self_ref": "#/texts/5", + "self_ref": "#/texts/14", "parent": { - "$ref": "#/groups/2" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "00:00:08.571 --> 00:00:09.403", - "text": "00:00:08.571 --> 00:00:09.403" + "prov": [ + { + "start_time": 110.222, + "end_time": 111.643, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", + "voice": "Speaker A" + } + ], + "orig": "give it a try, yeah.", + "text": "give it a try, yeah." }, { - "self_ref": "#/texts/6", + "self_ref": "#/texts/15", "parent": { - "$ref": "#/groups/3" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " + "prov": [ + { + "start_time": 112.043, + "end_time": 115.043, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", + "voice": "Speaker B" + } + ], + "orig": "Okay, talk to you later.", + "text": "Okay, talk to you later." }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/16", "parent": { - "$ref": "#/groups/3" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "text", - "prov": [], - "orig": "properly.", - "text": "properly.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/8", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" - }, - { - "self_ref": "#/texts/9", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:00:10.683 --> 00:00:11.563", - "text": "00:00:10.683 --> 00:00:11.563" - }, - { - "self_ref": "#/texts/10", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Good.", - "text": "Good.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/11", - "parent": { - "$ref": "#/groups/5" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0" - }, - { - "self_ref": "#/texts/12", - "parent": { - "$ref": "#/groups/5" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:00:13.363 --> 00:00:13.803", - "text": "00:00:13.363 --> 00:00:13.803" - }, - { - "self_ref": "#/texts/13", - "parent": { - "$ref": "#/groups/6" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/14", - "parent": { - "$ref": "#/groups/6" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Yeah.", - "text": "Yeah.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/15", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0" - }, - { - "self_ref": "#/texts/16", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:00:49.603 --> 00:00:53.363", - "text": "00:00:49.603 --> 00:00:53.363" - }, - { - "self_ref": "#/texts/17", - "parent": { - "$ref": "#/groups/8" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker B: ", - "text": "Speaker B: " - }, - { - "self_ref": "#/texts/18", - "parent": { - "$ref": "#/groups/8" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "I was also thinking.", - "text": "I was also thinking.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:00:54.963 --> 00:01:02.072", - "text": "00:00:54.963 --> 00:01:02.072" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/10" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker B: ", - "text": "Speaker B: " - }, - { - "self_ref": "#/texts/22", - "parent": { - "$ref": "#/groups/10" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Would be maybe good to create items,", - "text": "Would be maybe good to create items,", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/23", - "parent": { - "$ref": "#/groups/11" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1" - }, - { - "self_ref": "#/texts/24", - "parent": { - "$ref": "#/groups/11" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:02.072 --> 00:01:06.811", - "text": "00:01:02.072 --> 00:01:06.811" - }, - { - "self_ref": "#/texts/25", - "parent": { - "$ref": "#/groups/12" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker B: ", - "text": "Speaker B: " - }, - { - "self_ref": "#/texts/26", - "parent": { - "$ref": "#/groups/12" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "some metadata, some options that can be specific.", - "text": "some metadata, some options that can be specific.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0" - }, - { - "self_ref": "#/texts/28", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:10.243 --> 00:01:13.014", - "text": "00:01:10.243 --> 00:01:13.014" - }, - { - "self_ref": "#/texts/29", - "parent": { - "$ref": "#/groups/14" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/30", - "parent": { - "$ref": "#/groups/14" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Yeah, I mean I think you went even more than", - "text": "Yeah, I mean I think you went even more than", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/31", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0" - }, - { - "self_ref": "#/texts/32", - "parent": { - "$ref": "#/groups/15" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:10.563 --> 00:01:12.643", - "text": "00:01:10.563 --> 00:01:12.643" - }, - { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/16" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker B: ", - "text": "Speaker B: " - }, - { - "self_ref": "#/texts/34", - "parent": { - "$ref": "#/groups/16" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "But we preserved the atoms.", - "text": "But we preserved the atoms.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/35", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1" - }, - { - "self_ref": "#/texts/36", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:13.014 --> 00:01:15.907", - "text": "00:01:13.014 --> 00:01:15.907" - }, - { - "self_ref": "#/texts/37", - "parent": { - "$ref": "#/groups/18" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/38", - "parent": { - "$ref": "#/groups/18" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "than me. I just opened the format.", - "text": "than me. I just opened the format.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/39", - "parent": { - "$ref": "#/groups/19" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1" - }, - { - "self_ref": "#/texts/40", - "parent": { - "$ref": "#/groups/19" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:50.222 --> 00:01:51.643", - "text": "00:01:50.222 --> 00:01:51.643" - }, - { - "self_ref": "#/texts/41", - "parent": { - "$ref": "#/groups/20" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/42", - "parent": { - "$ref": "#/groups/20" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "give it a try, yeah.", - "text": "give it a try, yeah.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/43", - "parent": { - "$ref": "#/groups/21" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0" - }, - { - "self_ref": "#/texts/44", - "parent": { - "$ref": "#/groups/21" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:52.043 --> 00:01:55.043", - "text": "00:01:52.043 --> 00:01:55.043" - }, - { - "self_ref": "#/texts/45", - "parent": { - "$ref": "#/groups/22" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker B: ", - "text": "Speaker B: " - }, - { - "self_ref": "#/texts/46", - "parent": { - "$ref": "#/groups/22" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Okay, talk to you later.", - "text": "Okay, talk to you later.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } - }, - { - "self_ref": "#/texts/47", - "parent": { - "$ref": "#/groups/23" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", - "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0" - }, - { - "self_ref": "#/texts/48", - "parent": { - "$ref": "#/groups/23" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "00:01:54.603 --> 00:01:55.283", - "text": "00:01:54.603 --> 00:01:55.283" - }, - { - "self_ref": "#/texts/49", - "parent": { - "$ref": "#/groups/24" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], - "orig": "Speaker A: ", - "text": "Speaker A: " - }, - { - "self_ref": "#/texts/50", - "parent": { - "$ref": "#/groups/24" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [], + "prov": [ + { + "start_time": 114.603, + "end_time": 115.283, + "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0", + "voice": "Speaker A" + } + ], "orig": "See you.", - "text": "See you.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "text": "See you." } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md index 859a6dde3..b58d350b3 100644 --- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md +++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md @@ -1,77 +1,33 @@ -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0 +OK, -00:00:04.963 --> 00:00:08.571 +I think now we should be recording -Speaker A: OK, I think now we should be recording - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1 - -00:00:08.571 --> 00:00:09.403 - -Speaker A: properly. - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0 - -00:00:10.683 --> 00:00:11.563 +properly. Good. -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0 - -00:00:13.363 --> 00:00:13.803 - -Speaker A: Yeah. - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0 - -00:00:49.603 --> 00:00:53.363 - -Speaker B: I was also thinking. - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0 - -00:00:54.963 --> 00:01:02.072 - -Speaker B: Would be maybe good to create items, - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1 - -00:01:02.072 --> 00:01:06.811 - -Speaker B: some metadata, some options that can be specific. - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0 - -00:01:10.243 --> 00:01:13.014 - -Speaker A: Yeah, I mean I think you went even more than - -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0 - -00:01:10.563 --> 00:01:12.643 - -Speaker B: But we preserved the atoms. +Yeah. -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1 +I was also thinking. -00:01:13.014 --> 00:01:15.907 +Would be maybe good to create items, -Speaker A: than me. I just opened the format. +some metadata, -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1 +some options that can be specific. -00:01:50.222 --> 00:01:51.643 +Yeah, -Speaker A: give it a try, yeah. +I mean I think you went even more than -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0 +But we preserved the atoms. -00:01:52.043 --> 00:01:55.043 +than me. -Speaker B: Okay, talk to you later. +I just opened the format. -62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0 +give it a try, yeah. -00:01:54.603 --> 00:01:55.283 +Okay, talk to you later. -Speaker A: See you. \ No newline at end of file +See you. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt new file mode 100644 index 000000000..93feba5e9 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt @@ -0,0 +1,14 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: text: Last night the chef surprised us with a culinary adventure. + item-2 at level 1: inline: group WebVTT cue span + item-3 at level 2: text: The waiter offered a + item-4 at level 2: text: steaming bowl of + item-5 at level 2: text: paella + item-6 at level 2: text: that instantly transported the diners to a sunny Mediterranean coast. + item-7 at level 1: inline: group WebVTT cue span + item-8 at level 2: text: The dessert’s + item-9 at level 2: text: unexpected + item-10 at level 2: text: + item-11 at level 2: text: arcobaleno + item-12 at level 2: text: of flavors + item-13 at level 2: text: left everyone in awe. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json new file mode 100644 index 000000000..17ab9f501 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json @@ -0,0 +1,344 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "webvtt_example_04", + "origin": { + "mimetype": "text/vtt", + "binary_hash": 5389775195091554844, + "filename": "webvtt_example_04.vtt" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "WebVTT cue span", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14580.0, + "end_time": 14760.0, + "identifier": "agcvs-08234" + } + ], + "orig": "Last night the chef surprised us with a culinary adventure.", + "text": "Last night the chef surprised us with a culinary adventure." + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The waiter offered a ", + "text": "The waiter offered a " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "steaming bowl of ", + "text": "steaming bowl of ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "languages": [ + "es-ES" + ] + } + ], + "orig": "paella", + "text": "paella", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " that instantly transported the diners to a sunny Mediterranean coast.", + "text": " that instantly transported the diners to a sunny Mediterranean coast." + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": "The dessert’s ", + "text": "The dessert’s " + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "classes": [ + "b.loud" + ] + } + ], + "orig": "unexpected", + "text": "unexpected", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " ", + "text": " ", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234", + "languages": [ + "it" + ] + } + ], + "orig": "arcobaleno", + "text": "arcobaleno", + "formatting": { + "bold": false, + "italic": true, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " of flavors", + "text": " of flavors", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "start_time": 14760.0, + "end_time": 14818.239, + "identifier": "agcvs-08234" + } + ], + "orig": " left everyone in awe.", + "text": " left everyone in awe." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md new file mode 100644 index 000000000..f2312a059 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md @@ -0,0 +1,5 @@ +Last night the chef surprised us with a culinary adventure. + +The waiter offered a *steaming bowl of * *paella* that instantly transported the diners to a sunny Mediterranean coast. + +The dessert’s ***unexpected*** * * *arcobaleno* * of flavors* left everyone in awe. \ No newline at end of file diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt index 1152a1e8f..6bd182101 100644 --- a/tests/data/webvtt/webvtt_example_02.vtt +++ b/tests/data/webvtt/webvtt_example_02.vtt @@ -12,4 +12,7 @@ NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/ Hee! laughter 00:06.000 --> 00:08.000 -That’s awesome! \ No newline at end of file +That’s awesome! + +00:08.000 --> 00:10.000 +Sur les playground, ici à Montpellier \ No newline at end of file diff --git a/tests/data/webvtt/webvtt_example_04.vtt b/tests/data/webvtt/webvtt_example_04.vtt new file mode 100644 index 000000000..fd7b788c0 --- /dev/null +++ b/tests/data/webvtt/webvtt_example_04.vtt @@ -0,0 +1,10 @@ +WEBVTT + +agcvs-08234 +04:03:00.000 --> 04:06:00.000 +Last night the chef surprised us with a culinary adventure. + +agcvs-08234 +04:06:00.000 --> 04:06:58.239 +The waiter offered a steaming bowl of paella that instantly transported the diners to a sunny Mediterranean coast. +The dessert’s unexpected arcobaleno of flavors left everyone in awe. \ No newline at end of file diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py index a910671bb..54e91219d 100644 --- a/tests/test_backend_vtt.py +++ b/tests/test_backend_vtt.py @@ -1,19 +1,7 @@ -# Assisted by watsonx Code Assistant - from pathlib import Path -import pytest from docling_core.types.doc import DoclingDocument -from pydantic import ValidationError -from docling.backend.webvtt_backend import ( - _WebVTTCueItalicSpan, - _WebVTTCueTextSpan, - _WebVTTCueTimings, - _WebVTTCueVoiceSpan, - _WebVTTFile, - _WebVTTTimestamp, -) from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter @@ -24,187 +12,6 @@ GENERATE = GEN_TEST_DATA -def test_vtt_cue_commponents(): - """Test WebVTT components.""" - valid_timestamps = [ - "00:01:02.345", - "12:34:56.789", - "02:34.567", - "00:00:00.000", - ] - valid_total_seconds = [ - 1 * 60 + 2.345, - 12 * 3600 + 34 * 60 + 56.789, - 2 * 60 + 34.567, - 0.0, - ] - for idx, ts in enumerate(valid_timestamps): - model = _WebVTTTimestamp(raw=ts) - assert model.seconds == valid_total_seconds[idx] - - """Test invalid WebVTT timestamps.""" - invalid_timestamps = [ - "00:60:02.345", # minutes > 59 - "00:01:60.345", # seconds > 59 - "00:01:02.1000", # milliseconds > 999 - "01:02:03", # missing milliseconds - "01:02", # missing milliseconds - ":01:02.345", # extra : for missing hours - "abc:01:02.345", # invalid format - ] - for ts in invalid_timestamps: - with pytest.raises(ValidationError): - _WebVTTTimestamp(raw=ts) - - """Test the timestamp __str__ method.""" - model = _WebVTTTimestamp(raw="00:01:02.345") - assert str(model) == "00:01:02.345" - - """Test valid cue timings.""" - start = _WebVTTTimestamp(raw="00:10.005") - end = _WebVTTTimestamp(raw="00:14.007") - cue_timings = _WebVTTCueTimings(start=start, end=end) - assert cue_timings.start == start - assert cue_timings.end == end - assert str(cue_timings) == "00:10.005 --> 00:14.007" - - """Test invalid cue timings with end timestamp before start.""" - start = _WebVTTTimestamp(raw="00:10.700") - end = _WebVTTTimestamp(raw="00:10.500") - with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(start=start, end=end) - assert "End timestamp must be greater than start timestamp" in str(excinfo.value) - - """Test invalid cue timings with missing end.""" - start = _WebVTTTimestamp(raw="00:10.500") - with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(start=start) - assert "Field required" in str(excinfo.value) - - """Test invalid cue timings with missing start.""" - end = _WebVTTTimestamp(raw="00:10.500") - with pytest.raises(ValidationError) as excinfo: - _WebVTTCueTimings(end=end) - assert "Field required" in str(excinfo.value) - - """Test with valid text.""" - valid_text = "This is a valid cue text span." - span = _WebVTTCueTextSpan(text=valid_text) - assert span.text == valid_text - assert str(span) == valid_text - - """Test with text containing newline characters.""" - invalid_text = "This cue text span\ncontains a newline." - with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) - - """Test with text containing ampersand.""" - invalid_text = "This cue text span contains &." - with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) - - """Test with text containing less-than sign.""" - invalid_text = "This cue text span contains <." - with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text=invalid_text) - - """Test with empty text.""" - with pytest.raises(ValidationError): - _WebVTTCueTextSpan(text="") - - """Test that annotation validation works correctly.""" - valid_annotation = "valid-annotation" - invalid_annotation = "invalid\nannotation" - with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=invalid_annotation) - assert _WebVTTCueVoiceSpan(annotation=valid_annotation) - - """Test that classes validation works correctly.""" - annotation = "speaker name" - valid_classes = ["class1", "class2"] - invalid_classes = ["class\nwith\nnewlines", ""] - with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes) - assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes) - - """Test that components validation works correctly.""" - annotation = "speaker name" - valid_components = [_WebVTTCueTextSpan(text="random text")] - invalid_components = [123, "not a component"] - with pytest.raises(ValidationError): - _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components) - assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components) - - """Test valid cue voice spans.""" - cue_span = _WebVTTCueVoiceSpan( - annotation="speaker", - classes=["loud", "clear"], - components=[_WebVTTCueTextSpan(text="random text")], - ) - - expected_str = "random text" - assert str(cue_span) == expected_str - - cue_span = _WebVTTCueVoiceSpan( - annotation="speaker", - components=[_WebVTTCueTextSpan(text="random text")], - ) - expected_str = "random text" - assert str(cue_span) == expected_str - - -def test_webvtt_file(): - """Test WebVTT files.""" - with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f: - content = f.read() - vtt = _WebVTTFile.parse(content) - assert len(vtt) == 13 - block = vtt.cue_blocks[11] - assert str(block.timings) == "00:32.500 --> 00:33.500" - assert len(block.payload) == 1 - cue_span = block.payload[0] - assert isinstance(cue_span, _WebVTTCueVoiceSpan) - assert cue_span.annotation == "Neil deGrasse Tyson" - assert not cue_span.classes - assert len(cue_span.components) == 1 - comp = cue_span.components[0] - assert isinstance(comp, _WebVTTCueItalicSpan) - assert len(comp.components) == 1 - comp2 = comp.components[0] - assert isinstance(comp2, _WebVTTCueTextSpan) - assert comp2.text == "Laughs" - - with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f: - content = f.read() - vtt = _WebVTTFile.parse(content) - assert len(vtt) == 4 - reverse = ( - "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. " - "https://www.w3.org/TR/webvtt1/\n\n" - ) - reverse += "\n\n".join([str(block) for block in vtt.cue_blocks]) - assert content == reverse - - with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f: - content = f.read() - vtt = _WebVTTFile.parse(content) - assert len(vtt) == 13 - for block in vtt: - assert block.identifier - block = vtt.cue_blocks[0] - assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0" - assert str(block.timings) == "00:00:04.963 --> 00:00:08.571" - assert len(block.payload) == 1 - assert isinstance(block.payload[0], _WebVTTCueVoiceSpan) - block = vtt.cue_blocks[2] - assert isinstance(cue_span, _WebVTTCueVoiceSpan) - assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0" - assert str(block.timings) == "00:00:10.683 --> 00:00:11.563" - assert len(block.payload) == 1 - assert isinstance(block.payload[0], _WebVTTCueTextSpan) - assert block.payload[0].text == "Good." - - def test_e2e_vtt_conversions(): directory = Path("./tests/data/webvtt/") vtt_paths = sorted(directory.rglob("*.vtt")) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 93f33e1fd..ad7eafa98 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -10,6 +10,8 @@ DoclingDocument, FormulaItem, PictureItem, + ProvenanceItem, + ProvenanceTrack, TableItem, TextItem, ) @@ -237,7 +239,30 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: true_prov = true_item.prov[0] pred_prov = pred_item.prov[0] - assert true_prov.page_no == pred_prov.page_no, "Page provenance mistmatch" + assert type(pred_prov) is type(true_prov), "Provenance type mismatch" + if isinstance(pred_prov, ProvenanceItem): + assert true_prov.page_no == pred_prov.page_no, ( + "Page provenance mistmatch" + ) + elif isinstance(pred_prov, ProvenanceTrack): + assert true_prov.start_time._seconds == pred_prov.start_time._seconds, ( + "ProvenanceTrack start time mismatch" + ) + assert true_prov.end_time._seconds == pred_prov.end_time._seconds, ( + "ProvenanceTrack end time mismatch" + ) + assert true_prov.languages == pred_prov.languages, ( + "ProvenanceTrack languages mismatch" + ) + assert true_prov.classes == pred_prov.classes, ( + "ProvenanceTrack classes mismatch" + ) + assert true_prov.identifier == pred_prov.identifier, ( + "ProvenanceTrack identifier mismatch" + ) + assert true_prov.voice == pred_prov.voice, ( + "ProvenanceTrack voice mismatch" + ) # TODO: add bbox check with tolerance diff --git a/uv.lock b/uv.lock index 6548b79f4..cd0663a3d 100644 --- a/uv.lock +++ b/uv.lock @@ -1546,7 +1546,7 @@ requires-dist = [ { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" }, { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" }, { name = "certifi", specifier = ">=2024.7.4" }, - { name = "docling-core", extras = ["chunking"], specifier = ">=2.50.1,<3.0.0" }, + { name = "docling-core", extras = ["chunking"], git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164" }, { name = "docling-ibm-models", specifier = ">=3.9.1,<4" }, { name = "docling-parse", specifier = ">=4.7.0,<5.0.0" }, { name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" }, @@ -1631,8 +1631,8 @@ examples = [ [[package]] name = "docling-core" -version = "2.51.1" -source = { registry = "https://pypi.org/simple" } +version = "2.55.0" +source = { git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164#c75516516358f25add2682674fc7dc6eef2c5164" } dependencies = [ { name = "jsonref" }, { name = "jsonschema" }, @@ -1645,10 +1645,6 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/33/76/665a61f6208923fb312549d9c7a2ef5275bdd7fd4d83cbe8ddd668f2fa35/docling_core-2.51.1.tar.gz", hash = "sha256:f5b0d8ead535c8451f67f9545af007f5bebfda72744a8e90af6e83fb6a483a99", size = 184664, upload-time = "2025-11-14T13:33:48.586Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/06/911a7374d59afff0dd8b50f84e1b7e5c4452886bbbe0e31e04510f44d43e/docling_core-2.51.1-py3-none-any.whl", hash = "sha256:76ca2b4c5c1d33475583671fe584b390e769152cac48d1fb24bf5a7457864a66", size = 186005, upload-time = "2025-11-14T13:33:46.695Z" }, -] [package.optional-dependencies] chunking = [ @@ -6119,6 +6115,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" }, { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" }, { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" }, + { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" }, + { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" }, + { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" }, { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" }, { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" }, { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" }, @@ -6140,6 +6139,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" }, { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" }, { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" }, + { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" }, + { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" }, { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" }, { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" }, { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },